diff --git a/include/os/linux/spl/sys/kmem.h b/include/os/linux/spl/sys/kmem.h index 111924303e16..594425f7b297 100644 --- a/include/os/linux/spl/sys/kmem.h +++ b/include/os/linux/spl/sys/kmem.h @@ -1,216 +1,219 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_KMEM_H #define _SPL_KMEM_H #include #include #include #include #include extern int kmem_debugging(void); extern char *kmem_vasprintf(const char *fmt, va_list ap) __attribute__((format(printf, 1, 0))); extern char *kmem_asprintf(const char *fmt, ...) __attribute__((format(printf, 1, 2))); extern char *kmem_strdup(const char *str); extern void kmem_strfree(char *str); #define kmem_scnprintf scnprintf +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + /* * Memory allocation interfaces */ #define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */ #define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ #define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ #define KM_ZERO 0x1000 /* zero the allocation */ #define KM_VMEM 0x2000 /* caller is vmem_* wrapper */ #define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE) static int spl_fstrans_check(void); void *spl_kvmalloc(size_t size, gfp_t flags); /* * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion * function is context aware which means that KM_SLEEP allocations can be * safely used in syncing contexts which have set PF_FSTRANS. */ static inline gfp_t kmem_flags_convert(int flags) { gfp_t lflags = __GFP_NOWARN | __GFP_COMP; if (flags & KM_NOSLEEP) { lflags |= GFP_ATOMIC | __GFP_NORETRY; } else { lflags |= GFP_KERNEL; if (spl_fstrans_check()) lflags &= ~(__GFP_IO|__GFP_FS); } if (flags & KM_PUSHPAGE) lflags |= __GFP_HIGH; if (flags & KM_ZERO) lflags |= __GFP_ZERO; return (lflags); } typedef struct { struct task_struct *fstrans_thread; unsigned int saved_flags; } fstrans_cookie_t; /* * Introduced in Linux 3.9, however this cannot be solely relied on before * Linux 3.18 as it doesn't turn off __GFP_FS as it should. */ #ifdef PF_MEMALLOC_NOIO #define __SPL_PF_MEMALLOC_NOIO (PF_MEMALLOC_NOIO) #else #define __SPL_PF_MEMALLOC_NOIO (0) #endif /* * PF_FSTRANS is removed from Linux 4.12 */ #ifdef PF_FSTRANS #define __SPL_PF_FSTRANS (PF_FSTRANS) #else #define __SPL_PF_FSTRANS (0) #endif #define SPL_FSTRANS (__SPL_PF_FSTRANS|__SPL_PF_MEMALLOC_NOIO) static inline fstrans_cookie_t spl_fstrans_mark(void) { fstrans_cookie_t cookie; BUILD_BUG_ON(SPL_FSTRANS == 0); cookie.fstrans_thread = current; cookie.saved_flags = current->flags & SPL_FSTRANS; current->flags |= SPL_FSTRANS; return (cookie); } static inline void spl_fstrans_unmark(fstrans_cookie_t cookie) { ASSERT3P(cookie.fstrans_thread, ==, current); ASSERT((current->flags & SPL_FSTRANS) == SPL_FSTRANS); current->flags &= ~SPL_FSTRANS; current->flags |= cookie.saved_flags; } static inline int spl_fstrans_check(void) { return (current->flags & SPL_FSTRANS); } /* * specifically used to check PF_FSTRANS flag, cannot be relied on for * checking spl_fstrans_mark(). */ static inline int __spl_pf_fstrans_check(void) { return (current->flags & __SPL_PF_FSTRANS); } /* * Kernel compatibility for GFP flags */ /* < 4.13 */ #ifndef __GFP_RETRY_MAYFAIL #define __GFP_RETRY_MAYFAIL __GFP_REPEAT #endif /* < 4.4 */ #ifndef __GFP_RECLAIM #define __GFP_RECLAIM __GFP_WAIT #endif #ifdef HAVE_ATOMIC64_T #define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) #define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) #define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) #define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) extern atomic64_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; #else /* HAVE_ATOMIC64_T */ #define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) #define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) #define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) #define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) extern atomic_t kmem_alloc_used; extern unsigned long long kmem_alloc_max; #endif /* HAVE_ATOMIC64_T */ extern unsigned int spl_kmem_alloc_warn; extern unsigned int spl_kmem_alloc_max; #define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__) #define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__) #define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) #define kmem_cache_reap_active spl_kmem_cache_reap_active extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line) __attribute__((alloc_size(1))); extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line) __attribute__((alloc_size(1))); extern void spl_kmem_free(const void *ptr, size_t sz); /* * 5.8 API change, pgprot_t argument removed. */ #ifdef HAVE_VMALLOC_PAGE_KERNEL #define spl_vmalloc(size, flags) __vmalloc(size, flags, PAGE_KERNEL) #else #define spl_vmalloc(size, flags) __vmalloc(size, flags) #endif /* * The following functions are only available for internal use. */ extern void *spl_kmem_alloc_impl(size_t size, int flags, int node); extern void *spl_kmem_alloc_debug(size_t size, int flags, int node); extern void *spl_kmem_alloc_track(size_t size, int flags, const char *func, int line, int node); extern void spl_kmem_free_impl(const void *buf, size_t size); extern void spl_kmem_free_debug(const void *buf, size_t size); extern void spl_kmem_free_track(const void *buf, size_t size); extern int spl_kmem_init(void); extern void spl_kmem_fini(void); extern int spl_kmem_cache_reap_active(void); #endif /* _SPL_KMEM_H */ diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index bd0ad5052d3d..cc9cafa84f99 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -1,215 +1,212 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_KMEM_CACHE_H #define _SPL_KMEM_CACHE_H #include /* * Slab allocation interfaces. The SPL slab differs from the standard * Linux SLAB or SLUB primarily in that each cache may be backed by slabs * allocated from the physical or virtual memory address space. The virtual * slabs allow for good behavior when allocation large objects of identical * size. This slab implementation also supports both constructors and * destructors which the Linux slab does not. */ typedef enum kmc_bit { KMC_BIT_NODEBUG = 1, /* Default behavior */ KMC_BIT_KVMEM = 7, /* Use kvmalloc linux allocator */ KMC_BIT_SLAB = 8, /* Use Linux slab cache */ KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ KMC_BIT_GROWING = 15, /* Growing in progress */ KMC_BIT_REAPING = 16, /* Reaping in progress */ KMC_BIT_DESTROY = 17, /* Destroy in progress */ KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ KMC_BIT_ALLOC = 19, /* Proc handler helper bit */ KMC_BIT_MAX = 20, /* Proc handler helper bit */ } kmc_bit_t; /* kmem move callback return values */ typedef enum kmem_cbrc { KMEM_CBRC_YES = 0, /* Object moved */ KMEM_CBRC_NO = 1, /* Object not moved */ KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ } kmem_cbrc_t; #define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) #define KMC_KVMEM (1 << KMC_BIT_KVMEM) #define KMC_SLAB (1 << KMC_BIT_SLAB) #define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) #define KMC_GROWING (1 << KMC_BIT_GROWING) #define KMC_REAPING (1 << KMC_BIT_REAPING) #define KMC_DESTROY (1 << KMC_BIT_DESTROY) #define KMC_TOTAL (1 << KMC_BIT_TOTAL) #define KMC_ALLOC (1 << KMC_BIT_ALLOC) #define KMC_MAX (1 << KMC_BIT_MAX) #define KMC_REAP_CHUNK INT_MAX #define KMC_DEFAULT_SEEKS 1 #define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ extern struct list_head spl_kmem_cache_list; extern struct rw_semaphore spl_kmem_cache_sem; #define SKM_MAGIC 0x2e2e2e2e #define SKO_MAGIC 0x20202020 #define SKS_MAGIC 0x22222222 #define SKC_MAGIC 0x2c2c2c2c #define SPL_KMEM_CACHE_OBJ_PER_SLAB 8 /* Target objects per slab */ #define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ #ifdef _LP64 #define SPL_KMEM_CACHE_MAX_SIZE 32 /* Max slab size in MB */ #else #define SPL_KMEM_CACHE_MAX_SIZE 4 /* Max slab size in MB */ #endif #define SPL_MAX_ORDER (MAX_ORDER - 3) #define SPL_MAX_ORDER_NR_PAGES (1 << (SPL_MAX_ORDER - 1)) #ifdef CONFIG_SLUB #define SPL_MAX_KMEM_CACHE_ORDER PAGE_ALLOC_COSTLY_ORDER #define SPL_MAX_KMEM_ORDER_NR_PAGES (1 << (SPL_MAX_KMEM_CACHE_ORDER - 1)) #else #define SPL_MAX_KMEM_ORDER_NR_PAGES (KMALLOC_MAX_SIZE >> PAGE_SHIFT) #endif -#define POINTER_IS_VALID(p) 0 /* Unimplemented */ -#define POINTER_INVALIDATE(pp) /* Unimplemented */ - typedef int (*spl_kmem_ctor_t)(void *, void *, int); typedef void (*spl_kmem_dtor_t)(void *, void *); typedef struct spl_kmem_magazine { uint32_t skm_magic; /* Sanity magic */ uint32_t skm_avail; /* Available objects */ uint32_t skm_size; /* Magazine size */ uint32_t skm_refill; /* Batch refill size */ struct spl_kmem_cache *skm_cache; /* Owned by cache */ unsigned int skm_cpu; /* Owned by cpu */ void *skm_objs[0]; /* Object pointers */ } spl_kmem_magazine_t; typedef struct spl_kmem_obj { uint32_t sko_magic; /* Sanity magic */ void *sko_addr; /* Buffer address */ struct spl_kmem_slab *sko_slab; /* Owned by slab */ struct list_head sko_list; /* Free object list linkage */ } spl_kmem_obj_t; typedef struct spl_kmem_slab { uint32_t sks_magic; /* Sanity magic */ uint32_t sks_objs; /* Objects per slab */ struct spl_kmem_cache *sks_cache; /* Owned by cache */ struct list_head sks_list; /* Slab list linkage */ struct list_head sks_free_list; /* Free object list */ unsigned long sks_age; /* Last modify jiffie */ uint32_t sks_ref; /* Ref count used objects */ } spl_kmem_slab_t; typedef struct spl_kmem_alloc { struct spl_kmem_cache *ska_cache; /* Owned by cache */ int ska_flags; /* Allocation flags */ taskq_ent_t ska_tqe; /* Task queue entry */ } spl_kmem_alloc_t; typedef struct spl_kmem_emergency { struct rb_node ske_node; /* Emergency tree linkage */ unsigned long ske_obj; /* Buffer address */ } spl_kmem_emergency_t; typedef struct spl_kmem_cache { uint32_t skc_magic; /* Sanity magic */ uint32_t skc_name_size; /* Name length */ char *skc_name; /* Name string */ spl_kmem_magazine_t **skc_mag; /* Per-CPU warm cache */ uint32_t skc_mag_size; /* Magazine size */ uint32_t skc_mag_refill; /* Magazine refill count */ spl_kmem_ctor_t skc_ctor; /* Constructor */ spl_kmem_dtor_t skc_dtor; /* Destructor */ void *skc_private; /* Private data */ void *skc_vmp; /* Unused */ struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ unsigned long skc_flags; /* Flags */ uint32_t skc_obj_size; /* Object size */ uint32_t skc_obj_align; /* Object alignment */ uint32_t skc_slab_objs; /* Objects per slab */ uint32_t skc_slab_size; /* Slab size */ atomic_t skc_ref; /* Ref count callers */ taskqid_t skc_taskqid; /* Slab reclaim task */ struct list_head skc_list; /* List of caches linkage */ struct list_head skc_complete_list; /* Completely alloc'ed */ struct list_head skc_partial_list; /* Partially alloc'ed */ struct rb_root skc_emergency_tree; /* Min sized objects */ spinlock_t skc_lock; /* Cache lock */ spl_wait_queue_head_t skc_waitq; /* Allocation waiters */ uint64_t skc_slab_fail; /* Slab alloc failures */ uint64_t skc_slab_create; /* Slab creates */ uint64_t skc_slab_destroy; /* Slab destroys */ uint64_t skc_slab_total; /* Slab total current */ uint64_t skc_slab_alloc; /* Slab alloc current */ uint64_t skc_slab_max; /* Slab max historic */ uint64_t skc_obj_total; /* Obj total current */ uint64_t skc_obj_alloc; /* Obj alloc current */ struct percpu_counter skc_linux_alloc; /* Linux-backed Obj alloc */ uint64_t skc_obj_max; /* Obj max historic */ uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */ uint64_t skc_obj_emergency; /* Obj emergency current */ uint64_t skc_obj_emergency_max; /* Obj emergency max */ } spl_kmem_cache_t; #define kmem_cache_t spl_kmem_cache_t extern spl_kmem_cache_t *spl_kmem_cache_create(const char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim, void *priv, void *vmp, int flags); extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, kmem_cbrc_t (*)(void *, void *, size_t, void *)); extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags); extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc); extern void spl_kmem_reap(void); extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); #define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \ spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) #define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) #define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) #define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) #define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) #define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) #define kmem_reap() spl_kmem_reap() /* * The following functions are only available for internal use. */ extern int spl_kmem_cache_init(void); extern void spl_kmem_cache_fini(void); #endif /* _SPL_KMEM_CACHE_H */ diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c index a9318f7a05cf..9f735dbb558c 100644 --- a/module/os/freebsd/zfs/zfs_acl.c +++ b/module/os/freebsd/zfs/zfs_acl.c @@ -1,2678 +1,2677 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE #define DENY ACE_ACCESS_DENIED_ACE_TYPE #define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE #define MIN_ACE_TYPE ALLOW #define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) #define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) #define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) #define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) #define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ ACE_DELETE|ACE_DELETE_CHILD) #define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) #define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) #define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) #define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) #define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ ZFS_ACL_PROTECTED) #define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ ZFS_ACL_OBJ_ACE) #define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) static uint16_t zfs_ace_v0_get_type(void *acep) { return (((zfs_oldace_t *)acep)->z_type); } static uint16_t zfs_ace_v0_get_flags(void *acep) { return (((zfs_oldace_t *)acep)->z_flags); } static uint32_t zfs_ace_v0_get_mask(void *acep) { return (((zfs_oldace_t *)acep)->z_access_mask); } static uint64_t zfs_ace_v0_get_who(void *acep) { return (((zfs_oldace_t *)acep)->z_fuid); } static void zfs_ace_v0_set_type(void *acep, uint16_t type) { ((zfs_oldace_t *)acep)->z_type = type; } static void zfs_ace_v0_set_flags(void *acep, uint16_t flags) { ((zfs_oldace_t *)acep)->z_flags = flags; } static void zfs_ace_v0_set_mask(void *acep, uint32_t mask) { ((zfs_oldace_t *)acep)->z_access_mask = mask; } static void zfs_ace_v0_set_who(void *acep, uint64_t who) { ((zfs_oldace_t *)acep)->z_fuid = who; } static size_t zfs_ace_v0_size(void *acep) { (void) acep; return (sizeof (zfs_oldace_t)); } static size_t zfs_ace_v0_abstract_size(void) { return (sizeof (zfs_oldace_t)); } static int zfs_ace_v0_mask_off(void) { return (offsetof(zfs_oldace_t, z_access_mask)); } static int zfs_ace_v0_data(void *acep, void **datap) { (void) acep; *datap = NULL; return (0); } static const acl_ops_t zfs_acl_v0_ops = { zfs_ace_v0_get_mask, zfs_ace_v0_set_mask, zfs_ace_v0_get_flags, zfs_ace_v0_set_flags, zfs_ace_v0_get_type, zfs_ace_v0_set_type, zfs_ace_v0_get_who, zfs_ace_v0_set_who, zfs_ace_v0_size, zfs_ace_v0_abstract_size, zfs_ace_v0_mask_off, zfs_ace_v0_data }; static uint16_t zfs_ace_fuid_get_type(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_type); } static uint16_t zfs_ace_fuid_get_flags(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_flags); } static uint32_t zfs_ace_fuid_get_mask(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_access_mask); } static uint64_t zfs_ace_fuid_get_who(void *args) { uint16_t entry_type; zfs_ace_t *acep = args; entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return (-1); return (((zfs_ace_t *)acep)->z_fuid); } static void zfs_ace_fuid_set_type(void *acep, uint16_t type) { ((zfs_ace_hdr_t *)acep)->z_type = type; } static void zfs_ace_fuid_set_flags(void *acep, uint16_t flags) { ((zfs_ace_hdr_t *)acep)->z_flags = flags; } static void zfs_ace_fuid_set_mask(void *acep, uint32_t mask) { ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; } static void zfs_ace_fuid_set_who(void *arg, uint64_t who) { zfs_ace_t *acep = arg; uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return; acep->z_fuid = who; } static size_t zfs_ace_fuid_size(void *acep) { zfs_ace_hdr_t *zacep = acep; uint16_t entry_type; switch (zacep->z_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: return (sizeof (zfs_object_ace_t)); case ALLOW: case DENY: entry_type = (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return (sizeof (zfs_ace_hdr_t)); zfs_fallthrough; default: return (sizeof (zfs_ace_t)); } } static size_t zfs_ace_fuid_abstract_size(void) { return (sizeof (zfs_ace_hdr_t)); } static int zfs_ace_fuid_mask_off(void) { return (offsetof(zfs_ace_hdr_t, z_access_mask)); } static int zfs_ace_fuid_data(void *acep, void **datap) { zfs_ace_t *zacep = acep; zfs_object_ace_t *zobjp; switch (zacep->z_hdr.z_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: zobjp = acep; *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); default: *datap = NULL; return (0); } } static const acl_ops_t zfs_acl_fuid_ops = { zfs_ace_fuid_get_mask, zfs_ace_fuid_set_mask, zfs_ace_fuid_get_flags, zfs_ace_fuid_set_flags, zfs_ace_fuid_get_type, zfs_ace_fuid_set_type, zfs_ace_fuid_get_who, zfs_ace_fuid_set_who, zfs_ace_fuid_size, zfs_ace_fuid_abstract_size, zfs_ace_fuid_mask_off, zfs_ace_fuid_data }; /* * The following three functions are provided for compatibility with * older ZPL version in order to determine if the file use to have * an external ACL and what version of ACL previously existed on the * file. Would really be nice to not need this, sigh. */ uint64_t zfs_external_acl(znode_t *zp) { zfs_acl_phys_t acl_phys; int error; if (zp->z_is_sa) return (0); /* * Need to deal with a potential * race where zfs_sa_upgrade could cause * z_isa_sa to change. * * If the lookup fails then the state of z_is_sa should have * changed. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), &acl_phys, sizeof (acl_phys))) == 0) return (acl_phys.z_acl_extern_obj); else { /* * after upgrade the SA_ZPL_ZNODE_ACL should have been * removed */ VERIFY(zp->z_is_sa); VERIFY3S(error, ==, ENOENT); return (0); } } /* * Determine size of ACL in bytes * * This is more complicated than it should be since we have to deal * with old external ACLs. */ static int zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, zfs_acl_phys_t *aclphys) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t acl_count; int size; int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if (zp->z_is_sa) { if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), &size)) != 0) return (error); *aclsize = size; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), &acl_count, sizeof (acl_count))) != 0) return (error); *aclcount = acl_count; } else { if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), aclphys, sizeof (*aclphys))) != 0) return (error); if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); *aclcount = aclphys->z_acl_size; } else { *aclsize = aclphys->z_acl_size; *aclcount = aclphys->z_acl_count; } } return (0); } int zfs_znode_acl_version(znode_t *zp) { zfs_acl_phys_t acl_phys; if (zp->z_is_sa) return (ZFS_ACL_VERSION_FUID); else { int error; /* * Need to deal with a potential * race where zfs_sa_upgrade could cause * z_isa_sa to change. * * If the lookup fails then the state of z_is_sa should have * changed. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), &acl_phys, sizeof (acl_phys))) == 0) return (acl_phys.z_acl_version); else { /* * After upgrade SA_ZPL_ZNODE_ACL should have * been removed. */ VERIFY(zp->z_is_sa); VERIFY3S(error, ==, ENOENT); return (ZFS_ACL_VERSION_FUID); } } } static int zfs_acl_version(int version) { if (version < ZPL_VERSION_FUID) return (ZFS_ACL_VERSION_INITIAL); else return (ZFS_ACL_VERSION_FUID); } static int zfs_acl_version_zp(znode_t *zp) { return (zfs_acl_version(zp->z_zfsvfs->z_version)); } zfs_acl_t * zfs_acl_alloc(int vers) { zfs_acl_t *aclp; aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), offsetof(zfs_acl_node_t, z_next)); aclp->z_version = vers; if (vers == ZFS_ACL_VERSION_FUID) aclp->z_ops = &zfs_acl_fuid_ops; else aclp->z_ops = &zfs_acl_v0_ops; return (aclp); } zfs_acl_node_t * zfs_acl_node_alloc(size_t bytes) { zfs_acl_node_t *aclnode; aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); if (bytes) { aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); aclnode->z_allocdata = aclnode->z_acldata; aclnode->z_allocsize = bytes; aclnode->z_size = bytes; } return (aclnode); } static void zfs_acl_node_free(zfs_acl_node_t *aclnode) { if (aclnode->z_allocsize) kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); kmem_free(aclnode, sizeof (zfs_acl_node_t)); } static void zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; while ((aclnode = list_head(&aclp->z_acl))) { list_remove(&aclp->z_acl, aclnode); zfs_acl_node_free(aclnode); } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } void zfs_acl_free(zfs_acl_t *aclp) { zfs_acl_release_nodes(aclp); list_destroy(&aclp->z_acl); kmem_free(aclp, sizeof (zfs_acl_t)); } static boolean_t zfs_acl_valid_ace_type(uint_t type, uint_t flags) { uint16_t entry_type; switch (type) { case ALLOW: case DENY: case ACE_SYSTEM_AUDIT_ACE_TYPE: case ACE_SYSTEM_ALARM_ACE_TYPE: entry_type = flags & ACE_TYPE_FLAGS; return (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE || entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP); default: if (type <= MAX_ACE_TYPE) return (B_TRUE); } return (B_FALSE); } static boolean_t zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) { /* * first check type of entry */ if (!zfs_acl_valid_ace_type(type, iflags)) return (B_FALSE); switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: if (aclp->z_version < ZFS_ACL_VERSION_FUID) return (B_FALSE); aclp->z_hints |= ZFS_ACL_OBJ_ACE; } /* * next check inheritance level flags */ if (obj_type == VDIR && (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) aclp->z_hints |= ZFS_INHERIT_ACE; if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { if ((iflags & (ACE_FILE_INHERIT_ACE| ACE_DIRECTORY_INHERIT_ACE)) == 0) { return (B_FALSE); } } return (B_TRUE); } static void * zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, uint32_t *access_mask, uint16_t *iflags, uint16_t *type) { zfs_acl_node_t *aclnode; ASSERT3P(aclp, !=, NULL); if (start == NULL) { aclnode = list_head(&aclp->z_acl); if (aclnode == NULL) return (NULL); aclp->z_next_ace = aclnode->z_acldata; aclp->z_curr_node = aclnode; aclnode->z_ace_idx = 0; } aclnode = aclp->z_curr_node; if (aclnode == NULL) return (NULL); if (aclnode->z_ace_idx >= aclnode->z_ace_count) { aclnode = list_next(&aclp->z_acl, aclnode); if (aclnode == NULL) return (NULL); else { aclp->z_curr_node = aclnode; aclnode->z_ace_idx = 0; aclp->z_next_ace = aclnode->z_acldata; } } if (aclnode->z_ace_idx < aclnode->z_ace_count) { void *acep = aclp->z_next_ace; size_t ace_size; /* * Make sure we don't overstep our bounds */ ace_size = aclp->z_ops->ace_size(acep); if (((caddr_t)acep + ace_size) > ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { return (NULL); } *iflags = aclp->z_ops->ace_flags_get(acep); *type = aclp->z_ops->ace_type_get(acep); *access_mask = aclp->z_ops->ace_mask_get(acep); *who = aclp->z_ops->ace_who_get(acep); aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; aclnode->z_ace_idx++; return ((void *)acep); } return (NULL); } static uintptr_t zfs_ace_walk(void *datap, uintptr_t cookie, int aclcnt, uint16_t *flags, uint16_t *type, uint32_t *mask) { (void) aclcnt; zfs_acl_t *aclp = datap; zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; uint64_t who; acep = zfs_acl_next_ace(aclp, acep, &who, mask, flags, type); return ((uintptr_t)acep); } /* * Copy ACE to internal ZFS format. * While processing the ACL each ACE will be validated for correctness. * ACE FUIDs will be created later. */ static int zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, zfs_fuid_info_t **fuidp, cred_t *cr) { int i; uint16_t entry_type; zfs_ace_t *aceptr = z_acl; ace_t *acep = datap; zfs_object_ace_t *zobjacep; ace_object_t *aceobjp; for (i = 0; i != aclcnt; i++) { aceptr->z_hdr.z_access_mask = acep->a_access_mask; aceptr->z_hdr.z_flags = acep->a_flags; aceptr->z_hdr.z_type = acep->a_type; entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE) { aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, cr, (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); } /* * Make sure ACE is valid */ if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, aceptr->z_hdr.z_flags) != B_TRUE) return (SET_ERROR(EINVAL)); switch (acep->a_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: zobjacep = (zfs_object_ace_t *)aceptr; aceobjp = (ace_object_t *)acep; memcpy(zobjacep->z_object_type, aceobjp->a_obj_type, sizeof (aceobjp->a_obj_type)); memcpy(zobjacep->z_inherit_type, aceobjp->a_inherit_obj_type, sizeof (aceobjp->a_inherit_obj_type)); acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); break; default: acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); } aceptr = (zfs_ace_t *)((caddr_t)aceptr + aclp->z_ops->ace_size(aceptr)); } *size = (caddr_t)aceptr - (caddr_t)z_acl; return (0); } /* * Copy ZFS ACEs to fixed size ace_t layout */ static void zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, void *datap, int filter) { uint64_t who; uint32_t access_mask; uint16_t iflags, type; zfs_ace_hdr_t *zacep = NULL; ace_t *acep = datap; ace_object_t *objacep; zfs_object_ace_t *zobjacep; size_t ace_size; uint16_t entry_type; while ((zacep = zfs_acl_next_ace(aclp, zacep, &who, &access_mask, &iflags, &type))) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: if (filter) { continue; } zobjacep = (zfs_object_ace_t *)zacep; objacep = (ace_object_t *)acep; memcpy(objacep->a_obj_type, zobjacep->z_object_type, sizeof (zobjacep->z_object_type)); memcpy(objacep->a_inherit_obj_type, zobjacep->z_inherit_type, sizeof (zobjacep->z_inherit_type)); ace_size = sizeof (ace_object_t); break; default: ace_size = sizeof (ace_t); break; } entry_type = (iflags & ACE_TYPE_FLAGS); if ((entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE)) { acep->a_who = zfs_fuid_map_id(zfsvfs, who, cr, (entry_type & ACE_IDENTIFIER_GROUP) ? ZFS_ACE_GROUP : ZFS_ACE_USER); } else { acep->a_who = (uid_t)(int64_t)who; } acep->a_access_mask = access_mask; acep->a_flags = iflags; acep->a_type = type; acep = (ace_t *)((caddr_t)acep + ace_size); } } static int zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, zfs_oldace_t *z_acl, int aclcnt, size_t *size) { int i; zfs_oldace_t *aceptr = z_acl; for (i = 0; i != aclcnt; i++, aceptr++) { aceptr->z_access_mask = acep[i].a_access_mask; aceptr->z_type = acep[i].a_type; aceptr->z_flags = acep[i].a_flags; aceptr->z_fuid = acep[i].a_who; /* * Make sure ACE is valid */ if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, aceptr->z_flags) != B_TRUE) return (SET_ERROR(EINVAL)); } *size = (caddr_t)aceptr - (caddr_t)z_acl; return (0); } /* * convert old ACL format to new */ void zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) { zfs_oldace_t *oldaclp; int i; uint16_t type, iflags; uint32_t access_mask; uint64_t who; void *cookie = NULL; zfs_acl_node_t *newaclnode; ASSERT3U(aclp->z_version, ==, ZFS_ACL_VERSION_INITIAL); /* * First create the ACE in a contiguous piece of memory * for zfs_copy_ace_2_fuid(). * * We only convert an ACL once, so this won't happen * everytime. */ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, KM_SLEEP); i = 0; while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, &access_mask, &iflags, &type))) { oldaclp[i].z_flags = iflags; oldaclp[i].z_type = type; oldaclp[i].z_fuid = who; oldaclp[i++].z_access_mask = access_mask; } newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * sizeof (zfs_object_ace_t)); aclp->z_ops = &zfs_acl_fuid_ops; VERIFY0(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count, &newaclnode->z_size, NULL, cr)); newaclnode->z_ace_count = aclp->z_acl_count; aclp->z_version = ZFS_ACL_VERSION; kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); /* * Release all previous ACL nodes */ zfs_acl_release_nodes(aclp); list_insert_head(&aclp->z_acl, newaclnode); aclp->z_acl_bytes = newaclnode->z_size; aclp->z_acl_count = newaclnode->z_ace_count; } /* * Convert unix access mask to v4 access mask */ static uint32_t zfs_unix_to_v4(uint32_t access_mask) { uint32_t new_mask = 0; if (access_mask & S_IXOTH) new_mask |= ACE_EXECUTE; if (access_mask & S_IWOTH) new_mask |= ACE_WRITE_DATA; if (access_mask & S_IROTH) new_mask |= ACE_READ_DATA; return (new_mask); } static void zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, uint16_t access_type, uint64_t fuid, uint16_t entry_type) { uint16_t type = entry_type & ACE_TYPE_FLAGS; aclp->z_ops->ace_mask_set(acep, access_mask); aclp->z_ops->ace_type_set(acep, access_type); aclp->z_ops->ace_flags_set(acep, entry_type); if ((type != ACE_OWNER && type != OWNING_GROUP && type != ACE_EVERYONE)) aclp->z_ops->ace_who_set(acep, fuid); } /* * Determine mode of file based on ACL. */ uint64_t zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, uint64_t *pflags, uint64_t fuid, uint64_t fgid) { int entry_type; mode_t mode; mode_t seen = 0; zfs_ace_hdr_t *acep = NULL; uint64_t who; uint16_t iflags, type; uint32_t access_mask; boolean_t an_exec_denied = B_FALSE; mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type))) { if (!zfs_acl_valid_ace_type(type, iflags)) continue; entry_type = (iflags & ACE_TYPE_FLAGS); /* * Skip over any inherit_only ACEs */ if (iflags & ACE_INHERIT_ONLY_ACE) continue; if (entry_type == ACE_OWNER || (entry_type == 0 && who == fuid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRUSR))) { seen |= S_IRUSR; if (type == ALLOW) { mode |= S_IRUSR; } } if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWUSR))) { seen |= S_IWUSR; if (type == ALLOW) { mode |= S_IWUSR; } } if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXUSR))) { seen |= S_IXUSR; if (type == ALLOW) { mode |= S_IXUSR; } } } else if (entry_type == OWNING_GROUP || (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRGRP))) { seen |= S_IRGRP; if (type == ALLOW) { mode |= S_IRGRP; } } if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWGRP))) { seen |= S_IWGRP; if (type == ALLOW) { mode |= S_IWGRP; } } if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXGRP))) { seen |= S_IXGRP; if (type == ALLOW) { mode |= S_IXGRP; } } } else if (entry_type == ACE_EVERYONE) { if ((access_mask & ACE_READ_DATA)) { if (!(seen & S_IRUSR)) { seen |= S_IRUSR; if (type == ALLOW) { mode |= S_IRUSR; } } if (!(seen & S_IRGRP)) { seen |= S_IRGRP; if (type == ALLOW) { mode |= S_IRGRP; } } if (!(seen & S_IROTH)) { seen |= S_IROTH; if (type == ALLOW) { mode |= S_IROTH; } } } if ((access_mask & ACE_WRITE_DATA)) { if (!(seen & S_IWUSR)) { seen |= S_IWUSR; if (type == ALLOW) { mode |= S_IWUSR; } } if (!(seen & S_IWGRP)) { seen |= S_IWGRP; if (type == ALLOW) { mode |= S_IWGRP; } } if (!(seen & S_IWOTH)) { seen |= S_IWOTH; if (type == ALLOW) { mode |= S_IWOTH; } } } if ((access_mask & ACE_EXECUTE)) { if (!(seen & S_IXUSR)) { seen |= S_IXUSR; if (type == ALLOW) { mode |= S_IXUSR; } } if (!(seen & S_IXGRP)) { seen |= S_IXGRP; if (type == ALLOW) { mode |= S_IXGRP; } } if (!(seen & S_IXOTH)) { seen |= S_IXOTH; if (type == ALLOW) { mode |= S_IXOTH; } } } } else { /* * Only care if this IDENTIFIER_GROUP or * USER ACE denies execute access to someone, * mode is not affected */ if ((access_mask & ACE_EXECUTE) && type == DENY) an_exec_denied = B_TRUE; } } /* * Failure to allow is effectively a deny, so execute permission * is denied if it was never mentioned or if we explicitly * weren't allowed it. */ if (!an_exec_denied && ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) an_exec_denied = B_TRUE; if (an_exec_denied) *pflags &= ~ZFS_NO_EXECS_DENIED; else *pflags |= ZFS_NO_EXECS_DENIED; return (mode); } /* * Read an external acl object. If the intent is to modify, always * create a new acl and leave any cached acl in place. */ int zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, boolean_t will_modify) { zfs_acl_t *aclp; int aclsize; int acl_count; zfs_acl_node_t *aclnode; zfs_acl_phys_t znode_acl; int version; int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if (zp->z_zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(ZTOV(zp), __func__); if (zp->z_acl_cached && !will_modify) { *aclpp = zp->z_acl_cached; return (0); } version = zfs_znode_acl_version(zp); if ((error = zfs_acl_znode_info(zp, &aclsize, &acl_count, &znode_acl)) != 0) { goto done; } aclp = zfs_acl_alloc(version); aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; aclnode = zfs_acl_node_alloc(aclsize); aclnode->z_ace_count = aclp->z_acl_count; aclnode->z_size = aclsize; if (!zp->z_is_sa) { if (znode_acl.z_acl_extern_obj) { error = dmu_read(zp->z_zfsvfs->z_os, znode_acl.z_acl_extern_obj, 0, aclnode->z_size, aclnode->z_acldata, DMU_READ_PREFETCH); } else { memcpy(aclnode->z_acldata, znode_acl.z_ace_data, aclnode->z_size); } } else { error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), aclnode->z_acldata, aclnode->z_size); } if (error != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); goto done; } list_insert_head(&aclp->z_acl, aclnode); *aclpp = aclp; if (!will_modify) zp->z_acl_cached = aclp; done: return (error); } void zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, boolean_t start, void *userdata) { (void) buflen; zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; if (start) { cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); } else { cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, cb->cb_acl_node); } ASSERT3P(cb->cb_acl_node, !=, NULL); *dataptr = cb->cb_acl_node->z_acldata; *length = cb->cb_acl_node->z_size; } int zfs_acl_chown_setattr(znode_t *zp) { int error; zfs_acl_t *aclp; if (zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); ASSERT_VOP_IN_SEQC(ZTOV(zp)); } ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0) zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, &zp->z_pflags, zp->z_uid, zp->z_gid); return (error); } /* * common code for setting ACLs. * * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's * already checked the acl and knows whether to inherit. */ int zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { int error; zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_object_type_t otype; zfs_acl_locator_cb_t locate = { 0 }; uint64_t mode; sa_bulk_attr_t bulk[5]; uint64_t ctime[2]; int count = 0; zfs_acl_phys_t acl_phys; if (zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_IN_SEQC(ZTOV(zp)); } mode = zp->z_mode; mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, zp->z_uid, zp->z_gid); zp->z_mode = mode; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } /* * Upgrade needed? */ if (!zfsvfs->z_use_fuids) { otype = DMU_OT_OLDACL; } else { if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && (zfsvfs->z_version >= ZPL_VERSION_FUID)) zfs_acl_xform(zp, aclp, cr); ASSERT3U(aclp->z_version, >=, ZFS_ACL_VERSION_FUID); otype = DMU_OT_ACL; } /* * Arrgh, we have to handle old on disk format * as well as newer (preferred) SA format. */ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ locate.cb_aclp = aclp; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, aclp->z_acl_bytes); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &aclp->z_acl_count, sizeof (uint64_t)); } else { /* Painful legacy way */ zfs_acl_node_t *aclnode; uint64_t off = 0; uint64_t aoid; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), &acl_phys, sizeof (acl_phys))) != 0) return (error); aoid = acl_phys.z_acl_extern_obj; if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { /* * If ACL was previously external and we are now * converting to new ACL format then release old * ACL object and create a new one. */ if (aoid && aclp->z_version != acl_phys.z_acl_version) { error = dmu_object_free(zfsvfs->z_os, aoid, tx); if (error) return (error); aoid = 0; } if (aoid == 0) { aoid = dmu_object_alloc(zfsvfs->z_os, otype, aclp->z_acl_bytes, otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, otype == DMU_OT_ACL ? DN_OLD_MAX_BONUSLEN : 0, tx); } else { (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, aclp->z_acl_bytes, 0, tx); } acl_phys.z_acl_extern_obj = aoid; for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { if (aclnode->z_ace_count == 0) continue; dmu_write(zfsvfs->z_os, aoid, off, aclnode->z_size, aclnode->z_acldata, tx); off += aclnode->z_size; } } else { void *start = acl_phys.z_ace_data; /* * Migrating back embedded? */ if (acl_phys.z_acl_extern_obj) { error = dmu_object_free(zfsvfs->z_os, acl_phys.z_acl_extern_obj, tx); if (error) return (error); acl_phys.z_acl_extern_obj = 0; } for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { if (aclnode->z_ace_count == 0) continue; memcpy(start, aclnode->z_acldata, aclnode->z_size); start = (caddr_t)start + aclnode->z_size; } } /* * If Old version then swap count/bytes to match old * layout of znode_acl_phys_t. */ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { acl_phys.z_acl_size = aclp->z_acl_count; acl_phys.z_acl_count = aclp->z_acl_bytes; } else { acl_phys.z_acl_size = aclp->z_acl_bytes; acl_phys.z_acl_count = aclp->z_acl_count; } acl_phys.z_acl_version = aclp->z_version; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (acl_phys)); } /* * Replace ACL wide bits, but first clear them. */ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; zp->z_pflags |= aclp->z_hints; if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) zp->z_pflags |= ZFS_ACL_TRIVIAL; zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); } static void zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, zfs_acl_t *aclp) { void *acep = NULL; uint64_t who; int new_count, new_bytes; int ace_size; int entry_type; uint16_t iflags, type; uint32_t access_mask; zfs_acl_node_t *newnode; size_t abstract_size = aclp->z_ops->ace_abstract_size(); void *zacep; boolean_t isdir; trivial_acl_t masks; new_count = new_bytes = 0; isdir = (vtype == VDIR); acl_trivial_access_masks((mode_t)mode, isdir, &masks); newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); zacep = newnode->z_acldata; if (masks.allow0) { zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } if (masks.deny1) { zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } if (masks.deny2) { zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type))) { entry_type = (iflags & ACE_TYPE_FLAGS); /* * ACEs used to represent the file mode may be divided * into an equivalent pair of inherit-only and regular * ACEs, if they are inheritable. * Skip regular ACEs, which are replaced by the new mode. */ if (split && (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE)) { if (!isdir || !(iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) continue; /* * We preserve owner@, group@, or @everyone * permissions, if they are inheritable, by * copying them to inherit_only ACEs. This * prevents inheritable permissions from being * altered along with the file mode. */ iflags |= ACE_INHERIT_ONLY_ACE; } /* * If this ACL has any inheritable ACEs, mark that in * the hints (which are later masked into the pflags) * so create knows to do inheritance. */ if (isdir && (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) aclp->z_hints |= ZFS_INHERIT_ACE; if ((type != ALLOW && type != DENY) || (iflags & ACE_INHERIT_ONLY_ACE)) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: aclp->z_hints |= ZFS_ACL_OBJ_ACE; break; } } else { /* * Limit permissions granted by ACEs to be no greater * than permissions of the requested group mode. * Applies when the "aclmode" property is set to * "groupmask". */ if ((type == ALLOW) && trim) access_mask &= masks.group; } zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); ace_size = aclp->z_ops->ace_size(acep); zacep = (void *)((uintptr_t)zacep + ace_size); new_count++; new_bytes += ace_size; } zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); new_count += 3; new_bytes += abstract_size * 3; zfs_acl_release_nodes(aclp); aclp->z_acl_count = new_count; aclp->z_acl_bytes = new_bytes; newnode->z_ace_count = new_count; newnode->z_size = new_bytes; list_insert_tail(&aclp->z_acl, newnode); } int zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { int error = 0; mutex_enter(&zp->z_acl_lock); if (zp->z_zfsvfs->z_replay == B_FALSE) ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); else error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); if (error == 0) { (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); } mutex_exit(&zp->z_acl_lock); return (error); } /* * Should ACE be inherited? */ static int zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) { int iflags = (acep_flags & 0xf); if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) return (1); else if (iflags & ACE_FILE_INHERIT_ACE) return (!((vtype == VDIR) && (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); return (0); } /* * inherit inheritable ACEs from parent */ static zfs_acl_t * zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, uint64_t mode, boolean_t *need_chmod) { void *pacep = NULL; void *acep; zfs_acl_node_t *aclnode; zfs_acl_t *aclp = NULL; uint64_t who; uint32_t access_mask; uint16_t iflags, newflags, type; size_t ace_size; void *data1, *data2; size_t data1sz, data2sz; uint_t aclinherit; boolean_t isdir = (vtype == VDIR); boolean_t isreg = (vtype == VREG); *need_chmod = B_TRUE; aclp = zfs_acl_alloc(paclp->z_version); aclinherit = zfsvfs->z_acl_inherit; if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK) return (aclp); while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, &access_mask, &iflags, &type))) { /* * don't inherit bogus ACEs */ if (!zfs_acl_valid_ace_type(type, iflags)) continue; /* * Check if ACE is inheritable by this vnode */ if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || !zfs_ace_can_use(vtype, iflags)) continue; /* * If owner@, group@, or everyone@ inheritable * then zfs_acl_chmod() isn't needed. */ if ((aclinherit == ZFS_ACL_PASSTHROUGH || aclinherit == ZFS_ACL_PASSTHROUGH_X) && ((iflags & (ACE_OWNER|ACE_EVERYONE)) || ((iflags & OWNING_GROUP) == OWNING_GROUP)) && (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) *need_chmod = B_FALSE; /* * Strip inherited execute permission from file if * not in mode */ if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { access_mask &= ~ACE_EXECUTE; } /* * Strip write_acl and write_owner from permissions * when inheriting an ACE */ if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { access_mask &= ~RESTRICTED_CLEAR; } ace_size = aclp->z_ops->ace_size(pacep); aclnode = zfs_acl_node_alloc(ace_size); list_insert_tail(&aclp->z_acl, aclnode); acep = aclnode->z_acldata; zfs_set_ace(aclp, acep, access_mask, type, who, iflags|ACE_INHERITED_ACE); /* * Copy special opaque data if any */ if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { data2sz = aclp->z_ops->ace_data(acep, &data2); VERIFY3U(data2sz, ==, data1sz); memcpy(data2, data1, data2sz); } aclp->z_acl_count++; aclnode->z_ace_count++; aclp->z_acl_bytes += aclnode->z_size; newflags = aclp->z_ops->ace_flags_get(acep); /* * If ACE is not to be inherited further, or if the vnode is * not a directory, remove all inheritance flags */ if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { newflags &= ~ALL_INHERIT; aclp->z_ops->ace_flags_set(acep, newflags|ACE_INHERITED_ACE); continue; } /* * This directory has an inheritable ACE */ aclp->z_hints |= ZFS_INHERIT_ACE; /* * If only FILE_INHERIT is set then turn on * inherit_only */ if ((iflags & (ACE_FILE_INHERIT_ACE | ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { newflags |= ACE_INHERIT_ONLY_ACE; aclp->z_ops->ace_flags_set(acep, newflags|ACE_INHERITED_ACE); } else { newflags &= ~ACE_INHERIT_ONLY_ACE; aclp->z_ops->ace_flags_set(acep, newflags|ACE_INHERITED_ACE); } } if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && aclp->z_acl_count != 0) { *need_chmod = B_FALSE; } return (aclp); } /* * Create file system object initial permissions * including inheritable ACEs. * Also, create FUIDs for owner and group. */ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids, zuserns_t *mnt_ns) { int error; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zfs_acl_t *paclp; gid_t gid; boolean_t need_chmod = B_TRUE; boolean_t trim = B_FALSE; boolean_t inherited = B_FALSE; if ((flag & IS_ROOT_NODE) == 0) { if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); } else ASSERT3P(dzp->z_vnode, ==, NULL); memset(acl_ids, 0, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); if (vsecp) if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) return (error); /* * Determine uid and gid. */ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || ((flag & IS_XATTR) && (vap->va_type == VDIR))) { acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &acl_ids->z_fuidp); acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; } else { uid_t id = crgetuid(cr); if (IS_EPHEMERAL(id)) id = UID_NOBODY; acl_ids->z_fuid = (uint64_t)id; acl_ids->z_fgid = 0; if (vap->va_mask & AT_GID) { acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; if (acl_ids->z_fgid != dzp->z_gid && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) acl_ids->z_fgid = 0; } if (acl_ids->z_fgid == 0) { const char *domain; uint32_t rid; acl_ids->z_fgid = dzp->z_gid; gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); if (zfsvfs->z_use_fuids && IS_EPHEMERAL(acl_ids->z_fgid)) { domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, FUID_INDEX(acl_ids->z_fgid)); rid = FUID_RID(acl_ids->z_fgid); zfs_fuid_node_add(&acl_ids->z_fuidp, domain, rid, FUID_INDEX(acl_ids->z_fgid), acl_ids->z_fgid, ZFS_GROUP); } } } /* * If we're creating a directory, and the parent directory has the * set-GID bit set, set in on the new directory. * Otherwise, if the user is neither privileged nor a member of the * file's new group, clear the file's set-GID bit. */ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && (vap->va_type == VDIR)) { acl_ids->z_mode |= S_ISGID; } else { if ((acl_ids->z_mode & S_ISGID) && secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) acl_ids->z_mode &= ~S_ISGID; } if (acl_ids->z_aclp == NULL) { mutex_enter(&dzp->z_acl_lock); if (!(flag & IS_ROOT_NODE) && (dzp->z_pflags & ZFS_INHERIT_ACE) && !(dzp->z_pflags & ZFS_XATTR)) { VERIFY0(zfs_acl_node_read(dzp, B_TRUE, &paclp, B_FALSE)); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_type, paclp, acl_ids->z_mode, &need_chmod); inherited = B_TRUE; } else { acl_ids->z_aclp = zfs_acl_alloc(zfs_acl_version_zp(dzp)); acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } mutex_exit(&dzp->z_acl_lock); if (need_chmod) { if (vap->va_type == VDIR) acl_ids->z_aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) trim = B_TRUE; zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, trim, acl_ids->z_aclp); } } if (inherited || vsecp) { acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, acl_ids->z_fuid, acl_ids->z_fgid); if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } return (0); } /* * Free ACL and fuid_infop, but not the acl_ids structure */ void zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) { if (acl_ids->z_aclp) zfs_acl_free(acl_ids->z_aclp); if (acl_ids->z_fuidp) zfs_fuid_info_free(acl_ids->z_fuidp); acl_ids->z_aclp = NULL; acl_ids->z_fuidp = NULL; } boolean_t zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) { return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); } /* * Retrieve a file's ACL */ int zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfs_acl_t *aclp; ulong_t mask; int error; int count = 0; int largeace = 0; mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); if (mask == 0) return (SET_ERROR(ENOSYS)); if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr, NULL))) return (error); mutex_enter(&zp->z_acl_lock); if (zp->z_zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(ZTOV(zp), __func__); error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } /* * Scan ACL to determine number of ACEs */ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { void *zacep = NULL; uint64_t who; uint32_t access_mask; uint16_t type, iflags; while ((zacep = zfs_acl_next_ace(aclp, zacep, &who, &access_mask, &iflags, &type))) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: largeace++; continue; default: count++; } } vsecp->vsa_aclcnt = count; } else count = (int)aclp->z_acl_count; if (mask & VSA_ACECNT) { vsecp->vsa_aclcnt = count; } if (mask & VSA_ACE) { size_t aclsz; aclsz = count * sizeof (ace_t) + sizeof (ace_object_t) * largeace; vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); vsecp->vsa_aclentsz = aclsz; if (aclp->z_version == ZFS_ACL_VERSION_FUID) zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); else { zfs_acl_node_t *aclnode; void *start = vsecp->vsa_aclentp; for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { memcpy(start, aclnode->z_acldata, aclnode->z_size); start = (caddr_t)start + aclnode->z_size; } ASSERT3U((caddr_t)start - (caddr_t)vsecp->vsa_aclentp, ==, aclp->z_acl_bytes); } } if (mask & VSA_ACE_ACLFLAGS) { vsecp->vsa_aclflags = 0; if (zp->z_pflags & ZFS_ACL_DEFAULTED) vsecp->vsa_aclflags |= ACL_DEFAULTED; if (zp->z_pflags & ZFS_ACL_PROTECTED) vsecp->vsa_aclflags |= ACL_PROTECTED; if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; } mutex_exit(&zp->z_acl_lock); return (0); } int zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_type, vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) { zfs_acl_t *aclp; zfs_acl_node_t *aclnode; int aclcnt = vsecp->vsa_aclcnt; int error; if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) return (SET_ERROR(EINVAL)); aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); aclp->z_hints = 0; aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, &aclnode->z_size)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); } } else { if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, &aclnode->z_size, fuidp, cr)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); } } aclp->z_acl_bytes = aclnode->z_size; aclnode->z_ace_count = aclcnt; aclp->z_acl_count = aclcnt; list_insert_head(&aclp->z_acl, aclnode); /* * If flags are being set then add them to z_hints */ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { if (vsecp->vsa_aclflags & ACL_PROTECTED) aclp->z_hints |= ZFS_ACL_PROTECTED; if (vsecp->vsa_aclflags & ACL_DEFAULTED) aclp->z_hints |= ZFS_ACL_DEFAULTED; if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; } *zaclp = aclp; return (0); } /* * Set a file's ACL */ int zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); dmu_tx_t *tx; int error; zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; boolean_t fuid_dirtied; uint64_t acl_obj; if (zp->z_zfsvfs->z_replay == B_FALSE) ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (mask == 0) return (SET_ERROR(ENOSYS)); if (zp->z_pflags & ZFS_IMMUTABLE) return (SET_ERROR(EPERM)); if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, NULL))) return (error); error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, &aclp); if (error) return (error); /* * If ACL wide flags aren't being set then preserve any * existing flags. */ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { aclp->z_hints |= (zp->z_pflags & V4_ACL_WIDE_FLAGS); } top: mutex_enter(&zp->z_acl_lock); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); /* * If old version and ACL won't fit in bonus and we aren't * upgrading then take out necessary DMU holds */ if ((acl_obj = zfs_external_acl(zp)) != 0) { if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); zfs_acl_free(aclp); return (error); } error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(error); ASSERT3P(zp->z_acl_cached, ==, NULL); zp->z_acl_cached = aclp; if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); mutex_exit(&zp->z_acl_lock); return (error); } /* * Check accesses of interest (AoI) against attributes of the dataset * such as read-only. Returns zero if no AoI conflict with dataset * attributes, otherwise an appropriate errno is returned. */ static int zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) { if ((v4_mode & WRITE_MASK) && (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && (!IS_DEVVP(ZTOV(zp)) || (v4_mode & WRITE_MASK_ATTRS))) { return (SET_ERROR(EROFS)); } /* * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common(). */ if ((v4_mode & WRITE_MASK_DATA) && (zp->z_pflags & ZFS_IMMUTABLE)) { return (SET_ERROR(EPERM)); } /* * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK * (sunlnk) is set. We just don't allow directory removal, which is * handled in zfs_zaccess_delete(). */ if ((v4_mode & ACE_DELETE) && (zp->z_pflags & ZFS_NOUNLINK)) { return (EPERM); } if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && (zp->z_pflags & ZFS_AV_QUARANTINED))) { return (SET_ERROR(EACCES)); } return (0); } /* * The primary usage of this function is to loop through all of the * ACEs in the znode, determining what accesses of interest (AoI) to * the caller are allowed or denied. The AoI are expressed as bits in * the working_mode parameter. As each ACE is processed, bits covered * by that ACE are removed from the working_mode. This removal * facilitates two things. The first is that when the working mode is * empty (= 0), we know we've looked at all the AoI. The second is * that the ACE interpretation rules don't allow a later ACE to undo * something granted or denied by an earlier ACE. Removing the * discovered access or denial enforces this rule. At the end of * processing the ACEs, all AoI that were found to be denied are * placed into the working_mode, giving the caller a mask of denied * accesses. Returns: * 0 if all AoI granted * EACCESS if the denied mask is non-zero * other error if abnormal failure (e.g., IO error) * * A secondary usage of the function is to determine if any of the * AoI are granted. If an ACE grants any access in * the working_mode, we immediately short circuit out of the function. * This mode is chosen by setting anyaccess to B_TRUE. The * working_mode is not a denied access mask upon exit if the function * is used in this manner. */ static int zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, boolean_t anyaccess, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_acl_t *aclp; int error; uid_t uid = crgetuid(cr); uint64_t who; uint16_t type, iflags; uint16_t entry_type; uint32_t access_mask; uint32_t deny_mask = 0; zfs_ace_hdr_t *acep = NULL; boolean_t checkit; uid_t gowner; uid_t fowner; zfs_fuid_map_ids(zp, cr, &fowner, &gowner); mutex_enter(&zp->z_acl_lock); if (zp->z_zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(ZTOV(zp), __func__); error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } ASSERT3P(zp->z_acl_cached, !=, NULL); while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type))) { uint32_t mask_matched; if (!zfs_acl_valid_ace_type(type, iflags)) continue; if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) continue; /* Skip ACE if it does not affect any AoI */ mask_matched = (access_mask & *working_mode); if (!mask_matched) continue; entry_type = (iflags & ACE_TYPE_FLAGS); checkit = B_FALSE; switch (entry_type) { case ACE_OWNER: if (uid == fowner) checkit = B_TRUE; break; case OWNING_GROUP: who = gowner; zfs_fallthrough; case ACE_IDENTIFIER_GROUP: checkit = zfs_groupmember(zfsvfs, who, cr); break; case ACE_EVERYONE: checkit = B_TRUE; break; /* USER Entry */ default: if (entry_type == 0) { uid_t newid; newid = zfs_fuid_map_id(zfsvfs, who, cr, ZFS_ACE_USER); if (newid != UID_NOBODY && uid == newid) checkit = B_TRUE; break; } else { mutex_exit(&zp->z_acl_lock); return (SET_ERROR(EIO)); } } if (checkit) { if (type == DENY) { DTRACE_PROBE3(zfs__ace__denies, znode_t *, zp, zfs_ace_hdr_t *, acep, uint32_t, mask_matched); deny_mask |= mask_matched; } else { DTRACE_PROBE3(zfs__ace__allows, znode_t *, zp, zfs_ace_hdr_t *, acep, uint32_t, mask_matched); if (anyaccess) { mutex_exit(&zp->z_acl_lock); return (0); } } *working_mode &= ~mask_matched; } /* Are we done? */ if (*working_mode == 0) break; } mutex_exit(&zp->z_acl_lock); /* Put the found 'denies' back on the working mode */ if (deny_mask) { *working_mode |= deny_mask; return (SET_ERROR(EACCES)); } else if (*working_mode) { return (-1); } return (0); } /* * Return true if any access whatsoever granted, we don't actually * care what access is granted. */ boolean_t zfs_has_access(znode_t *zp, cred_t *cr) { uint32_t have = ACE_ALL_PERMS; if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { uid_t owner; owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); } return (B_TRUE); } static int zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; int err; *working_mode = v4_mode; *check_privs = B_TRUE; /* * Short circuit empty requests */ if (v4_mode == 0 || zfsvfs->z_replay) { *working_mode = 0; return (0); } if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { *check_privs = B_FALSE; return (err); } /* * The caller requested that the ACL check be skipped. This * would only happen if the caller checked VOP_ACCESS() with a * 32 bit ACE mask and already had the appropriate permissions. */ if (skipaclchk) { *working_mode = 0; return (0); } /* * Note: ZFS_READONLY represents the "DOS R/O" attribute. * When that flag is set, we should behave as if write access * were not granted by anything in the ACL. In particular: * We _must_ allow writes after opening the file r/w, then * setting the DOS R/O attribute, and writing some more. * (Similar to how you can write after fchmod(fd, 0444).) * * Therefore ZFS_READONLY is ignored in the dataset check * above, and checked here as if part of the ACL check. * Also note: DOS R/O is ignored for directories. */ if ((v4_mode & WRITE_MASK_DATA) && (ZTOV(zp)->v_type != VDIR) && (zp->z_pflags & ZFS_READONLY)) { return (SET_ERROR(EPERM)); } return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); } static int zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, cred_t *cr) { if (*working_mode != ACE_WRITE_DATA) return (SET_ERROR(EACCES)); return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, check_privs, B_FALSE, cr)); } /* * Check if VEXEC is allowed. * * This routine is based on zfs_fastaccesschk_execute which has slowpath * calling zfs_zaccess. This would be incorrect on FreeBSD (see * zfs_freebsd_access for the difference). Thus this variant let's the * caller handle the slowpath (if necessary). * * On top of that we perform a lockless check for ZFS_NO_EXECS_DENIED. * * Safe access to znode_t is provided by the vnode lock. */ int zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) { boolean_t is_attr; if (zdp->z_pflags & ZFS_AV_QUARANTINED) return (1); is_attr = ((zdp->z_pflags & ZFS_XATTR) && (ZTOV(zdp)->v_type == VDIR)); if (is_attr) return (1); if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) return (0); return (1); } /* * Determine whether Access should be granted/denied. * * The least priv subsystem is always consulted as a basic privilege * can define any form of access. */ int zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, zuserns_t *mnt_ns) { uint32_t working_mode; int error; int is_attr; boolean_t check_privs; znode_t *xzp = NULL; znode_t *check_zp = zp; mode_t needed_bits; uid_t owner; is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); /* * In FreeBSD, we don't care about permissions of individual ADS. * Note that not checking them is not just an optimization - without * this shortcut, EA operations may bogusly fail with EACCES. */ if (zp->z_pflags & ZFS_XATTR) return (0); owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); /* * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC * in needed_bits. Map the bits mapped by working_mode (currently * missing) in missing_bits. * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), * needed_bits. */ needed_bits = 0; working_mode = mode; if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && owner == crgetuid(cr)) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) needed_bits |= VREAD; if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) needed_bits |= VWRITE; if (working_mode & ACE_EXECUTE) needed_bits |= VEXEC; if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, &check_privs, skipaclchk, cr)) == 0) { if (is_attr) VN_RELE(ZTOV(xzp)); return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, needed_bits, needed_bits)); } if (error && !check_privs) { if (is_attr) VN_RELE(ZTOV(xzp)); return (error); } if (error && (flags & V_APPEND)) { error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); } if (error && check_privs) { mode_t checkmode = 0; vnode_t *check_vp = ZTOV(check_zp); /* * First check for implicit owner permission on * read_acl/read_attributes */ - error = 0; ASSERT3U(working_mode, !=, 0); if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && owner == crgetuid(cr))) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) checkmode |= VREAD; if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) checkmode |= VWRITE; if (working_mode & ACE_EXECUTE) checkmode |= VEXEC; error = secpolicy_vnode_access2(cr, check_vp, owner, needed_bits & ~checkmode, needed_bits); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) error = secpolicy_vnode_chown(check_vp, cr, owner); if (error == 0 && (working_mode & ACE_WRITE_ACL)) error = secpolicy_vnode_setdac(check_vp, cr, owner); if (error == 0 && (working_mode & (ACE_DELETE|ACE_DELETE_CHILD))) error = secpolicy_vnode_remove(check_vp, cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { error = secpolicy_vnode_chown(check_vp, cr, owner); } if (error == 0) { /* * See if any bits other than those already checked * for are still present. If so then return EACCES */ if (working_mode & ~(ZFS_CHECKED_MASKS)) { error = SET_ERROR(EACCES); } } } else if (error == 0) { error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, needed_bits, needed_bits); } if (is_attr) VN_RELE(ZTOV(xzp)); return (error); } /* * Translate traditional unix VREAD/VWRITE/VEXEC mode into * NFSv4-style ZFS ACL format and call zfs_zaccess() */ int zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr, zuserns_t *mnt_ns) { return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr, mnt_ns)); } /* * Access function for secpolicy_vnode_setattr */ int zfs_zaccess_unix(void *zp, int mode, cred_t *cr) { int v4_mode = zfs_unix_to_v4(mode >> 6); return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr, NULL)); } static int zfs_delete_final_check(znode_t *zp, znode_t *dzp, mode_t available_perms, cred_t *cr) { int error; uid_t downer; downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); error = secpolicy_vnode_access2(cr, ZTOV(dzp), downer, available_perms, VWRITE|VEXEC); if (error == 0) error = zfs_sticky_remove_access(dzp, zp, cr); return (error); } /* * Determine whether Access should be granted/deny, without * consulting least priv subsystem. * * The following chart is the recommended NFSv4 enforcement for * ability to delete an object. * * ------------------------------------------------------- * | Parent Dir | Target Object Permissions | * | permissions | | * ------------------------------------------------------- * | | ACL Allows | ACL Denies| Delete | * | | Delete | Delete | unspecified| * ------------------------------------------------------- * | ACL Allows | Permit | Permit | Permit | * | DELETE_CHILD | | * ------------------------------------------------------- * | ACL Denies | Permit | Deny | Deny | * | DELETE_CHILD | | | | * ------------------------------------------------------- * | ACL specifies | | | | * | only allow | Permit | Permit | Permit | * | write and | | | | * | execute | | | | * ------------------------------------------------------- * | ACL denies | | | | * | write and | Permit | Deny | Deny | * | execute | | | | * ------------------------------------------------------- * ^ * | * No search privilege, can't even look up file? * */ int zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zuserns_t *mnt_ns) { uint32_t dzp_working_mode = 0; uint32_t zp_working_mode = 0; int dzp_error, zp_error; mode_t available_perms; boolean_t dzpcheck_privs = B_TRUE; boolean_t zpcheck_privs = B_TRUE; /* * We want specific DELETE permissions to * take precedence over WRITE/EXECUTE. We don't * want an ACL such as this to mess us up. * user:joe:write_data:deny,user:joe:delete:allow * * However, deny permissions may ultimately be overridden * by secpolicy_vnode_access(). * * We will ask for all of the necessary permissions and then * look at the working modes from the directory and target object * to determine what was found. */ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) return (SET_ERROR(EPERM)); /* * First row * If the directory permissions allow the delete, we are done. */ if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) return (0); /* * If target object has delete permission then we are done */ if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, &zpcheck_privs, B_FALSE, cr)) == 0) return (0); ASSERT(dzp_error); ASSERT(zp_error); if (!dzpcheck_privs) return (dzp_error); if (!zpcheck_privs) return (zp_error); /* * Second row * * If directory returns EACCES then delete_child was denied * due to deny delete_child. In this case send the request through * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() * since that *could* allow the delete based on write/execute permission * and we want delete permissions to override write/execute. */ if (dzp_error == EACCES) { /* XXXPJD: s/dzp/zp/ ? */ return (secpolicy_vnode_remove(ZTOV(dzp), cr)); } /* * Third Row * only need to see if we have write/execute on directory. */ dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); if (dzp_error != 0 && !dzpcheck_privs) return (dzp_error); /* * Fourth row */ available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE; available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC; return (zfs_delete_final_check(zp, dzp, available_perms, cr)); } int zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, znode_t *tzp, cred_t *cr, zuserns_t *mnt_ns) { int add_perm; int error; if (szp->z_pflags & ZFS_AV_QUARANTINED) return (SET_ERROR(EACCES)); add_perm = (ZTOV(szp)->v_type == VDIR) ? ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; /* * Rename permissions are combination of delete permission + * add file/subdir permission. * * BSD operating systems also require write permission * on the directory being moved from one parent directory * to another. */ if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr, mnt_ns))) return (error); } /* * first make sure we do the delete portion. * * If that succeeds then check for add_file/add_subdir permissions */ if ((error = zfs_zaccess_delete(sdzp, szp, cr, mnt_ns))) return (error); /* * If we have a tzp, see if we can delete it? */ if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr, mnt_ns))) return (error); /* * Now check for add permissions */ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr, mnt_ns); return (error); } diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index 957530243156..e691e60a8f92 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -1,2114 +1,2115 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2011 Martin Matuska */ #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" /* Used by fstat(1). */ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)"); /* * Define ZNODE_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. */ #ifdef ZFS_DEBUG #define ZNODE_STATS #endif /* DEBUG */ #ifdef ZNODE_STATS #define ZNODE_STAT_ADD(stat) ((stat)++) #else #define ZNODE_STAT_ADD(stat) /* nothing */ #endif /* ZNODE_STATS */ /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL #if !defined(KMEM_DEBUG) && __FreeBSD_version >= 1300102 #define _ZFS_USE_SMR static uma_zone_t znode_uma_zone; #else static kmem_cache_t *znode_cache = NULL; #endif extern struct vop_vector zfs_vnodeops; extern struct vop_vector zfs_fifoops; extern struct vop_vector zfs_shareops; /* * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on * z_rangelock. It will modify the offset and length of the lock to reflect * znode-specific information, and convert RL_APPEND to RL_WRITER. This is * called with the rangelock_t's rl_lock held, which avoids races. */ static void zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; /* * If in append mode, convert to writer and lock starting at the * current end of file. */ if (new->lr_type == RL_APPEND) { new->lr_offset = zp->z_size; new->lr_type = RL_WRITER; } /* * If we need to grow the block size then lock the whole file range. */ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { new->lr_offset = 0; new->lr_length = UINT64_MAX; } } static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { znode_t *zp = buf; POINTER_INVALIDATE(&zp->z_zfsvfs); list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_vnode = NULL; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; return (0); } static void zfs_znode_cache_destructor(void *buf, void *arg) { (void) arg; znode_t *zp = buf; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); ASSERT3P(zp->z_vnode, ==, NULL); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } #ifdef _ZFS_USE_SMR VFS_SMR_DECLARE; static int zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private, int flags) { return (zfs_znode_cache_constructor(mem, private, flags)); } static void zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private) { zfs_znode_cache_destructor(mem, private); } void zfs_znode_init(void) { /* * Initialize zcache */ ASSERT3P(znode_uma_zone, ==, NULL); znode_uma_zone = uma_zcreate("zfs_znode_cache", sizeof (znode_t), zfs_znode_cache_constructor_smr, zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0); VFS_SMR_ZONE_SET(znode_uma_zone); } static znode_t * zfs_znode_alloc_kmem(int flags) { return (uma_zalloc_smr(znode_uma_zone, flags)); } static void zfs_znode_free_kmem(znode_t *zp) { if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } uma_zfree_smr(znode_uma_zone, zp); } #else void zfs_znode_init(void) { /* * Initialize zcache */ ASSERT3P(znode_cache, ==, NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); } static znode_t * zfs_znode_alloc_kmem(int flags) { return (kmem_cache_alloc(znode_cache, flags)); } static void zfs_znode_free_kmem(znode_t *zp) { if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } kmem_cache_free(znode_cache, zp); } #endif void zfs_znode_fini(void) { /* * Cleanup zcache */ #ifdef _ZFS_USE_SMR if (znode_uma_zone) { uma_zdestroy(znode_uma_zone); znode_uma_zone = NULL; } #else if (znode_cache) { kmem_cache_destroy(znode_cache); znode_cache = NULL; } #endif } static int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { zfs_acl_ids_t acl_ids; vattr_t vattr; znode_t *sharezp; znode_t *zp; int error; vattr.va_mask = AT_MODE|AT_UID|AT_GID; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0555; vattr.va_uid = crgetuid(kcred); vattr.va_gid = crgetgid(kcred); sharezp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; sharezp->z_zfsvfs = zfsvfs; sharezp->z_is_sa = zfsvfs->z_use_sa; VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, kcred, NULL, &acl_ids, NULL)); zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, sharezp); POINTER_INVALIDATE(&sharezp->z_zfsvfs); error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); zfsvfs->z_shares_dir = sharezp->z_id; zfs_acl_ids_free(&acl_ids); sa_handle_destroy(sharezp->z_sa_hdl); zfs_znode_free_kmem(sharezp); return (error); } /* * define a couple of values we need available * for both 64 and 32 bit environments. */ #ifndef NBITSMINOR64 #define NBITSMINOR64 32 #endif #ifndef MAXMAJ64 #define MAXMAJ64 0xffffffffUL #endif #ifndef MAXMIN64 #define MAXMIN64 0xffffffffUL #endif /* * Create special expldev for ZFS private use. * Can't use standard expldev since it doesn't do * what we want. The standard expldev() takes a * dev32_t in LP64 and expands it to a long dev_t. * We need an interface that takes a dev32_t in ILP32 * and expands it to a long dev_t. */ static uint64_t zfs_expldev(dev_t dev) { return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); } /* * Special cmpldev for ZFS private use. * Can't use standard cmpldev since it takes * a long dev_t and compresses it to dev32_t in * LP64. We need to do a compaction of a long dev_t * to a dev32_t in ILP32. */ dev_t zfs_cmpldev(uint64_t dev) { return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); ASSERT3P(zp->z_sa_hdl, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); if (sa_hdl == NULL) { VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; /* * Slap on VROOT if we are the root znode unless we are the root * node of a snapshot mounted under .zfs. */ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) ZTOV(zp)->v_flag |= VROOT; vn_exists(ZTOV(zp)); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || zp->z_unlinked || ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zp->z_zfsvfs)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } static void zfs_vnode_forget(vnode_t *vp) { /* copied from insmntque_stddtr */ vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Construct a new znode/vnode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; vnode_t *vp; uint64_t mode; uint64_t parent; #ifdef notyet uint64_t mtime[2], ctime[2]; #endif uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[9]; int count = 0; int error; zp = zfs_znode_alloc_kmem(KM_SLEEP); #ifndef _ZFS_USE_SMR KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0, ("%s: fast path lookup enabled without smr", __func__)); #endif #if __FreeBSD_version >= 1300076 KASSERT(curthread->td_vp_reserved != NULL, ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); #else KASSERT(curthread->td_vp_reserv > 0, ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); #endif error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); if (error != 0) { zfs_znode_free_kmem(zp); return (NULL); } zp->z_vnode = vp; vp->v_data = zp; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); zp->z_sa_hdl = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; #if __FreeBSD_version >= 1300139 atomic_store_ptr(&zp->z_cached_symlink, NULL); #endif vp = ZTOV(zp); zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, 16); #ifdef notyet SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); #endif SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, 8); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && (zp->z_pflags & ZFS_PROJID) && sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zfs_vnode_forget(vp); zp->z_vnode = NULL; zfs_znode_free_kmem(zp); return (NULL); } zp->z_projid = projid; zp->z_mode = mode; /* Cache the xattr parent id */ if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; vp->v_type = IFTOVT((mode_t)mode); switch (vp->v_type) { case VDIR: zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ break; case VFIFO: vp->v_op = &zfs_fifoops; break; case VREG: if (parent == zfsvfs->z_shares_dir) { ASSERT0(zp->z_uid); ASSERT0(zp->z_gid); vp->v_op = &zfs_shareops; } break; default: break; } mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes++; zp->z_zfsvfs = zfsvfs; mutex_exit(&zfsvfs->z_znodes_lock); /* * Acquire vnode lock before making it available to the world. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #if __FreeBSD_version >= 1400077 vn_set_state(vp, VSTATE_CONSTRUCTED); #endif VN_LOCK_AREC(vp); if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); return (zp); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute * bonuslen - length of bonus buffer * setaclp - File/Dir initial ACL * fuidp - Tracks fuid allocation. * * OUT: zpp - allocated znode * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t dzp_pflags = 0; uint64_t rdev = 0; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; dmu_buf_t *db; timestruc_t now; uint64_t gen, obj; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; ASSERT3P(vap, !=, NULL); ASSERT3U((vap->va_mask & AT_MODE), ==, AT_MODE); if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; vfs_timestamp(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (vap->va_type == VDIR) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } else { dzp_pflags = dzp->z_pflags; } /* * If parent is an xattr, so am I. */ if (dzp_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (vap->va_type == VDIR) { size = 2; /* contents ("." and "..") */ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; } else { size = links = 0; } if (vap->va_type == VBLK || vap->va_type == VCHR) { rdev = zfs_expldev(vap->va_rdev); } parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; /* * No execs denied will be determined when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } if (obj_type == DMU_OT_ZNODE || (vap->va_type == VBLK || vap->va_type == VCHR)) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx)); if (!(flag & IS_ROOT_NODE)) { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); ASSERT3P(*zpp, !=, NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = mode; (*zpp)->z_dnodesize = dnodesize; if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { vnode_t *vp = ZTOV(*zpp); vp->v_vflag |= VV_FORCEINSMQ; int err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; (void) err; KASSERT(err == 0, ("insmntque() failed: error %d", err)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; xoap = xva_getxoptattr(xvap); ASSERT3P(xoap, !=, NULL); if (zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_IN_SEQC(ZTOV(zp)); } if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; vnode_t *vp; sa_handle_t *hdl; int locked; int err; getnewvnode_reserve_(); again: *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { err = SET_ERROR(ENOENT); } else { vp = ZTOV(zp); /* * Don't let the vnode disappear after * ZFS_OBJ_HOLD_EXIT. */ VN_HOLD(vp); *zpp = zp; err = 0; } sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); if (err) { getnewvnode_drop_reserve(); return (err); } locked = VOP_ISLOCKED(vp); VI_LOCK(vp); if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) { /* * The vnode is doomed and this thread doesn't * hold the exclusive lock on it, so the vnode * must be being reclaimed by another thread. * Otherwise the doomed vnode is being reclaimed * by this thread and zfs_zget is called from * ZIL internals. */ VI_UNLOCK(vp); /* * XXX vrele() locks the vnode when the last reference * is dropped. Although in this case the vnode is * doomed / dead and so no inactivation is required, * the vnode lock is still acquired. That could result * in a LOR with z_teardown_lock if another thread holds * the vnode's lock and tries to take z_teardown_lock. * But that is only possible if the other thread peforms * a ZFS vnode operation on the vnode. That either * should not happen if the vnode is dead or the thread * should also have a reference to the vnode and thus * our reference is not last. */ VN_RELE(vp); goto again; } VI_UNLOCK(vp); getnewvnode_drop_reserve(); return (err); } /* * Not found create new znode/vnode * but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } if (err == 0) { vnode_t *vp = ZTOV(zp); err = insmntque(vp, zfsvfs->z_vfs); if (err == 0) { vp->v_hash = obj_num; VOP_UNLOCK1(vp); } else { zp->z_vnode = NULL; zfs_znode_dmu_fini(zp); zfs_znode_free(zp); *zpp = NULL; } } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_object_info_t doi; dmu_buf_t *db; vnode_t *vp; uint64_t obj_num = zp->z_id; uint64_t mode, size; sa_bulk_attr_t bulk[8]; int err; int count = 0; uint64_t gen; /* * Remove cached pages before reloading the znode, so that they are not * lingering after we run into any error. Ideally, we should vgone() * the vnode in case of error, but currently we cannot do that * because of the LOR between the vnode lock and z_teardown_lock. * So, instead, we have to "doom" the znode in the illumos style. * * Ignore invalid pages during the scan. This is to avoid deadlocks * between page busying and the teardown lock, as pages are busied prior * to a VOP_GETPAGES operation, which acquires the teardown read lock. * Such pages will be invalid and can safely be skipped here. */ vp = ZTOV(zp); #if __FreeBSD_version >= 1400042 vn_pages_remove_valid(vp, 0, 0); #else vn_pages_remove(vp, 0, 0); #endif ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } rw_exit(&zp->z_xattr_lock); ASSERT3P(zp->z_sa_hdl, ==, NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); size = zp->z_size; /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, sizeof (zp->z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, sizeof (zp->z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } zp->z_mode = mode; if (gen != zp->z_gen) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * It is highly improbable but still quite possible that two * objects in different datasets are created with the same * object numbers and in transaction groups with the same * numbers. znodes corresponding to those objects would * have the same z_id and z_gen, but their other attributes * may be different. * zfs recv -F may replace one of such objects with the other. * As a result file properties recorded in the replaced * object's vnode may no longer match the received object's * properties. At present the only cached property is the * files type recorded in v_type. * So, handle this case by leaving the old vnode and znode * disassociated from the actual object. A new vnode and a * znode will be created if the object is accessed * (e.g. via a look-up). The old vnode and znode will be * recycled when the last vnode reference is dropped. */ if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatically removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. */ zp->z_unlinked = (zp->z_links == 0); if (zp->z_unlinked) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } zp->z_blksz = doi.doi_data_block_size; if (zp->z_size != size) vnode_pager_setsize(vp, zp->z_size); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY0(dmu_object_free(os, acl_obj, tx)); } VERIFY0(dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); zfs_znode_free(zp); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; ASSERT3P(zp->z_sa_hdl, !=, NULL); /* * Don't allow a zfs_zget() while were trying to release this znode */ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set. */ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_rmnode(zp); return; } } zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_znode_free(zp); } void zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; #if __FreeBSD_version >= 1300139 char *symlink; #endif ASSERT3P(zp->z_sa_hdl, ==, NULL); zp->z_vnode = NULL; mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes--; mutex_exit(&zfsvfs->z_znodes_lock); #if __FreeBSD_version >= 1300139 symlink = atomic_load_ptr(&zp->z_cached_symlink); if (symlink != NULL) { atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)NULL); cache_symlink_free(symlink, strlen(symlink) + 1); } #endif if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } zfs_znode_free_kmem(zp); } void zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2], boolean_t have_tx) { timestruc_t now; vfs_timestamp(&now); if (have_tx) { /* will sa_bulk_update happen really soon? */ zp->z_atime_dirty = 0; zp->z_seq++; } else { zp->z_atime_dirty = 1; } if (flag & AT_ATIME) { ZFS_TIME_ENCODE(&now, zp->z_atime); } if (flag & AT_MTIME) { ZFS_TIME_ENCODE(&now, mtime); if (zp->z_zfsvfs->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & AT_CTIME) { ZFS_TIME_ENCODE(&now, ctime); if (zp->z_zfsvfs->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE); } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_tx_t *tx; zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), &zp->z_size, sizeof (zp->z_size), tx)); vnode_pager_setsize(ZTOV(zp), end); zfs_rangelock_exit(lr); dmu_tx_commit(tx); return (0); } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); if (error == 0) { #if __FreeBSD_version >= 1400032 vnode_pager_purge_range(ZTOV(zp), off, off + len); #else /* * Before __FreeBSD_version 1400032 we cannot free block in the * middle of a file, but only at the end of a file, so this code * path should never happen. */ vnode_pager_setsize(ZTOV(zp), off); #endif } zfs_rangelock_exit(lr); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); dmu_tx_commit(tx); /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ vnode_pager_setsize(vp, end); zfs_rangelock_exit(lr); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; else return (error); } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) return (error); log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(error); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); return (0); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int error; int i; znode_t *rootzp = NULL; zfsvfs_t *zfsvfs; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT0(error); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; char *name; ASSERT3S(nvpair_type(elem), ==, DATA_TYPE_UINT64); val = fnvpair_value_uint64(elem); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT0(error); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT3U(version, !=, 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + ASSERT0(error); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT0(error); /* * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. */ VATTR_NULL(&vattr); vattr.va_mask = AT_MODE|AT_UID|AT_GID; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); rootzp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT0(error); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); rootzp->z_zfsvfs = zfsvfs; VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids, NULL)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT0(error); zfs_acl_ids_free(&acl_ids); POINTER_INVALIDATE(&rootzp->z_zfsvfs); sa_handle_destroy(rootzp->z_sa_hdl); zfs_znode_free_kmem(rootzp); /* * Create shares directory */ error = zfs_create_share_dir(zfsvfs, tx); ASSERT0(error); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } #endif /* _KERNEL */ static int zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) { uint64_t sa_obj = 0; int error; error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0 && error != ENOENT) return (error); error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); return (error); } static int zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, dmu_buf_t **db, const void *tag) { dmu_object_info_t doi; int error; if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) return (error); dmu_object_info_from_db(*db, &doi); if ((doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t))) { sa_buf_rele(*db, tag); return (SET_ERROR(ENOTSUP)); } error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); if (error != 0) { sa_buf_rele(*db, tag); return (error); } return (0); } static void zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag) { sa_handle_destroy(hdl); sa_buf_rele(db, tag); } /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. */ static int zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; uint64_t parent_mode; sa_bulk_attr_t bulk[3]; sa_handle_t *sa_hdl; dmu_buf_t *sa_db; int count = 0; int error; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, &parent, sizeof (parent)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, &pflags, sizeof (pflags)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &mode, sizeof (mode)); if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); /* * When a link is removed its parent pointer is not changed and will * be invalid. There are two cases where a link is removed but the * file stays around, when it goes to the delete queue and when there * are additional links. */ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); if (error != 0) return (error); error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); if (error != 0) return (error); *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); /* * Extended attributes can be applied to files, directories, etc. * Otherwise the parent must be a directory. */ if (!*is_xattrdir && !S_ISDIR(parent_mode)) return (SET_ERROR(EINVAL)); *pobjp = parent; return (0); } /* * Given an object number, return some zpl level statistics */ static int zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, zfs_stat_t *sb) { sa_bulk_attr_t bulk[4]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &sb->zs_mode, sizeof (sb->zs_mode)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, &sb->zs_gen, sizeof (sb->zs_gen)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, &sb->zs_links, sizeof (sb->zs_links)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, &sb->zs_ctime, sizeof (sb->zs_ctime)); return (sa_bulk_lookup(hdl, bulk, count)); } static int zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, sa_attr_type_t *sa_table, char *buf, int len) { sa_handle_t *sa_hdl; sa_handle_t *prevhdl = NULL; dmu_buf_t *prevdb = NULL; dmu_buf_t *sa_db = NULL; char *path = buf + len - 1; int error; *path = '\0'; sa_hdl = hdl; uint64_t deleteq_obj; VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); error = zap_lookup_int(osp, deleteq_obj, obj); if (error == 0) { return (ESTALE); } else if (error != ENOENT) { return (error); } for (;;) { uint64_t pobj; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir; if (prevdb) { ASSERT3P(prevhdl, !=, NULL); zfs_release_sa_handle(prevhdl, prevdb, FTAG); } if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { (void) sprintf(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT3P(path, >=, buf); memcpy(path, component, complen); obj = pobj; if (sa_hdl != hdl) { prevhdl = sa_hdl; prevdb = sa_db; } error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); if (error != 0) { sa_hdl = prevhdl; sa_db = prevdb; break; } } if (sa_hdl != NULL && sa_hdl != hdl) { ASSERT3P(sa_db, !=, NULL); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, char *buf, int len) { char *path = buf + len - 1; sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; *path = '\0'; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_stats_impl(hdl, sa_table, sb); if (error != 0) { zfs_release_sa_handle(hdl, db, FTAG); return (error); } error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } void zfs_znode_update_vfs(znode_t *zp) { vm_object_t object; if ((object = ZTOV(zp)->v_object) == NULL || zp->z_size == object->un_pager.vnp.vnp_size) return; vnode_pager_setsize(ZTOV(zp), zp->z_size); } #ifdef _KERNEL int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t parent; int is_xattrdir; int err; /* Extended attributes should not be visible as regular files. */ if ((zp->z_pflags & ZFS_XATTR) != 0) return (SET_ERROR(EINVAL)); err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, &parent, &is_xattrdir); if (err != 0) return (err); ASSERT0(is_xattrdir); /* No name as this is a root object. */ if (parent == zp->z_id) return (SET_ERROR(EINVAL)); err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, ZFS_DIRENT_OBJ(-1ULL), buf); if (err != 0) return (err); err = zfs_zget(zfsvfs, parent, dzpp); return (err); } #endif /* _KERNEL */ diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 4cceeb8bdffd..963e7a1ec96a 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -1,1465 +1,1461 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #include #include #include #include #include #include #include #include #include #include /* * Within the scope of spl-kmem.c file the kmem_cache_* definitions * are removed to allow access to the real Linux slab allocator. */ #undef kmem_cache_destroy #undef kmem_cache_create #undef kmem_cache_alloc #undef kmem_cache_free /* * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() * with smp_mb__{before,after}_atomic() because they were redundant. This is * only used inside our SLAB allocator, so we implement an internal wrapper * here to give us smp_mb__{before,after}_atomic() on older kernels. */ #ifndef smp_mb__before_atomic #define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) #endif #ifndef smp_mb__after_atomic #define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) #endif /* BEGIN CSTYLED */ /* * Cache magazines are an optimization designed to minimize the cost of * allocating memory. They do this by keeping a per-cpu cache of recently * freed objects, which can then be reallocated without taking a lock. This * can improve performance on highly contended caches. However, because * objects in magazines will prevent otherwise empty slabs from being * immediately released this may not be ideal for low memory machines. * * For this reason spl_kmem_cache_magazine_size can be used to set a maximum * magazine size. When this value is set to 0 the magazine size will be * automatically determined based on the object size. Otherwise magazines * will be limited to 2-256 objects per magazine (i.e per cpu). Magazines * may never be entirely disabled in this implementation. */ static unsigned int spl_kmem_cache_magazine_size = 0; module_param(spl_kmem_cache_magazine_size, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_magazine_size, "Default magazine size (2-256), set automatically (0)"); /* * The default behavior is to report the number of objects remaining in the * cache. This allows the Linux VM to repeatedly reclaim objects from the * cache when memory is low satisfy other memory allocations. Alternately, * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache * is reclaimed. This may increase the likelihood of out of memory events. */ static unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; module_param(spl_kmem_cache_reclaim, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; module_param(spl_kmem_cache_obj_per_slab, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); static unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; module_param(spl_kmem_cache_max_size, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); /* * For small objects the Linux slab allocator should be used to make the most * efficient use of the memory. However, large objects are not supported by * the Linux slab and therefore the SPL implementation is preferred. A cutoff * of 16K was determined to be optimal for architectures using 4K pages and * to also work well on architecutres using larger 64K page sizes. */ static unsigned int spl_kmem_cache_slab_limit = 16384; module_param(spl_kmem_cache_slab_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_slab_limit, "Objects less than N bytes use the Linux slab"); /* * The number of threads available to allocate new slabs for caches. This * should not need to be tuned but it is available for performance analysis. */ static unsigned int spl_kmem_cache_kmem_threads = 4; module_param(spl_kmem_cache_kmem_threads, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, "Number of spl_kmem_cache threads"); /* END CSTYLED */ /* * Slab allocation interfaces * * While the Linux slab implementation was inspired by the Solaris * implementation I cannot use it to emulate the Solaris APIs. I * require two features which are not provided by the Linux slab. * * 1) Constructors AND destructors. Recent versions of the Linux * kernel have removed support for destructors. This is a deal * breaker for the SPL which contains particularly expensive * initializers for mutex's, condition variables, etc. We also * require a minimal level of cleanup for these data types unlike * many Linux data types which do need to be explicitly destroyed. * * 2) Virtual address space backed slab. Callers of the Solaris slab * expect it to work well for both small are very large allocations. * Because of memory fragmentation the Linux slab which is backed * by kmalloc'ed memory performs very badly when confronted with * large numbers of large allocations. Basing the slab on the * virtual address space removes the need for contiguous pages * and greatly improve performance for large allocations. * * For these reasons, the SPL has its own slab implementation with * the needed features. It is not as highly optimized as either the * Solaris or Linux slabs, but it should get me most of what is * needed until it can be optimized or obsoleted by another approach. * * One serious concern I do have about this method is the relatively * small virtual address space on 32bit arches. This will seriously * constrain the size of the slab caches and their performance. */ struct list_head spl_kmem_cache_list; /* List of caches */ struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ static taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); static void * kv_alloc(spl_kmem_cache_t *skc, int size, int flags) { gfp_t lflags = kmem_flags_convert(flags); void *ptr; ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); return (ptr); } static void kv_free(spl_kmem_cache_t *skc, void *ptr, int size) { ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); /* * The Linux direct reclaim path uses this out of band value to * determine if forward progress is being made. Normally this is * incremented by kmem_freepages() which is part of the various * Linux slab implementations. However, since we are using none * of that infrastructure we are responsible for incrementing it. */ if (current->reclaim_state) current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; vfree(ptr); } /* * Required space for each aligned sks. */ static inline uint32_t spl_sks_size(spl_kmem_cache_t *skc) { return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), skc->skc_obj_align, uint32_t)); } /* * Required space for each aligned object. */ static inline uint32_t spl_obj_size(spl_kmem_cache_t *skc) { uint32_t align = skc->skc_obj_align; return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); } uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache) { return (cache->skc_obj_total); } EXPORT_SYMBOL(spl_kmem_cache_inuse); uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache) { return (cache->skc_obj_size); } EXPORT_SYMBOL(spl_kmem_cache_entry_size); /* * Lookup the spl_kmem_object_t for an object given that object. */ static inline spl_kmem_obj_t * spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) { return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, skc->skc_obj_align, uint32_t)); } /* * It's important that we pack the spl_kmem_obj_t structure and the * actual objects in to one large address space to minimize the number * of calls to the allocator. It is far better to do a few large * allocations and then subdivide it ourselves. Now which allocator * we use requires balancing a few trade offs. * * For small objects we use kmem_alloc() because as long as you are * only requesting a small number of pages (ideally just one) its cheap. * However, when you start requesting multiple pages with kmem_alloc() * it gets increasingly expensive since it requires contiguous pages. * For this reason we shift to vmem_alloc() for slabs of large objects * which removes the need for contiguous pages. We do not use * vmem_alloc() in all cases because there is significant locking * overhead in __get_vm_area_node(). This function takes a single * global lock when acquiring an available virtual address range which * serializes all vmem_alloc()'s for all slab caches. Using slightly * different allocation functions for small and large objects should * give us the best of both worlds. * * +------------------------+ * | spl_kmem_slab_t --+-+ | * | skc_obj_size <-+ | | * | spl_kmem_obj_t | | * | skc_obj_size <---+ | * | spl_kmem_obj_t | | * | ... v | * +------------------------+ */ static spl_kmem_slab_t * spl_slab_alloc(spl_kmem_cache_t *skc, int flags) { spl_kmem_slab_t *sks; void *base; uint32_t obj_size; base = kv_alloc(skc, skc->skc_slab_size, flags); if (base == NULL) return (NULL); sks = (spl_kmem_slab_t *)base; sks->sks_magic = SKS_MAGIC; sks->sks_objs = skc->skc_slab_objs; sks->sks_age = jiffies; sks->sks_cache = skc; INIT_LIST_HEAD(&sks->sks_list); INIT_LIST_HEAD(&sks->sks_free_list); sks->sks_ref = 0; obj_size = spl_obj_size(skc); for (int i = 0; i < sks->sks_objs; i++) { void *obj = base + spl_sks_size(skc) + (i * obj_size); ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj); sko->sko_addr = obj; sko->sko_magic = SKO_MAGIC; sko->sko_slab = sks; INIT_LIST_HEAD(&sko->sko_list); list_add_tail(&sko->sko_list, &sks->sks_free_list); } return (sks); } /* * Remove a slab from complete or partial list, it must be called with * the 'skc->skc_lock' held but the actual free must be performed * outside the lock to prevent deadlocking on vmem addresses. */ static void spl_slab_free(spl_kmem_slab_t *sks, struct list_head *sks_list, struct list_head *sko_list) { spl_kmem_cache_t *skc; ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_ref == 0); skc = sks->sks_cache; ASSERT(skc->skc_magic == SKC_MAGIC); /* * Update slab/objects counters in the cache, then remove the * slab from the skc->skc_partial_list. Finally add the slab * and all its objects in to the private work lists where the * destructors will be called and the memory freed to the system. */ skc->skc_obj_total -= sks->sks_objs; skc->skc_slab_total--; list_del(&sks->sks_list); list_add(&sks->sks_list, sks_list); list_splice_init(&sks->sks_free_list, sko_list); } /* * Reclaim empty slabs at the end of the partial list. */ static void spl_slab_reclaim(spl_kmem_cache_t *skc) { spl_kmem_slab_t *sks = NULL, *m = NULL; spl_kmem_obj_t *sko = NULL, *n = NULL; LIST_HEAD(sks_list); LIST_HEAD(sko_list); /* * Empty slabs and objects must be moved to a private list so they * can be safely freed outside the spin lock. All empty slabs are * at the end of skc->skc_partial_list, therefore once a non-empty * slab is found we can stop scanning. */ spin_lock(&skc->skc_lock); list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, sks_list) { if (sks->sks_ref > 0) break; spl_slab_free(sks, &sks_list, &sko_list); } spin_unlock(&skc->skc_lock); /* * The following two loops ensure all the object destructors are run, * and the slabs themselves are freed. This is all done outside the * skc->skc_lock since this allows the destructor to sleep, and * allows us to perform a conditional reschedule when a freeing a * large number of objects and slabs back to the system. */ list_for_each_entry_safe(sko, n, &sko_list, sko_list) { ASSERT(sko->sko_magic == SKO_MAGIC); } list_for_each_entry_safe(sks, m, &sks_list, sks_list) { ASSERT(sks->sks_magic == SKS_MAGIC); kv_free(skc, sks, skc->skc_slab_size); } } static spl_kmem_emergency_t * spl_emergency_search(struct rb_root *root, void *obj) { struct rb_node *node = root->rb_node; spl_kmem_emergency_t *ske; unsigned long address = (unsigned long)obj; while (node) { ske = container_of(node, spl_kmem_emergency_t, ske_node); if (address < ske->ske_obj) node = node->rb_left; else if (address > ske->ske_obj) node = node->rb_right; else return (ske); } return (NULL); } static int spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) { struct rb_node **new = &(root->rb_node), *parent = NULL; spl_kmem_emergency_t *ske_tmp; unsigned long address = ske->ske_obj; while (*new) { ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); parent = *new; if (address < ske_tmp->ske_obj) new = &((*new)->rb_left); else if (address > ske_tmp->ske_obj) new = &((*new)->rb_right); else return (0); } rb_link_node(&ske->ske_node, parent, new); rb_insert_color(&ske->ske_node, root); return (1); } /* * Allocate a single emergency object and track it in a red black tree. */ static int spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) { gfp_t lflags = kmem_flags_convert(flags); spl_kmem_emergency_t *ske; int order = get_order(skc->skc_obj_size); int empty; /* Last chance use a partial slab if one now exists */ spin_lock(&skc->skc_lock); empty = list_empty(&skc->skc_partial_list); spin_unlock(&skc->skc_lock); if (!empty) return (-EEXIST); ske = kmalloc(sizeof (*ske), lflags); if (ske == NULL) return (-ENOMEM); ske->ske_obj = __get_free_pages(lflags, order); if (ske->ske_obj == 0) { kfree(ske); return (-ENOMEM); } spin_lock(&skc->skc_lock); empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); if (likely(empty)) { skc->skc_obj_total++; skc->skc_obj_emergency++; if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) skc->skc_obj_emergency_max = skc->skc_obj_emergency; } spin_unlock(&skc->skc_lock); if (unlikely(!empty)) { free_pages(ske->ske_obj, order); kfree(ske); return (-EINVAL); } *obj = (void *)ske->ske_obj; return (0); } /* * Locate the passed object in the red black tree and free it. */ static int spl_emergency_free(spl_kmem_cache_t *skc, void *obj) { spl_kmem_emergency_t *ske; int order = get_order(skc->skc_obj_size); spin_lock(&skc->skc_lock); ske = spl_emergency_search(&skc->skc_emergency_tree, obj); if (ske) { rb_erase(&ske->ske_node, &skc->skc_emergency_tree); skc->skc_obj_emergency--; skc->skc_obj_total--; } spin_unlock(&skc->skc_lock); if (ske == NULL) return (-ENOENT); free_pages(ske->ske_obj, order); kfree(ske); return (0); } /* * Release objects from the per-cpu magazine back to their slab. The flush * argument contains the max number of entries to remove from the magazine. */ static void spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) { spin_lock(&skc->skc_lock); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skm->skm_magic == SKM_MAGIC); int count = MIN(flush, skm->skm_avail); for (int i = 0; i < count; i++) spl_cache_shrink(skc, skm->skm_objs[i]); skm->skm_avail -= count; memmove(skm->skm_objs, &(skm->skm_objs[count]), sizeof (void *) * skm->skm_avail); spin_unlock(&skc->skc_lock); } /* * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, * for very small objects we may end up with more than this so as not * to waste space in the minimal allocation of a single page. */ static int spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) { uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; sks_size = spl_sks_size(skc); obj_size = spl_obj_size(skc); max_size = (spl_kmem_cache_max_size * 1024 * 1024); tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); if (tgt_size <= max_size) { tgt_objs = (tgt_size - sks_size) / obj_size; } else { tgt_objs = (max_size - sks_size) / obj_size; tgt_size = (tgt_objs * obj_size) + sks_size; } if (tgt_objs == 0) return (-ENOSPC); *objs = tgt_objs; *size = tgt_size; return (0); } /* * Make a guess at reasonable per-cpu magazine size based on the size of * each object and the cost of caching N of them in each magazine. Long * term this should really adapt based on an observed usage heuristic. */ static int spl_magazine_size(spl_kmem_cache_t *skc) { uint32_t obj_size = spl_obj_size(skc); int size; if (spl_kmem_cache_magazine_size > 0) return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2)); /* Per-magazine sizes below assume a 4Kib page size */ if (obj_size > (PAGE_SIZE * 256)) size = 4; /* Minimum 4Mib per-magazine */ else if (obj_size > (PAGE_SIZE * 32)) size = 16; /* Minimum 2Mib per-magazine */ else if (obj_size > (PAGE_SIZE)) size = 64; /* Minimum 256Kib per-magazine */ else if (obj_size > (PAGE_SIZE / 4)) size = 128; /* Minimum 128Kib per-magazine */ else size = 256; return (size); } /* * Allocate a per-cpu magazine to associate with a specific core. */ static spl_kmem_magazine_t * spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) { spl_kmem_magazine_t *skm; int size = sizeof (spl_kmem_magazine_t) + sizeof (void *) * skc->skc_mag_size; skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (skm) { skm->skm_magic = SKM_MAGIC; skm->skm_avail = 0; skm->skm_size = skc->skc_mag_size; skm->skm_refill = skc->skc_mag_refill; skm->skm_cache = skc; skm->skm_cpu = cpu; } return (skm); } /* * Free a per-cpu magazine associated with a specific core. */ static void spl_magazine_free(spl_kmem_magazine_t *skm) { ASSERT(skm->skm_magic == SKM_MAGIC); ASSERT(skm->skm_avail == 0); kfree(skm); } /* * Create all pre-cpu magazines of reasonable sizes. */ static int spl_magazine_create(spl_kmem_cache_t *skc) { int i = 0; ASSERT((skc->skc_flags & KMC_SLAB) == 0); skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) * num_possible_cpus(), kmem_flags_convert(KM_SLEEP)); skc->skc_mag_size = spl_magazine_size(skc); skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; for_each_possible_cpu(i) { skc->skc_mag[i] = spl_magazine_alloc(skc, i); if (!skc->skc_mag[i]) { for (i--; i >= 0; i--) spl_magazine_free(skc->skc_mag[i]); kfree(skc->skc_mag); return (-ENOMEM); } } return (0); } /* * Destroy all pre-cpu magazines. */ static void spl_magazine_destroy(spl_kmem_cache_t *skc) { spl_kmem_magazine_t *skm; int i = 0; ASSERT((skc->skc_flags & KMC_SLAB) == 0); for_each_possible_cpu(i) { skm = skc->skc_mag[i]; spl_cache_flush(skc, skm, skm->skm_avail); spl_magazine_free(skm); } kfree(skc->skc_mag); } /* * Create a object cache based on the following arguments: * name cache name * size cache object size * align cache object alignment * ctor cache object constructor * dtor cache object destructor * reclaim cache object reclaim * priv cache private data for ctor/dtor/reclaim * vmp unused must be NULL * flags * KMC_KVMEM Force kvmem backed SPL cache * KMC_SLAB Force Linux slab backed cache * KMC_NODEBUG Disable debugging (unsupported) */ spl_kmem_cache_t * spl_kmem_cache_create(const char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim, void *priv, void *vmp, int flags) { gfp_t lflags = kmem_flags_convert(KM_SLEEP); spl_kmem_cache_t *skc; int rc; /* * Unsupported flags */ ASSERT(vmp == NULL); ASSERT(reclaim == NULL); might_sleep(); skc = kzalloc(sizeof (*skc), lflags); if (skc == NULL) return (NULL); skc->skc_magic = SKC_MAGIC; skc->skc_name_size = strlen(name) + 1; skc->skc_name = kmalloc(skc->skc_name_size, lflags); if (skc->skc_name == NULL) { kfree(skc); return (NULL); } strlcpy(skc->skc_name, name, skc->skc_name_size); skc->skc_ctor = ctor; skc->skc_dtor = dtor; skc->skc_private = priv; skc->skc_vmp = vmp; skc->skc_linux_cache = NULL; skc->skc_flags = flags; skc->skc_obj_size = size; skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; atomic_set(&skc->skc_ref, 0); INIT_LIST_HEAD(&skc->skc_list); INIT_LIST_HEAD(&skc->skc_complete_list); INIT_LIST_HEAD(&skc->skc_partial_list); skc->skc_emergency_tree = RB_ROOT; spin_lock_init(&skc->skc_lock); init_waitqueue_head(&skc->skc_waitq); skc->skc_slab_fail = 0; skc->skc_slab_create = 0; skc->skc_slab_destroy = 0; skc->skc_slab_total = 0; skc->skc_slab_alloc = 0; skc->skc_slab_max = 0; skc->skc_obj_total = 0; skc->skc_obj_alloc = 0; skc->skc_obj_max = 0; skc->skc_obj_deadlock = 0; skc->skc_obj_emergency = 0; skc->skc_obj_emergency_max = 0; rc = percpu_counter_init_common(&skc->skc_linux_alloc, 0, GFP_KERNEL); if (rc != 0) { kfree(skc); return (NULL); } /* * Verify the requested alignment restriction is sane. */ if (align) { VERIFY(ISP2(align)); VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); VERIFY3U(align, <=, PAGE_SIZE); skc->skc_obj_align = align; } /* * When no specific type of slab is requested (kmem, vmem, or * linuxslab) then select a cache type based on the object size * and default tunables. */ if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) { if (spl_kmem_cache_slab_limit && size <= (size_t)spl_kmem_cache_slab_limit) { /* * Objects smaller than spl_kmem_cache_slab_limit can * use the Linux slab for better space-efficiency. */ skc->skc_flags |= KMC_SLAB; } else { /* * All other objects are considered large and are * placed on kvmem backed slabs. */ skc->skc_flags |= KMC_KVMEM; } } /* * Given the type of slab allocate the required resources. */ if (skc->skc_flags & KMC_KVMEM) { rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size); if (rc) goto out; rc = spl_magazine_create(skc); if (rc) goto out; } else { unsigned long slabflags = 0; - if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) { - rc = EINVAL; + if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) goto out; - } #if defined(SLAB_USERCOPY) /* * Required for PAX-enabled kernels if the slab is to be * used for copying between user and kernel space. */ slabflags |= SLAB_USERCOPY; #endif #if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY) /* * Newer grsec patchset uses kmem_cache_create_usercopy() * instead of SLAB_USERCOPY flag */ skc->skc_linux_cache = kmem_cache_create_usercopy( skc->skc_name, size, align, slabflags, 0, size, NULL); #else skc->skc_linux_cache = kmem_cache_create( skc->skc_name, size, align, slabflags, NULL); #endif - if (skc->skc_linux_cache == NULL) { - rc = ENOMEM; + if (skc->skc_linux_cache == NULL) goto out; - } } down_write(&spl_kmem_cache_sem); list_add_tail(&skc->skc_list, &spl_kmem_cache_list); up_write(&spl_kmem_cache_sem); return (skc); out: kfree(skc->skc_name); percpu_counter_destroy(&skc->skc_linux_alloc); kfree(skc); return (NULL); } EXPORT_SYMBOL(spl_kmem_cache_create); /* * Register a move callback for cache defragmentation. * XXX: Unimplemented but harmless to stub out for now. */ void spl_kmem_cache_set_move(spl_kmem_cache_t *skc, kmem_cbrc_t (move)(void *, void *, size_t, void *)) { ASSERT(move != NULL); } EXPORT_SYMBOL(spl_kmem_cache_set_move); /* * Destroy a cache and all objects associated with the cache. */ void spl_kmem_cache_destroy(spl_kmem_cache_t *skc) { DECLARE_WAIT_QUEUE_HEAD(wq); taskqid_t id; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB)); down_write(&spl_kmem_cache_sem); list_del_init(&skc->skc_list); up_write(&spl_kmem_cache_sem); /* Cancel any and wait for any pending delayed tasks */ VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); spin_lock(&skc->skc_lock); id = skc->skc_taskqid; spin_unlock(&skc->skc_lock); taskq_cancel_id(spl_kmem_cache_taskq, id); /* * Wait until all current callers complete, this is mainly * to catch the case where a low memory situation triggers a * cache reaping action which races with this destroy. */ wait_event(wq, atomic_read(&skc->skc_ref) == 0); if (skc->skc_flags & KMC_KVMEM) { spl_magazine_destroy(skc); spl_slab_reclaim(skc); } else { ASSERT(skc->skc_flags & KMC_SLAB); kmem_cache_destroy(skc->skc_linux_cache); } spin_lock(&skc->skc_lock); /* * Validate there are no objects in use and free all the * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ ASSERT3U(skc->skc_slab_alloc, ==, 0); ASSERT3U(skc->skc_obj_alloc, ==, 0); ASSERT3U(skc->skc_slab_total, ==, 0); ASSERT3U(skc->skc_obj_total, ==, 0); ASSERT3U(skc->skc_obj_emergency, ==, 0); ASSERT(list_empty(&skc->skc_complete_list)); ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0); percpu_counter_destroy(&skc->skc_linux_alloc); spin_unlock(&skc->skc_lock); kfree(skc->skc_name); kfree(skc); } EXPORT_SYMBOL(spl_kmem_cache_destroy); /* * Allocate an object from a slab attached to the cache. This is used to * repopulate the per-cpu magazine caches in batches when they run low. */ static void * spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) { spl_kmem_obj_t *sko; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(sks->sks_magic == SKS_MAGIC); sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); ASSERT(sko->sko_magic == SKO_MAGIC); ASSERT(sko->sko_addr != NULL); /* Remove from sks_free_list */ list_del_init(&sko->sko_list); sks->sks_age = jiffies; sks->sks_ref++; skc->skc_obj_alloc++; /* Track max obj usage statistics */ if (skc->skc_obj_alloc > skc->skc_obj_max) skc->skc_obj_max = skc->skc_obj_alloc; /* Track max slab usage statistics */ if (sks->sks_ref == 1) { skc->skc_slab_alloc++; if (skc->skc_slab_alloc > skc->skc_slab_max) skc->skc_slab_max = skc->skc_slab_alloc; } return (sko->sko_addr); } /* * Generic slab allocation function to run by the global work queues. * It is responsible for allocating a new slab, linking it in to the list * of partial slabs, and then waking any waiters. */ static int __spl_cache_grow(spl_kmem_cache_t *skc, int flags) { spl_kmem_slab_t *sks; fstrans_cookie_t cookie = spl_fstrans_mark(); sks = spl_slab_alloc(skc, flags); spl_fstrans_unmark(cookie); spin_lock(&skc->skc_lock); if (sks) { skc->skc_slab_total++; skc->skc_obj_total += sks->sks_objs; list_add_tail(&sks->sks_list, &skc->skc_partial_list); smp_mb__before_atomic(); clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); smp_mb__after_atomic(); } spin_unlock(&skc->skc_lock); return (sks == NULL ? -ENOMEM : 0); } static void spl_cache_grow_work(void *data) { spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; spl_kmem_cache_t *skc = ska->ska_cache; int error = __spl_cache_grow(skc, ska->ska_flags); atomic_dec(&skc->skc_ref); smp_mb__before_atomic(); clear_bit(KMC_BIT_GROWING, &skc->skc_flags); smp_mb__after_atomic(); if (error == 0) wake_up_all(&skc->skc_waitq); kfree(ska); } /* * Returns non-zero when a new slab should be available. */ static int spl_cache_grow_wait(spl_kmem_cache_t *skc) { return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); } /* * No available objects on any slabs, create a new slab. Note that this * functionality is disabled for KMC_SLAB caches which are backed by the * Linux slab. */ static int spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) { int remaining, rc = 0; ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); might_sleep(); *obj = NULL; /* * Before allocating a new slab wait for any reaping to complete and * then return so the local magazine can be rechecked for new objects. */ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, TASK_UNINTERRUPTIBLE); return (rc ? rc : -EAGAIN); } /* * Note: It would be nice to reduce the overhead of context switch * and improve NUMA locality, by trying to allocate a new slab in the * current process context with KM_NOSLEEP flag. * * However, this can't be applied to vmem/kvmem due to a bug that * spl_vmalloc() doesn't honor gfp flags in page table allocation. */ /* * This is handled by dispatching a work request to the global work * queue. This allows us to asynchronously allocate a new slab while * retaining the ability to safely fall back to a smaller synchronous * allocations to ensure forward progress is always maintained. */ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { spl_kmem_alloc_t *ska; ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); if (ska == NULL) { clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags); smp_mb__after_atomic(); wake_up_all(&skc->skc_waitq); return (-ENOMEM); } atomic_inc(&skc->skc_ref); ska->ska_cache = skc; ska->ska_flags = flags; taskq_init_ent(&ska->ska_tqe); taskq_dispatch_ent(spl_kmem_cache_taskq, spl_cache_grow_work, ska, 0, &ska->ska_tqe); } /* * The goal here is to only detect the rare case where a virtual slab * allocation has deadlocked. We must be careful to minimize the use * of emergency objects which are more expensive to track. Therefore, * we set a very long timeout for the asynchronous allocation and if * the timeout is reached the cache is flagged as deadlocked. From * this point only new emergency objects will be allocated until the * asynchronous allocation completes and clears the deadlocked flag. */ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { rc = spl_emergency_alloc(skc, flags, obj); } else { remaining = wait_event_timeout(skc->skc_waitq, spl_cache_grow_wait(skc), HZ / 10); if (!remaining) { spin_lock(&skc->skc_lock); if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); skc->skc_obj_deadlock++; } spin_unlock(&skc->skc_lock); } rc = -ENOMEM; } return (rc); } /* * Refill a per-cpu magazine with objects from the slabs for this cache. * Ideally the magazine can be repopulated using existing objects which have * been released, however if we are unable to locate enough free objects new * slabs of objects will be created. On success NULL is returned, otherwise * the address of a single emergency object is returned for use by the caller. */ static void * spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) { spl_kmem_slab_t *sks; int count = 0, rc, refill; void *obj = NULL; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skm->skm_magic == SKM_MAGIC); refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); spin_lock(&skc->skc_lock); while (refill > 0) { /* No slabs available we may need to grow the cache */ if (list_empty(&skc->skc_partial_list)) { spin_unlock(&skc->skc_lock); local_irq_enable(); rc = spl_cache_grow(skc, flags, &obj); local_irq_disable(); /* Emergency object for immediate use by caller */ if (rc == 0 && obj != NULL) return (obj); if (rc) goto out; /* Rescheduled to different CPU skm is not local */ if (skm != skc->skc_mag[smp_processor_id()]) goto out; /* * Potentially rescheduled to the same CPU but * allocations may have occurred from this CPU while * we were sleeping so recalculate max refill. */ refill = MIN(refill, skm->skm_size - skm->skm_avail); spin_lock(&skc->skc_lock); continue; } /* Grab the next available slab */ sks = list_entry((&skc->skc_partial_list)->next, spl_kmem_slab_t, sks_list); ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_ref < sks->sks_objs); ASSERT(!list_empty(&sks->sks_free_list)); /* * Consume as many objects as needed to refill the requested * cache. We must also be careful not to overfill it. */ while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) { ASSERT(skm->skm_avail < skm->skm_size); ASSERT(count < skm->skm_size); skm->skm_objs[skm->skm_avail++] = spl_cache_obj(skc, sks); } /* Move slab to skc_complete_list when full */ if (sks->sks_ref == sks->sks_objs) { list_del(&sks->sks_list); list_add(&sks->sks_list, &skc->skc_complete_list); } } spin_unlock(&skc->skc_lock); out: return (NULL); } /* * Release an object back to the slab from which it came. */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) { spl_kmem_slab_t *sks = NULL; spl_kmem_obj_t *sko = NULL; ASSERT(skc->skc_magic == SKC_MAGIC); sko = spl_sko_from_obj(skc, obj); ASSERT(sko->sko_magic == SKO_MAGIC); sks = sko->sko_slab; ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_cache == skc); list_add(&sko->sko_list, &sks->sks_free_list); sks->sks_age = jiffies; sks->sks_ref--; skc->skc_obj_alloc--; /* * Move slab to skc_partial_list when no longer full. Slabs * are added to the head to keep the partial list is quasi-full * sorted order. Fuller at the head, emptier at the tail. */ if (sks->sks_ref == (sks->sks_objs - 1)) { list_del(&sks->sks_list); list_add(&sks->sks_list, &skc->skc_partial_list); } /* * Move empty slabs to the end of the partial list so * they can be easily found and freed during reclamation. */ if (sks->sks_ref == 0) { list_del(&sks->sks_list); list_add_tail(&sks->sks_list, &skc->skc_partial_list); skc->skc_slab_alloc--; } } /* * Allocate an object from the per-cpu magazine, or if the magazine * is empty directly allocate from a slab and repopulate the magazine. */ void * spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) { spl_kmem_magazine_t *skm; void *obj = NULL; ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); /* * Allocate directly from a Linux slab. All optimizations are left * to the underlying cache we only need to guarantee that KM_SLEEP * callers will never fail. */ if (skc->skc_flags & KMC_SLAB) { struct kmem_cache *slc = skc->skc_linux_cache; do { obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); } while ((obj == NULL) && !(flags & KM_NOSLEEP)); if (obj != NULL) { /* * Even though we leave everything up to the * underlying cache we still keep track of * how many objects we've allocated in it for * better debuggability. */ percpu_counter_inc(&skc->skc_linux_alloc); } goto ret; } local_irq_disable(); restart: /* * Safe to update per-cpu structure without lock, but * in the restart case we must be careful to reacquire * the local magazine since this may have changed * when we need to grow the cache. */ skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); if (likely(skm->skm_avail)) { /* Object available in CPU cache, use it */ obj = skm->skm_objs[--skm->skm_avail]; } else { obj = spl_cache_refill(skc, skm, flags); if ((obj == NULL) && !(flags & KM_NOSLEEP)) goto restart; local_irq_enable(); goto ret; } local_irq_enable(); ASSERT(obj); ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); ret: /* Pre-emptively migrate object to CPU L1 cache */ if (obj) { if (obj && skc->skc_ctor) skc->skc_ctor(obj, skc->skc_private, flags); else prefetchw(obj); } return (obj); } EXPORT_SYMBOL(spl_kmem_cache_alloc); /* * Free an object back to the local per-cpu magazine, there is no * guarantee that this is the same magazine the object was originally * allocated from. We may need to flush entire from the magazine * back to the slabs to make space. */ void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) { spl_kmem_magazine_t *skm; unsigned long flags; int do_reclaim = 0; int do_emergency = 0; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); /* * Run the destructor */ if (skc->skc_dtor) skc->skc_dtor(obj, skc->skc_private); /* * Free the object from the Linux underlying Linux slab. */ if (skc->skc_flags & KMC_SLAB) { kmem_cache_free(skc->skc_linux_cache, obj); percpu_counter_dec(&skc->skc_linux_alloc); return; } /* * While a cache has outstanding emergency objects all freed objects * must be checked. However, since emergency objects will never use * a virtual address these objects can be safely excluded as an * optimization. */ if (!is_vmalloc_addr(obj)) { spin_lock(&skc->skc_lock); do_emergency = (skc->skc_obj_emergency > 0); spin_unlock(&skc->skc_lock); if (do_emergency && (spl_emergency_free(skc, obj) == 0)) return; } local_irq_save(flags); /* * Safe to update per-cpu structure without lock, but * no remote memory allocation tracking is being performed * it is entirely possible to allocate an object from one * CPU cache and return it to another. */ skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); /* * Per-CPU cache full, flush it to make space for this object, * this may result in an empty slab which can be reclaimed once * interrupts are re-enabled. */ if (unlikely(skm->skm_avail >= skm->skm_size)) { spl_cache_flush(skc, skm, skm->skm_refill); do_reclaim = 1; } /* Available space in cache, use it */ skm->skm_objs[skm->skm_avail++] = obj; local_irq_restore(flags); if (do_reclaim) spl_slab_reclaim(skc); } EXPORT_SYMBOL(spl_kmem_cache_free); /* * Depending on how many and which objects are released it may simply * repopulate the local magazine which will then need to age-out. Objects * which cannot fit in the magazine will be released back to their slabs * which will also need to age out before being released. This is all just * best effort and we do not want to thrash creating and destroying slabs. */ void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) { ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); if (skc->skc_flags & KMC_SLAB) return; atomic_inc(&skc->skc_ref); /* * Prevent concurrent cache reaping when contended. */ if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) goto out; /* Reclaim from the magazine and free all now empty slabs. */ unsigned long irq_flags; local_irq_save(irq_flags); spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; spl_cache_flush(skc, skm, skm->skm_avail); local_irq_restore(irq_flags); spl_slab_reclaim(skc); clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); smp_mb__after_atomic(); wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); out: atomic_dec(&skc->skc_ref); } EXPORT_SYMBOL(spl_kmem_cache_reap_now); /* * This is stubbed out for code consistency with other platforms. There * is existing logic to prevent concurrent reaping so while this is ugly * it should do no harm. */ int spl_kmem_cache_reap_active(void) { return (0); } EXPORT_SYMBOL(spl_kmem_cache_reap_active); /* * Reap all free slabs from all registered caches. */ void spl_kmem_reap(void) { spl_kmem_cache_t *skc = NULL; down_read(&spl_kmem_cache_sem); list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { spl_kmem_cache_reap_now(skc); } up_read(&spl_kmem_cache_sem); } EXPORT_SYMBOL(spl_kmem_reap); int spl_kmem_cache_init(void) { init_rwsem(&spl_kmem_cache_sem); INIT_LIST_HEAD(&spl_kmem_cache_list); spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", spl_kmem_cache_kmem_threads, maxclsyspri, spl_kmem_cache_kmem_threads * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (spl_kmem_cache_taskq == NULL) return (-ENOMEM); return (0); } void spl_kmem_cache_fini(void) { taskq_destroy(spl_kmem_cache_taskq); } diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index b863945a1c59..b4ef86a5e4a6 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -1,207 +1,206 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Thread Implementation. */ #include #include #include /* * Thread interfaces */ typedef struct thread_priv_s { unsigned long tp_magic; /* Magic */ int tp_name_size; /* Name size */ char *tp_name; /* Name (without _thread suffix) */ void (*tp_func)(void *); /* Registered function */ void *tp_args; /* Args to be passed to function */ size_t tp_len; /* Len to be passed to function */ int tp_state; /* State to start thread at */ pri_t tp_pri; /* Priority to start threat at */ } thread_priv_t; static int thread_generic_wrapper(void *arg) { thread_priv_t *tp = (thread_priv_t *)arg; void (*func)(void *); void *args; ASSERT(tp->tp_magic == TP_MAGIC); func = tp->tp_func; args = tp->tp_args; set_current_state(tp->tp_state); set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri)); kmem_free(tp->tp_name, tp->tp_name_size); kmem_free(tp, sizeof (thread_priv_t)); if (func) func(args); return (0); } /* * thread_create() may block forever if it cannot create a thread or * allocate memory. This is preferable to returning a NULL which Solaris * style callers likely never check for... since it can't fail. */ kthread_t * __thread_create(caddr_t stk, size_t stksize, thread_func_t func, const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri) { thread_priv_t *tp; struct task_struct *tsk; char *p; /* Option pp is simply ignored */ /* Variable stack size unsupported */ ASSERT(stk == NULL); tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE); if (tp == NULL) return (NULL); tp->tp_magic = TP_MAGIC; tp->tp_name_size = strlen(name) + 1; tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE); if (tp->tp_name == NULL) { kmem_free(tp, sizeof (thread_priv_t)); return (NULL); } strlcpy(tp->tp_name, name, tp->tp_name_size); /* * Strip trailing "_thread" from passed name which will be the func * name since the exposed API has no parameter for passing a name. */ p = strstr(tp->tp_name, "_thread"); if (p) p[0] = '\0'; tp->tp_func = func; tp->tp_args = args; tp->tp_len = len; tp->tp_state = state; tp->tp_pri = pri; tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp, "%s", tp->tp_name); if (IS_ERR(tsk)) return (NULL); wake_up_process(tsk); return ((kthread_t *)tsk); } EXPORT_SYMBOL(__thread_create); /* * spl_kthread_create - Wrapper providing pre-3.13 semantics for * kthread_create() in which it is not killable and less likely * to return -ENOMEM. */ struct task_struct * spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) { struct task_struct *tsk; va_list args; char name[TASK_COMM_LEN]; va_start(args, namefmt); vsnprintf(name, sizeof (name), namefmt, args); va_end(args); do { tsk = kthread_create(func, data, "%s", name); if (IS_ERR(tsk)) { if (signal_pending(current)) { clear_thread_flag(TIF_SIGPENDING); continue; } if (PTR_ERR(tsk) == -ENOMEM) continue; return (NULL); } else { return (tsk); } } while (1); } EXPORT_SYMBOL(spl_kthread_create); /* * The "why" argument indicates the allowable side-effects of the call: * * FORREAL: Extract the next pending signal from p_sig into p_cursig; * stop the process if a stop has been requested or if a traced signal * is pending. * * JUSTLOOKING: Don't stop the process, just indicate whether or not * a signal might be pending (FORREAL is needed to tell for sure). */ int issig(int why) { ASSERT(why == FORREAL || why == JUSTLOOKING); if (!signal_pending(current)) return (0); if (why != FORREAL) return (1); struct task_struct *task = current; spl_kernel_siginfo_t __info; sigset_t set; siginitsetinv(&set, 1ULL << (SIGSTOP - 1) | 1ULL << (SIGTSTP - 1)); sigorsets(&set, &task->blocked, &set); spin_lock_irq(&task->sighand->siglock); - int ret; #ifdef HAVE_DEQUEUE_SIGNAL_4ARG enum pid_type __type; - if ((ret = dequeue_signal(task, &set, &__info, &__type)) != 0) { + if (dequeue_signal(task, &set, &__info, &__type) != 0) { #else - if ((ret = dequeue_signal(task, &set, &__info)) != 0) { + if (dequeue_signal(task, &set, &__info) != 0) { #endif #ifdef HAVE_SIGNAL_STOP spin_unlock_irq(&task->sighand->siglock); kernel_signal_stop(); #else if (current->jobctl & JOBCTL_STOP_DEQUEUED) spl_set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); schedule(); #endif return (0); } spin_unlock_irq(&task->sighand->siglock); return (1); } EXPORT_SYMBOL(issig); diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 2b5c4956dd02..925ee9d9fe9c 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1,1059 +1,1056 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LINUX_BLK_CGROUP_HEADER #include #endif typedef struct vdev_disk { struct block_device *vd_bdev; krwlock_t vd_lock; } vdev_disk_t; /* * Unique identifier for the exclusive vdev holder. */ static void *zfs_vdev_holder = VDEV_HOLDER; /* * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the * device is missing. The missing path may be transient since the links * can be briefly removed and recreated in response to udev events. */ static uint_t zfs_vdev_open_timeout_ms = 1000; /* * Size of the "reserved" partition, in blocks. */ #define EFI_MIN_RESV_SIZE (16 * 1024) /* * Virtual device vector for disks. */ typedef struct dio_request { zio_t *dr_zio; /* Parent ZIO */ atomic_t dr_ref; /* References */ int dr_error; /* Bio error */ int dr_bio_count; /* Count of bio's */ struct bio *dr_bio[]; /* Attached bio's */ } dio_request_t; /* * BIO request failfast mask. */ static unsigned int zfs_vdev_failfast_mask = 1; static fmode_t vdev_bdev_mode(spa_mode_t spa_mode) { fmode_t mode = 0; if (spa_mode & SPA_MODE_READ) mode |= FMODE_READ; if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; return (mode); } /* * Returns the usable capacity (in bytes) for the partition or disk. */ static uint64_t bdev_capacity(struct block_device *bdev) { return (i_size_read(bdev->bd_inode)); } #if !defined(HAVE_BDEV_WHOLE) static inline struct block_device * bdev_whole(struct block_device *bdev) { return (bdev->bd_contains); } #endif #if defined(HAVE_BDEVNAME) #define vdev_bdevname(bdev, name) bdevname(bdev, name) #else static inline void vdev_bdevname(struct block_device *bdev, char *name) { snprintf(name, BDEVNAME_SIZE, "%pg", bdev); } #endif /* * Returns the maximum expansion capacity of the block device (in bytes). * * It is possible to expand a vdev when it has been created as a wholedisk * and the containing block device has increased in capacity. Or when the * partition containing the pool has been manually increased in size. * * This function is only responsible for calculating the potential expansion * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is * responsible for verifying the expected partition layout in the wholedisk * case, and updating the partition table if appropriate. Once the partition * size has been increased the additional capacity will be visible using * bdev_capacity(). * * The returned maximum expansion capacity is always expected to be larger, or * at the very least equal, to its usable capacity to prevent overestimating * the pool expandsize. */ static uint64_t bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) { uint64_t psize; int64_t available; if (wholedisk && bdev != bdev_whole(bdev)) { /* * When reporting maximum expansion capacity for a wholedisk * deduct any capacity which is expected to be lost due to * alignment restrictions. Over reporting this value isn't * harmful and would only result in slightly less capacity * than expected post expansion. * The estimated available space may be slightly smaller than * bdev_capacity() for devices where the number of sectors is * not a multiple of the alignment size and the partition layout * is keeping less than PARTITION_END_ALIGNMENT bytes after the * "reserved" EFI partition: in such cases return the device * usable capacity. */ available = i_size_read(bdev_whole(bdev)->bd_inode) - ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + PARTITION_END_ALIGNMENT) << SECTOR_BITS); psize = MAX(available, bdev_capacity(bdev)); } else { psize = bdev_capacity(bdev); } return (psize); } static void vdev_disk_error(zio_t *zio) { /* * This function can be called in interrupt context, for instance while * handling IRQs coming from a misbehaving disk device; use printk() * which is safe from any context. */ printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), zio->io_vd->vdev_path, zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, zio->io_flags); } static void vdev_disk_kobj_evt_post(vdev_t *v) { vdev_disk_t *vd = v->vdev_tsd; if (vd && vd->vd_bdev) { spl_signal_kobj_evt(vd->vd_bdev); } else { vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", v->vdev_path); } } static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct block_device *bdev; fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; /* Must have a pathname and it must be absolute. */ if (v->vdev_path == NULL || v->vdev_path[0] != '/') { v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; vdev_dbgmsg(v, "invalid vdev_path"); return (SET_ERROR(EINVAL)); } /* * Reopen the device if it is currently open. When expanding a * partition force re-scanning the partition table if userland * did not take care of this already. We need to do this while closed * in order to get an accurate updated block device size. Then * since udev may need to recreate the device links increase the * open retry timeout before reporting the device as unavailable. */ vd = v->vdev_tsd; if (vd) { char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; boolean_t reread_part = B_FALSE; rw_enter(&vd->vd_lock, RW_WRITER); bdev = vd->vd_bdev; vd->vd_bdev = NULL; if (bdev) { if (v->vdev_expanding && bdev != bdev_whole(bdev)) { vdev_bdevname(bdev_whole(bdev), disk_name + 5); /* * If userland has BLKPG_RESIZE_PARTITION, * then it should have updated the partition * table already. We can detect this by * comparing our current physical size * with that of the device. If they are * the same, then we must not have * BLKPG_RESIZE_PARTITION or it failed to * update the partition table online. We * fallback to rescanning the partition * table from the kernel below. However, * if the capacity already reflects the * updated partition, then we skip * rescanning the partition table here. */ if (v->vdev_psize == bdev_capacity(bdev)) reread_part = B_TRUE; } blkdev_put(bdev, mode | FMODE_EXCL); } if (reread_part) { bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL, zfs_vdev_holder); if (!IS_ERR(bdev)) { int error = vdev_bdev_reread_part(bdev); blkdev_put(bdev, mode | FMODE_EXCL); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); } } } } else { vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); rw_enter(&vd->vd_lock, RW_WRITER); } /* * Devices are always opened by the path provided at configuration * time. This means that if the provided path is a udev by-id path * then drives may be re-cabled without an issue. If the provided * path is a udev by-path path, then the physical location information * will be preserved. This can be critical for more complicated * configurations where drives are located in specific physical * locations to maximize the systems tolerance to component failure. * * Alternatively, you can provide your own udev rule to flexibly map * the drives as you see fit. It is not advised that you use the * /dev/[hd]d devices which may be reordered due to probing order. * Devices in the wrong locations will be detected by the higher * level vdev validation. * * The specified paths may be briefly removed and recreated in * response to udev events. This should be exceptionally unlikely * because the zpool command makes every effort to verify these paths * have already settled prior to reaching this point. Therefore, * a ENOENT failure at this point is highly likely to be transient * and it is reasonable to sleep and retry before giving up. In * practice delays have been observed to be on the order of 100ms. * * When ERESTARTSYS is returned it indicates the block device is * a zvol which could not be opened due to the deadlock detection * logic in zvol_open(). Extend the timeout and retry the open * subsequent attempts are expected to eventually succeed. */ hrtime_t start = gethrtime(); bdev = ERR_PTR(-ENXIO); while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, zfs_vdev_holder); if (unlikely(PTR_ERR(bdev) == -ENOENT)) { /* * There is no point of waiting since device is removed * explicitly */ if (v->vdev_removed) break; schedule_timeout(MSEC_TO_TICK(10)); } else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); continue; } else if (IS_ERR(bdev)) { break; } } if (IS_ERR(bdev)) { int error = -PTR_ERR(bdev); vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, (u_longlong_t)(gethrtime() - start), (u_longlong_t)timeout); vd->vd_bdev = NULL; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); return (SET_ERROR(error)); } else { vd->vd_bdev = bdev; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); } /* Determine the physical block size */ int physical_block_size = bdev_physical_block_size(vd->vd_bdev); /* Determine the logical block size */ int logical_block_size = bdev_logical_block_size(vd->vd_bdev); /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ v->vdev_nowritecache = B_FALSE; /* Set when device reports it supports TRIM. */ v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev); /* Set when device reports it supports secure TRIM. */ v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev); /* Inform the ZIO pipeline that we are non-rotational */ v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev)); /* Physical volume size in bytes for the partition */ *psize = bdev_capacity(vd->vd_bdev); /* Physical volume size in bytes including possible expansion space */ *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); /* Based on the minimum sector size set the block size */ *physical_ashift = highbit64(MAX(physical_block_size, SPA_MINBLOCKSIZE)) - 1; *logical_ashift = highbit64(MAX(logical_block_size, SPA_MINBLOCKSIZE)) - 1; return (0); } static void vdev_disk_close(vdev_t *v) { vdev_disk_t *vd = v->vdev_tsd; if (v->vdev_reopening || vd == NULL) return; if (vd->vd_bdev != NULL) { blkdev_put(vd->vd_bdev, vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); v->vdev_tsd = NULL; } static dio_request_t * vdev_disk_dio_alloc(int bio_count) { dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + sizeof (struct bio *) * bio_count, KM_SLEEP); atomic_set(&dr->dr_ref, 0); dr->dr_bio_count = bio_count; dr->dr_error = 0; for (int i = 0; i < dr->dr_bio_count; i++) dr->dr_bio[i] = NULL; return (dr); } static void vdev_disk_dio_free(dio_request_t *dr) { int i; for (i = 0; i < dr->dr_bio_count; i++) if (dr->dr_bio[i]) bio_put(dr->dr_bio[i]); kmem_free(dr, sizeof (dio_request_t) + sizeof (struct bio *) * dr->dr_bio_count); } static void vdev_disk_dio_get(dio_request_t *dr) { atomic_inc(&dr->dr_ref); } -static int +static void vdev_disk_dio_put(dio_request_t *dr) { int rc = atomic_dec_return(&dr->dr_ref); /* * Free the dio_request when the last reference is dropped and * ensure zio_interpret is called only once with the correct zio */ if (rc == 0) { zio_t *zio = dr->dr_zio; int error = dr->dr_error; vdev_disk_dio_free(dr); if (zio) { zio->io_error = error; ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); zio_delay_interrupt(zio); } } - - return (rc); } BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) { dio_request_t *dr = bio->bi_private; - int rc; if (dr->dr_error == 0) { #ifdef HAVE_1ARG_BIO_END_IO_T dr->dr_error = BIO_END_IO_ERROR(bio); #else if (error) dr->dr_error = -(error); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) dr->dr_error = EIO; #endif } /* Drop reference acquired by __vdev_disk_physio */ - rc = vdev_disk_dio_put(dr); + vdev_disk_dio_put(dr); } static inline void vdev_submit_bio_impl(struct bio *bio) { #ifdef HAVE_1ARG_SUBMIT_BIO (void) submit_bio(bio); #else (void) submit_bio(bio_data_dir(bio), bio); #endif } /* * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so * replace it with preempt_schedule under the following condition: */ #if defined(CONFIG_ARM64) && \ defined(CONFIG_PREEMPTION) && \ defined(CONFIG_BLK_CGROUP) #define preempt_schedule_notrace(x) preempt_schedule(x) #endif /* * As for the Linux 5.18 kernel bio_alloc() expects a block_device struct * as an argument removing the need to set it with bio_set_dev(). This * removes the need for all of the following compatibility code. */ #if !defined(HAVE_BIO_ALLOC_4ARG) #ifdef HAVE_BIO_SET_DEV #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) /* * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). * As a side effect the function was converted to GPL-only. Define our * own version when needed which uses rcu_read_lock_sched(). * * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public * part, moving blkg_tryget into the private one. Define our own version. */ #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET) static inline bool vdev_blkg_tryget(struct blkcg_gq *blkg) { struct percpu_ref *ref = &blkg->refcnt; unsigned long __percpu *count; bool rc; rcu_read_lock_sched(); if (__ref_is_percpu(ref, &count)) { this_cpu_inc(*count); rc = true; } else { #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA rc = atomic_long_inc_not_zero(&ref->data->count); #else rc = atomic_long_inc_not_zero(&ref->count); #endif } rcu_read_unlock_sched(); return (rc); } #else #define vdev_blkg_tryget(bg) blkg_tryget(bg) #endif #ifdef HAVE_BIO_SET_DEV_MACRO /* * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the * GPL-only bio_associate_blkg() symbol thus inadvertently converting * the entire macro. Provide a minimal version which always assigns the * request queue's root_blkg to the bio. */ static inline void vdev_bio_associate_blkg(struct bio *bio) { #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif ASSERT3P(q, !=, NULL); ASSERT3P(bio->bi_blkg, ==, NULL); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; } #define bio_associate_blkg vdev_bio_associate_blkg #else static inline void vdev_bio_set_dev(struct bio *bio, struct block_device *bdev) { #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif bio_clear_flag(bio, BIO_REMAPPED); if (bio->bi_bdev != bdev) bio_clear_flag(bio, BIO_THROTTLED); bio->bi_bdev = bdev; ASSERT3P(q, !=, NULL); ASSERT3P(bio->bi_blkg, ==, NULL); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; } #define bio_set_dev vdev_bio_set_dev #endif #endif #else /* * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. */ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) { bio->bi_bdev = bdev; } #endif /* HAVE_BIO_SET_DEV */ #endif /* !HAVE_BIO_ALLOC_4ARG */ static inline void vdev_submit_bio(struct bio *bio) { struct bio_list *bio_list = current->bio_list; current->bio_list = NULL; vdev_submit_bio_impl(bio); current->bio_list = bio_list; } static inline struct bio * vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, unsigned short nr_vecs) { struct bio *bio; #ifdef HAVE_BIO_ALLOC_4ARG bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask); #else bio = bio_alloc(gfp_mask, nr_vecs); if (likely(bio != NULL)) bio_set_dev(bio, bdev); #endif return (bio); } static inline unsigned int vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) { unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset); #ifdef HAVE_BIO_MAX_SEGS return (bio_max_segs(nr_segs)); #else return (MIN(nr_segs, BIO_MAX_PAGES)); #endif } static int __vdev_disk_physio(struct block_device *bdev, zio_t *zio, size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; int bio_size; int bio_count = 16; int error = 0; struct blk_plug plug; unsigned short nr_vecs; /* * Accessing outside the block device is never allowed. */ if (io_offset + io_size > bdev->bd_inode->i_size) { vdev_dbgmsg(zio->io_vd, "Illegal access %llu size %llu, device size %llu", (u_longlong_t)io_offset, (u_longlong_t)io_size, (u_longlong_t)i_size_read(bdev->bd_inode)); return (SET_ERROR(EIO)); } retry: dr = vdev_disk_dio_alloc(bio_count); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && zio->io_vd->vdev_failfast == B_TRUE) { bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); } dr->dr_zio = zio; /* * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio * can cover at least 128KB and at most 1MB. When the required number * of iovec's exceeds this, we are forced to break the IO in multiple * bio's and wait for them all to complete. This is likely if the * recordsize property is increased beyond 1MB. The default * bio_count=16 should typically accommodate the maximum-size zio of * 16MB. */ abd_offset = 0; bio_offset = io_offset; bio_size = io_size; for (int i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ if (bio_size <= 0) break; /* * If additional bio's are required, we have to retry, but * this should be rare - see the comment above. */ if (dr->dr_bio_count == i) { vdev_disk_dio_free(dr); bio_count *= 2; goto retry; } nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { vdev_disk_dio_free(dr); return (SET_ERROR(ENOMEM)); } /* Matching put called by vdev_disk_physio_completion */ vdev_disk_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } /* Extra reference to protect dio_request during vdev_submit_bio */ vdev_disk_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); /* Submit all bio's associated with this dio */ for (int i = 0; i < dr->dr_bio_count; i++) { if (dr->dr_bio[i]) vdev_submit_bio(dr->dr_bio[i]); } if (dr->dr_bio_count > 1) blk_finish_plug(&plug); - (void) vdev_disk_dio_put(dr); + vdev_disk_dio_put(dr); return (error); } BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; #ifdef HAVE_1ARG_BIO_END_IO_T zio->io_error = BIO_END_IO_ERROR(bio); #else zio->io_error = -error; #endif if (zio->io_error && (zio->io_error == EOPNOTSUPP)) zio->io_vd->vdev_nowritecache = B_TRUE; bio_put(bio); ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); zio_interrupt(zio); } static int vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) { struct request_queue *q; struct bio *bio; q = bdev_get_queue(bdev); if (!q) return (SET_ERROR(ENXIO)); bio = vdev_bio_alloc(bdev, GFP_NOIO, 0); if (unlikely(bio == NULL)) return (SET_ERROR(ENOMEM)); bio->bi_end_io = vdev_disk_io_flush_completion; bio->bi_private = zio; bio_set_flush(bio); vdev_submit_bio(bio); invalidate_bdev(bdev); return (0); } static int vdev_disk_io_trim(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) if (zio->io_trim_flags & ZIO_TRIM_SECURE) { return (-blkdev_issue_secure_erase(vd->vd_bdev, zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); } else { return (-blkdev_issue_discard(vd->vd_bdev, zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); } #elif defined(HAVE_BLKDEV_ISSUE_DISCARD) unsigned long trim_flags = 0; #if defined(BLKDEV_DISCARD_SECURE) if (zio->io_trim_flags & ZIO_TRIM_SECURE) trim_flags |= BLKDEV_DISCARD_SECURE; #endif return (-blkdev_issue_discard(vd->vd_bdev, zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); #else #error "Unsupported kernel" #endif } static void vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; int rw, error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. * Nothing to be done here but return failure. */ if (vd == NULL) { zio->io_error = ENXIO; zio_interrupt(zio); return; } rw_enter(&vd->vd_lock, RW_READER); /* * If the vdev is closed, it's likely due to a failed reopen and is * in the UNAVAIL state. Nothing to be done here but return failure. */ if (vd->vd_bdev == NULL) { rw_exit(&vd->vd_lock); zio->io_error = ENXIO; zio_interrupt(zio); return; } switch (zio->io_type) { case ZIO_TYPE_IOCTL: if (!vdev_readable(v)) { rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush) break; if (v->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } error = vdev_disk_io_flush(vd->vd_bdev, zio); if (error == 0) { rw_exit(&vd->vd_lock); return; } zio->io_error = error; break; default: zio->io_error = SET_ERROR(ENOTSUP); } rw_exit(&vd->vd_lock); zio_execute(zio); return; case ZIO_TYPE_WRITE: rw = WRITE; break; case ZIO_TYPE_READ: rw = READ; break; case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); zio_interrupt(zio); return; default: rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } zio->io_target_timestamp = zio_handle_io_delay(zio); error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_size, zio->io_offset, rw, 0); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; zio_interrupt(zio); return; } } static void vdev_disk_io_done(zio_t *zio) { /* * If the device returned EIO, we revalidate the media. If it is * determined the media has changed this triggers the asynchronous * removal of the device from the configuration. */ if (zio->io_error == EIO) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; if (!zfs_check_disk_status(vd->vd_bdev)) { invalidate_bdev(vd->vd_bdev); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } } } static void vdev_disk_hold(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') return; /* * Only prefetch path and devid info if the device has * never been opened. */ if (vd->vdev_tsd != NULL) return; } static void vdev_disk_rele(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* XXX: Implement me as a vnode rele for the device */ } vdev_ops_t vdev_disk_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, .vdev_op_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, .vdev_op_io_done = vdev_disk_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = vdev_disk_hold, .vdev_op_rele = vdev_disk_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE, /* leaf vdev */ .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post }; /* * The zfs_vdev_scheduler module option has been deprecated. Setting this * value no longer has any effect. It has not yet been entirely removed * to allow the module to be loaded if this option is specified in the * /etc/modprobe.d/zfs.conf file. The following warning will be logged. */ static int param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) { int error = param_set_charp(val, kp); if (error == 0) { printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " "is not supported.\n"); } return (error); } static const char *zfs_vdev_scheduler = "unused"; module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, param_get_charp, &zfs_vdev_scheduler, 0644); MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); int param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { uint_t val; int error; error = kstrtouint(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) return (SET_ERROR(-EINVAL)); error = param_set_uint(buf, kp); if (error < 0) return (SET_ERROR(error)); return (0); } int param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { uint_t val; int error; error = kstrtouint(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) return (SET_ERROR(-EINVAL)); error = param_set_uint(buf, kp); if (error < 0) return (SET_ERROR(error)); return (0); } ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, "Timeout before determining that a device is missing"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index 75638399ef03..db1bb9577197 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -1,3041 +1,3038 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE #define DENY ACE_ACCESS_DENIED_ACE_TYPE #define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE #define MIN_ACE_TYPE ALLOW #define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) #define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) #define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) #define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) #define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ ACE_DELETE|ACE_DELETE_CHILD) #define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) #define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) #define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) #define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) #define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ ZFS_ACL_PROTECTED) #define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ ZFS_ACL_OBJ_ACE) #define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) #define IDMAP_WK_CREATOR_OWNER_UID 2147483648U static uint16_t zfs_ace_v0_get_type(void *acep) { return (((zfs_oldace_t *)acep)->z_type); } static uint16_t zfs_ace_v0_get_flags(void *acep) { return (((zfs_oldace_t *)acep)->z_flags); } static uint32_t zfs_ace_v0_get_mask(void *acep) { return (((zfs_oldace_t *)acep)->z_access_mask); } static uint64_t zfs_ace_v0_get_who(void *acep) { return (((zfs_oldace_t *)acep)->z_fuid); } static void zfs_ace_v0_set_type(void *acep, uint16_t type) { ((zfs_oldace_t *)acep)->z_type = type; } static void zfs_ace_v0_set_flags(void *acep, uint16_t flags) { ((zfs_oldace_t *)acep)->z_flags = flags; } static void zfs_ace_v0_set_mask(void *acep, uint32_t mask) { ((zfs_oldace_t *)acep)->z_access_mask = mask; } static void zfs_ace_v0_set_who(void *acep, uint64_t who) { ((zfs_oldace_t *)acep)->z_fuid = who; } static size_t zfs_ace_v0_size(void *acep) { (void) acep; return (sizeof (zfs_oldace_t)); } static size_t zfs_ace_v0_abstract_size(void) { return (sizeof (zfs_oldace_t)); } static int zfs_ace_v0_mask_off(void) { return (offsetof(zfs_oldace_t, z_access_mask)); } static int zfs_ace_v0_data(void *acep, void **datap) { (void) acep; *datap = NULL; return (0); } static const acl_ops_t zfs_acl_v0_ops = { .ace_mask_get = zfs_ace_v0_get_mask, .ace_mask_set = zfs_ace_v0_set_mask, .ace_flags_get = zfs_ace_v0_get_flags, .ace_flags_set = zfs_ace_v0_set_flags, .ace_type_get = zfs_ace_v0_get_type, .ace_type_set = zfs_ace_v0_set_type, .ace_who_get = zfs_ace_v0_get_who, .ace_who_set = zfs_ace_v0_set_who, .ace_size = zfs_ace_v0_size, .ace_abstract_size = zfs_ace_v0_abstract_size, .ace_mask_off = zfs_ace_v0_mask_off, .ace_data = zfs_ace_v0_data }; static uint16_t zfs_ace_fuid_get_type(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_type); } static uint16_t zfs_ace_fuid_get_flags(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_flags); } static uint32_t zfs_ace_fuid_get_mask(void *acep) { return (((zfs_ace_hdr_t *)acep)->z_access_mask); } static uint64_t zfs_ace_fuid_get_who(void *args) { uint16_t entry_type; zfs_ace_t *acep = args; entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return (-1); return (((zfs_ace_t *)acep)->z_fuid); } static void zfs_ace_fuid_set_type(void *acep, uint16_t type) { ((zfs_ace_hdr_t *)acep)->z_type = type; } static void zfs_ace_fuid_set_flags(void *acep, uint16_t flags) { ((zfs_ace_hdr_t *)acep)->z_flags = flags; } static void zfs_ace_fuid_set_mask(void *acep, uint32_t mask) { ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; } static void zfs_ace_fuid_set_who(void *arg, uint64_t who) { zfs_ace_t *acep = arg; uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return; acep->z_fuid = who; } static size_t zfs_ace_fuid_size(void *acep) { zfs_ace_hdr_t *zacep = acep; uint16_t entry_type; switch (zacep->z_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: return (sizeof (zfs_object_ace_t)); case ALLOW: case DENY: entry_type = (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE) return (sizeof (zfs_ace_hdr_t)); zfs_fallthrough; default: return (sizeof (zfs_ace_t)); } } static size_t zfs_ace_fuid_abstract_size(void) { return (sizeof (zfs_ace_hdr_t)); } static int zfs_ace_fuid_mask_off(void) { return (offsetof(zfs_ace_hdr_t, z_access_mask)); } static int zfs_ace_fuid_data(void *acep, void **datap) { zfs_ace_t *zacep = acep; zfs_object_ace_t *zobjp; switch (zacep->z_hdr.z_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: zobjp = acep; *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); default: *datap = NULL; return (0); } } static const acl_ops_t zfs_acl_fuid_ops = { .ace_mask_get = zfs_ace_fuid_get_mask, .ace_mask_set = zfs_ace_fuid_set_mask, .ace_flags_get = zfs_ace_fuid_get_flags, .ace_flags_set = zfs_ace_fuid_set_flags, .ace_type_get = zfs_ace_fuid_get_type, .ace_type_set = zfs_ace_fuid_set_type, .ace_who_get = zfs_ace_fuid_get_who, .ace_who_set = zfs_ace_fuid_set_who, .ace_size = zfs_ace_fuid_size, .ace_abstract_size = zfs_ace_fuid_abstract_size, .ace_mask_off = zfs_ace_fuid_mask_off, .ace_data = zfs_ace_fuid_data }; /* * The following three functions are provided for compatibility with * older ZPL version in order to determine if the file use to have * an external ACL and what version of ACL previously existed on the * file. Would really be nice to not need this, sigh. */ uint64_t zfs_external_acl(znode_t *zp) { zfs_acl_phys_t acl_phys; int error; if (zp->z_is_sa) return (0); /* * Need to deal with a potential * race where zfs_sa_upgrade could cause * z_isa_sa to change. * * If the lookup fails then the state of z_is_sa should have * changed. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)), &acl_phys, sizeof (acl_phys))) == 0) return (acl_phys.z_acl_extern_obj); else { /* * after upgrade the SA_ZPL_ZNODE_ACL should have been * removed */ VERIFY(zp->z_is_sa && error == ENOENT); return (0); } } /* * Determine size of ACL in bytes * * This is more complicated than it should be since we have to deal * with old external ACLs. */ static int zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, zfs_acl_phys_t *aclphys) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t acl_count; int size; int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if (zp->z_is_sa) { if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), &size)) != 0) return (error); *aclsize = size; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), &acl_count, sizeof (acl_count))) != 0) return (error); *aclcount = acl_count; } else { if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), aclphys, sizeof (*aclphys))) != 0) return (error); if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); *aclcount = aclphys->z_acl_size; } else { *aclsize = aclphys->z_acl_size; *aclcount = aclphys->z_acl_count; } } return (0); } int zfs_znode_acl_version(znode_t *zp) { zfs_acl_phys_t acl_phys; if (zp->z_is_sa) return (ZFS_ACL_VERSION_FUID); else { int error; /* * Need to deal with a potential * race where zfs_sa_upgrade could cause * z_isa_sa to change. * * If the lookup fails then the state of z_is_sa should have * changed. */ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)), &acl_phys, sizeof (acl_phys))) == 0) return (acl_phys.z_acl_version); else { /* * After upgrade SA_ZPL_ZNODE_ACL should have * been removed. */ VERIFY(zp->z_is_sa && error == ENOENT); return (ZFS_ACL_VERSION_FUID); } } } static int zfs_acl_version(int version) { if (version < ZPL_VERSION_FUID) return (ZFS_ACL_VERSION_INITIAL); else return (ZFS_ACL_VERSION_FUID); } static int zfs_acl_version_zp(znode_t *zp) { return (zfs_acl_version(ZTOZSB(zp)->z_version)); } zfs_acl_t * zfs_acl_alloc(int vers) { zfs_acl_t *aclp; aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), offsetof(zfs_acl_node_t, z_next)); aclp->z_version = vers; if (vers == ZFS_ACL_VERSION_FUID) aclp->z_ops = &zfs_acl_fuid_ops; else aclp->z_ops = &zfs_acl_v0_ops; return (aclp); } zfs_acl_node_t * zfs_acl_node_alloc(size_t bytes) { zfs_acl_node_t *aclnode; aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); if (bytes) { aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); aclnode->z_allocdata = aclnode->z_acldata; aclnode->z_allocsize = bytes; aclnode->z_size = bytes; } return (aclnode); } static void zfs_acl_node_free(zfs_acl_node_t *aclnode) { if (aclnode->z_allocsize) kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); kmem_free(aclnode, sizeof (zfs_acl_node_t)); } static void zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; while ((aclnode = list_head(&aclp->z_acl))) { list_remove(&aclp->z_acl, aclnode); zfs_acl_node_free(aclnode); } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } void zfs_acl_free(zfs_acl_t *aclp) { zfs_acl_release_nodes(aclp); list_destroy(&aclp->z_acl); kmem_free(aclp, sizeof (zfs_acl_t)); } static boolean_t zfs_acl_valid_ace_type(uint_t type, uint_t flags) { uint16_t entry_type; switch (type) { case ALLOW: case DENY: case ACE_SYSTEM_AUDIT_ACE_TYPE: case ACE_SYSTEM_ALARM_ACE_TYPE: entry_type = flags & ACE_TYPE_FLAGS; return (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE || entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP); default: if (type <= MAX_ACE_TYPE) return (B_TRUE); } return (B_FALSE); } static boolean_t zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) { /* * first check type of entry */ if (!zfs_acl_valid_ace_type(type, iflags)) return (B_FALSE); switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: if (aclp->z_version < ZFS_ACL_VERSION_FUID) return (B_FALSE); aclp->z_hints |= ZFS_ACL_OBJ_ACE; } /* * next check inheritance level flags */ if (S_ISDIR(obj_mode) && (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) aclp->z_hints |= ZFS_INHERIT_ACE; if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { if ((iflags & (ACE_FILE_INHERIT_ACE| ACE_DIRECTORY_INHERIT_ACE)) == 0) { return (B_FALSE); } } return (B_TRUE); } static void * zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, uint32_t *access_mask, uint16_t *iflags, uint16_t *type) { zfs_acl_node_t *aclnode; ASSERT(aclp); if (start == NULL) { aclnode = list_head(&aclp->z_acl); if (aclnode == NULL) return (NULL); aclp->z_next_ace = aclnode->z_acldata; aclp->z_curr_node = aclnode; aclnode->z_ace_idx = 0; } aclnode = aclp->z_curr_node; if (aclnode == NULL) return (NULL); if (aclnode->z_ace_idx >= aclnode->z_ace_count) { aclnode = list_next(&aclp->z_acl, aclnode); if (aclnode == NULL) return (NULL); else { aclp->z_curr_node = aclnode; aclnode->z_ace_idx = 0; aclp->z_next_ace = aclnode->z_acldata; } } if (aclnode->z_ace_idx < aclnode->z_ace_count) { void *acep = aclp->z_next_ace; size_t ace_size; /* * Make sure we don't overstep our bounds */ ace_size = aclp->z_ops->ace_size(acep); if (((caddr_t)acep + ace_size) > ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { return (NULL); } *iflags = aclp->z_ops->ace_flags_get(acep); *type = aclp->z_ops->ace_type_get(acep); *access_mask = aclp->z_ops->ace_mask_get(acep); *who = aclp->z_ops->ace_who_get(acep); aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; aclnode->z_ace_idx++; return ((void *)acep); } return (NULL); } static uintptr_t zfs_ace_walk(void *datap, uintptr_t cookie, int aclcnt, uint16_t *flags, uint16_t *type, uint32_t *mask) { (void) aclcnt; zfs_acl_t *aclp = datap; zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)cookie; uint64_t who; acep = zfs_acl_next_ace(aclp, acep, &who, mask, flags, type); return ((uintptr_t)acep); } /* * Copy ACE to internal ZFS format. * While processing the ACL each ACE will be validated for correctness. * ACE FUIDs will be created later. */ static int zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp, void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, zfs_fuid_info_t **fuidp, cred_t *cr) { int i; uint16_t entry_type; zfs_ace_t *aceptr = z_acl; ace_t *acep = datap; zfs_object_ace_t *zobjacep; ace_object_t *aceobjp; for (i = 0; i != aclcnt; i++) { aceptr->z_hdr.z_access_mask = acep->a_access_mask; aceptr->z_hdr.z_flags = acep->a_flags; aceptr->z_hdr.z_type = acep->a_type; entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE) { aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, cr, (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); } /* * Make sure ACE is valid */ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type, aceptr->z_hdr.z_flags) != B_TRUE) return (SET_ERROR(EINVAL)); switch (acep->a_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: zobjacep = (zfs_object_ace_t *)aceptr; aceobjp = (ace_object_t *)acep; memcpy(zobjacep->z_object_type, aceobjp->a_obj_type, sizeof (aceobjp->a_obj_type)); memcpy(zobjacep->z_inherit_type, aceobjp->a_inherit_obj_type, sizeof (aceobjp->a_inherit_obj_type)); acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); break; default: acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); } aceptr = (zfs_ace_t *)((caddr_t)aceptr + aclp->z_ops->ace_size(aceptr)); } *size = (caddr_t)aceptr - (caddr_t)z_acl; return (0); } /* * Copy ZFS ACEs to fixed size ace_t layout */ static void zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, void *datap, int filter) { uint64_t who; uint32_t access_mask; uint16_t iflags, type; zfs_ace_hdr_t *zacep = NULL; ace_t *acep = datap; ace_object_t *objacep; zfs_object_ace_t *zobjacep; size_t ace_size; uint16_t entry_type; while ((zacep = zfs_acl_next_ace(aclp, zacep, &who, &access_mask, &iflags, &type))) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: if (filter) { continue; } zobjacep = (zfs_object_ace_t *)zacep; objacep = (ace_object_t *)acep; memcpy(objacep->a_obj_type, zobjacep->z_object_type, sizeof (zobjacep->z_object_type)); memcpy(objacep->a_inherit_obj_type, zobjacep->z_inherit_type, sizeof (zobjacep->z_inherit_type)); ace_size = sizeof (ace_object_t); break; default: ace_size = sizeof (ace_t); break; } entry_type = (iflags & ACE_TYPE_FLAGS); if ((entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE)) { acep->a_who = zfs_fuid_map_id(zfsvfs, who, cr, (entry_type & ACE_IDENTIFIER_GROUP) ? ZFS_ACE_GROUP : ZFS_ACE_USER); } else { acep->a_who = (uid_t)(int64_t)who; } acep->a_access_mask = access_mask; acep->a_flags = iflags; acep->a_type = type; acep = (ace_t *)((caddr_t)acep + ace_size); } } static int zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, zfs_oldace_t *z_acl, int aclcnt, size_t *size) { int i; zfs_oldace_t *aceptr = z_acl; for (i = 0; i != aclcnt; i++, aceptr++) { aceptr->z_access_mask = acep[i].a_access_mask; aceptr->z_type = acep[i].a_type; aceptr->z_flags = acep[i].a_flags; aceptr->z_fuid = acep[i].a_who; /* * Make sure ACE is valid */ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type, aceptr->z_flags) != B_TRUE) return (SET_ERROR(EINVAL)); } *size = (caddr_t)aceptr - (caddr_t)z_acl; return (0); } /* * convert old ACL format to new */ void zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) { zfs_oldace_t *oldaclp; int i; uint16_t type, iflags; uint32_t access_mask; uint64_t who; void *cookie = NULL; zfs_acl_node_t *newaclnode; ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); /* * First create the ACE in a contiguous piece of memory * for zfs_copy_ace_2_fuid(). * * We only convert an ACL once, so this won't happen * every time. */ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, KM_SLEEP); i = 0; while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, &access_mask, &iflags, &type))) { oldaclp[i].z_flags = iflags; oldaclp[i].z_type = type; oldaclp[i].z_fuid = who; oldaclp[i++].z_access_mask = access_mask; } newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * sizeof (zfs_object_ace_t)); aclp->z_ops = &zfs_acl_fuid_ops; VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode, aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count, &newaclnode->z_size, NULL, cr) == 0); newaclnode->z_ace_count = aclp->z_acl_count; aclp->z_version = ZFS_ACL_VERSION; kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); /* * Release all previous ACL nodes */ zfs_acl_release_nodes(aclp); list_insert_head(&aclp->z_acl, newaclnode); aclp->z_acl_bytes = newaclnode->z_size; aclp->z_acl_count = newaclnode->z_ace_count; } /* * Convert unix access mask to v4 access mask */ static uint32_t zfs_unix_to_v4(uint32_t access_mask) { uint32_t new_mask = 0; if (access_mask & S_IXOTH) new_mask |= ACE_EXECUTE; if (access_mask & S_IWOTH) new_mask |= ACE_WRITE_DATA; if (access_mask & S_IROTH) new_mask |= ACE_READ_DATA; return (new_mask); } static int zfs_v4_to_unix(uint32_t access_mask, int *unmapped) { int new_mask = 0; *unmapped = access_mask & (ACE_WRITE_OWNER | ACE_WRITE_ACL | ACE_DELETE); if (access_mask & WRITE_MASK) new_mask |= S_IWOTH; if (access_mask & ACE_READ_DATA) new_mask |= S_IROTH; if (access_mask & ACE_EXECUTE) new_mask |= S_IXOTH; return (new_mask); } static void zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, uint16_t access_type, uint64_t fuid, uint16_t entry_type) { uint16_t type = entry_type & ACE_TYPE_FLAGS; aclp->z_ops->ace_mask_set(acep, access_mask); aclp->z_ops->ace_type_set(acep, access_type); aclp->z_ops->ace_flags_set(acep, entry_type); if ((type != ACE_OWNER && type != OWNING_GROUP && type != ACE_EVERYONE)) aclp->z_ops->ace_who_set(acep, fuid); } /* * Determine mode of file based on ACL. */ uint64_t zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, uint64_t *pflags, uint64_t fuid, uint64_t fgid) { int entry_type; mode_t mode; mode_t seen = 0; zfs_ace_hdr_t *acep = NULL; uint64_t who; uint16_t iflags, type; uint32_t access_mask; boolean_t an_exec_denied = B_FALSE; mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type))) { if (!zfs_acl_valid_ace_type(type, iflags)) continue; entry_type = (iflags & ACE_TYPE_FLAGS); /* * Skip over any inherit_only ACEs */ if (iflags & ACE_INHERIT_ONLY_ACE) continue; if (entry_type == ACE_OWNER || (entry_type == 0 && who == fuid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRUSR))) { seen |= S_IRUSR; if (type == ALLOW) { mode |= S_IRUSR; } } if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWUSR))) { seen |= S_IWUSR; if (type == ALLOW) { mode |= S_IWUSR; } } if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXUSR))) { seen |= S_IXUSR; if (type == ALLOW) { mode |= S_IXUSR; } } } else if (entry_type == OWNING_GROUP || (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRGRP))) { seen |= S_IRGRP; if (type == ALLOW) { mode |= S_IRGRP; } } if ((access_mask & ACE_WRITE_DATA) && (!(seen & S_IWGRP))) { seen |= S_IWGRP; if (type == ALLOW) { mode |= S_IWGRP; } } if ((access_mask & ACE_EXECUTE) && (!(seen & S_IXGRP))) { seen |= S_IXGRP; if (type == ALLOW) { mode |= S_IXGRP; } } } else if (entry_type == ACE_EVERYONE) { if ((access_mask & ACE_READ_DATA)) { if (!(seen & S_IRUSR)) { seen |= S_IRUSR; if (type == ALLOW) { mode |= S_IRUSR; } } if (!(seen & S_IRGRP)) { seen |= S_IRGRP; if (type == ALLOW) { mode |= S_IRGRP; } } if (!(seen & S_IROTH)) { seen |= S_IROTH; if (type == ALLOW) { mode |= S_IROTH; } } } if ((access_mask & ACE_WRITE_DATA)) { if (!(seen & S_IWUSR)) { seen |= S_IWUSR; if (type == ALLOW) { mode |= S_IWUSR; } } if (!(seen & S_IWGRP)) { seen |= S_IWGRP; if (type == ALLOW) { mode |= S_IWGRP; } } if (!(seen & S_IWOTH)) { seen |= S_IWOTH; if (type == ALLOW) { mode |= S_IWOTH; } } } if ((access_mask & ACE_EXECUTE)) { if (!(seen & S_IXUSR)) { seen |= S_IXUSR; if (type == ALLOW) { mode |= S_IXUSR; } } if (!(seen & S_IXGRP)) { seen |= S_IXGRP; if (type == ALLOW) { mode |= S_IXGRP; } } if (!(seen & S_IXOTH)) { seen |= S_IXOTH; if (type == ALLOW) { mode |= S_IXOTH; } } } } else { /* * Only care if this IDENTIFIER_GROUP or * USER ACE denies execute access to someone, * mode is not affected */ if ((access_mask & ACE_EXECUTE) && type == DENY) an_exec_denied = B_TRUE; } } /* * Failure to allow is effectively a deny, so execute permission * is denied if it was never mentioned or if we explicitly * weren't allowed it. */ if (!an_exec_denied && ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) an_exec_denied = B_TRUE; if (an_exec_denied) *pflags &= ~ZFS_NO_EXECS_DENIED; else *pflags |= ZFS_NO_EXECS_DENIED; return (mode); } /* * Read an external acl object. If the intent is to modify, always * create a new acl and leave any cached acl in place. */ int zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp, boolean_t will_modify) { zfs_acl_t *aclp; int aclsize = 0; int acl_count = 0; zfs_acl_node_t *aclnode; zfs_acl_phys_t znode_acl; int version; int error; boolean_t drop_lock = B_FALSE; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); if (zp->z_acl_cached && !will_modify) { *aclpp = zp->z_acl_cached; return (0); } /* * close race where znode could be upgrade while trying to * read the znode attributes. * * But this could only happen if the file isn't already an SA * znode */ if (!zp->z_is_sa && !have_lock) { mutex_enter(&zp->z_lock); drop_lock = B_TRUE; } version = zfs_znode_acl_version(zp); if ((error = zfs_acl_znode_info(zp, &aclsize, &acl_count, &znode_acl)) != 0) { goto done; } aclp = zfs_acl_alloc(version); aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; aclnode = zfs_acl_node_alloc(aclsize); aclnode->z_ace_count = aclp->z_acl_count; aclnode->z_size = aclsize; if (!zp->z_is_sa) { if (znode_acl.z_acl_extern_obj) { error = dmu_read(ZTOZSB(zp)->z_os, znode_acl.z_acl_extern_obj, 0, aclnode->z_size, aclnode->z_acldata, DMU_READ_PREFETCH); } else { memcpy(aclnode->z_acldata, znode_acl.z_ace_data, aclnode->z_size); } } else { error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)), aclnode->z_acldata, aclnode->z_size); } if (error != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); goto done; } list_insert_head(&aclp->z_acl, aclnode); *aclpp = aclp; if (!will_modify) zp->z_acl_cached = aclp; done: if (drop_lock) mutex_exit(&zp->z_lock); return (error); } void zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, boolean_t start, void *userdata) { (void) buflen; zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; if (start) { cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); } else { cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, cb->cb_acl_node); } ASSERT3P(cb->cb_acl_node, !=, NULL); *dataptr = cb->cb_acl_node->z_acldata; *length = cb->cb_acl_node->z_size; } int zfs_acl_chown_setattr(znode_t *zp) { int error; zfs_acl_t *aclp; if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIX) return (0); ASSERT(MUTEX_HELD(&zp->z_lock)); ASSERT(MUTEX_HELD(&zp->z_acl_lock)); error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); if (error == 0 && aclp->z_acl_count > 0) zp->z_mode = ZTOI(zp)->i_mode = zfs_mode_compute(zp->z_mode, aclp, &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid)); /* * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL * nor a DACL_ACES SA in which case ENOENT is returned from * zfs_acl_node_read() when the SA can't be located. * Allow chown/chgrp to succeed in these cases rather than * returning an error that makes no sense in the context of * the caller. */ if (error == ENOENT) return (0); return (error); } typedef struct trivial_acl { uint32_t allow0; /* allow mask for bits only in owner */ uint32_t deny1; /* deny mask for bits not in owner */ uint32_t deny2; /* deny mask for bits not in group */ uint32_t owner; /* allow mask matching mode */ uint32_t group; /* allow mask matching mode */ uint32_t everyone; /* allow mask matching mode */ } trivial_acl_t; static void acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) { uint32_t read_mask = ACE_READ_DATA; uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; uint32_t execute_mask = ACE_EXECUTE; if (isdir) write_mask |= ACE_DELETE_CHILD; masks->deny1 = 0; if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) masks->deny1 |= read_mask; if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) masks->deny1 |= write_mask; if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) masks->deny1 |= execute_mask; masks->deny2 = 0; if (!(mode & S_IRGRP) && (mode & S_IROTH)) masks->deny2 |= read_mask; if (!(mode & S_IWGRP) && (mode & S_IWOTH)) masks->deny2 |= write_mask; if (!(mode & S_IXGRP) && (mode & S_IXOTH)) masks->deny2 |= execute_mask; masks->allow0 = 0; if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) masks->allow0 |= read_mask; if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) masks->allow0 |= write_mask; if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) masks->allow0 |= execute_mask; masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; if (mode & S_IRUSR) masks->owner |= read_mask; if (mode & S_IWUSR) masks->owner |= write_mask; if (mode & S_IXUSR) masks->owner |= execute_mask; masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| ACE_SYNCHRONIZE; if (mode & S_IRGRP) masks->group |= read_mask; if (mode & S_IWGRP) masks->group |= write_mask; if (mode & S_IXGRP) masks->group |= execute_mask; masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| ACE_SYNCHRONIZE; if (mode & S_IROTH) masks->everyone |= read_mask; if (mode & S_IWOTH) masks->everyone |= write_mask; if (mode & S_IXOTH) masks->everyone |= execute_mask; } /* * ace_trivial: * determine whether an ace_t acl is trivial * * Trivialness implies that the acl is composed of only * owner, group, everyone entries. ACL can't * have read_acl denied, and write_owner/write_acl/write_attributes * can only be owner@ entry. */ static int ace_trivial_common(void *acep, int aclcnt, uintptr_t (*walk)(void *, uintptr_t, int, uint16_t *, uint16_t *, uint32_t *)) { uint16_t flags; uint32_t mask; uint16_t type; uint64_t cookie = 0; while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { switch (flags & ACE_TYPE_FLAGS) { case ACE_OWNER: case ACE_GROUP|ACE_IDENTIFIER_GROUP: case ACE_EVERYONE: break; default: return (1); } if (flags & (ACE_FILE_INHERIT_ACE| ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| ACE_INHERIT_ONLY_ACE)) return (1); /* * Special check for some special bits * * Don't allow anybody to deny reading basic * attributes or a files ACL. */ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && (type == ACE_ACCESS_DENIED_ACE_TYPE)) return (1); /* * Delete permission is never set by default */ if (mask & ACE_DELETE) return (1); /* * Child delete permission should be accompanied by write */ if ((mask & ACE_DELETE_CHILD) && !(mask & ACE_WRITE_DATA)) return (1); /* * only allow owner@ to have * write_acl/write_owner/write_attributes/write_xattr/ */ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && (!(flags & ACE_OWNER) && (mask & (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| ACE_WRITE_NAMED_ATTRS)))) return (1); } return (0); } /* * common code for setting ACLs. * * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's * already checked the acl and knows whether to inherit. */ int zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { int error; zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_object_type_t otype; zfs_acl_locator_cb_t locate = { 0 }; uint64_t mode; sa_bulk_attr_t bulk[5]; uint64_t ctime[2]; int count = 0; zfs_acl_phys_t acl_phys; mode = zp->z_mode; mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid)); zp->z_mode = ZTOI(zp)->i_mode = mode; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } /* * Upgrade needed? */ if (!zfsvfs->z_use_fuids) { otype = DMU_OT_OLDACL; } else { if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && (zfsvfs->z_version >= ZPL_VERSION_FUID)) zfs_acl_xform(zp, aclp, cr); ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); otype = DMU_OT_ACL; } /* * Arrgh, we have to handle old on disk format * as well as newer (preferred) SA format. */ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ locate.cb_aclp = aclp; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, aclp->z_acl_bytes); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &aclp->z_acl_count, sizeof (uint64_t)); } else { /* Painful legacy way */ zfs_acl_node_t *aclnode; uint64_t off = 0; uint64_t aoid; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), &acl_phys, sizeof (acl_phys))) != 0) return (error); aoid = acl_phys.z_acl_extern_obj; if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { /* * If ACL was previously external and we are now * converting to new ACL format then release old * ACL object and create a new one. */ if (aoid && aclp->z_version != acl_phys.z_acl_version) { error = dmu_object_free(zfsvfs->z_os, aoid, tx); if (error) return (error); aoid = 0; } if (aoid == 0) { aoid = dmu_object_alloc(zfsvfs->z_os, otype, aclp->z_acl_bytes, otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, otype == DMU_OT_ACL ? DN_OLD_MAX_BONUSLEN : 0, tx); } else { (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, aclp->z_acl_bytes, 0, tx); } acl_phys.z_acl_extern_obj = aoid; for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { if (aclnode->z_ace_count == 0) continue; dmu_write(zfsvfs->z_os, aoid, off, aclnode->z_size, aclnode->z_acldata, tx); off += aclnode->z_size; } } else { void *start = acl_phys.z_ace_data; /* * Migrating back embedded? */ if (acl_phys.z_acl_extern_obj) { error = dmu_object_free(zfsvfs->z_os, acl_phys.z_acl_extern_obj, tx); if (error) return (error); acl_phys.z_acl_extern_obj = 0; } for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { if (aclnode->z_ace_count == 0) continue; memcpy(start, aclnode->z_acldata, aclnode->z_size); start = (caddr_t)start + aclnode->z_size; } } /* * If Old version then swap count/bytes to match old * layout of znode_acl_phys_t. */ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { acl_phys.z_acl_size = aclp->z_acl_count; acl_phys.z_acl_count = aclp->z_acl_bytes; } else { acl_phys.z_acl_size = aclp->z_acl_bytes; acl_phys.z_acl_count = aclp->z_acl_count; } acl_phys.z_acl_version = aclp->z_version; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (acl_phys)); } /* * Replace ACL wide bits, but first clear them. */ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; zp->z_pflags |= aclp->z_hints; if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) zp->z_pflags |= ZFS_ACL_TRIVIAL; zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); } static void zfs_acl_chmod(boolean_t isdir, uint64_t mode, boolean_t split, boolean_t trim, zfs_acl_t *aclp) { void *acep = NULL; uint64_t who; int new_count, new_bytes; int ace_size; int entry_type; uint16_t iflags, type; uint32_t access_mask; zfs_acl_node_t *newnode; size_t abstract_size = aclp->z_ops->ace_abstract_size(); void *zacep; trivial_acl_t masks; new_count = new_bytes = 0; acl_trivial_access_masks((mode_t)mode, isdir, &masks); newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); zacep = newnode->z_acldata; if (masks.allow0) { zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } if (masks.deny1) { zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } if (masks.deny2) { zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; } while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type))) { entry_type = (iflags & ACE_TYPE_FLAGS); /* * ACEs used to represent the file mode may be divided * into an equivalent pair of inherit-only and regular * ACEs, if they are inheritable. * Skip regular ACEs, which are replaced by the new mode. */ if (split && (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || entry_type == ACE_EVERYONE)) { if (!isdir || !(iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) continue; /* * We preserve owner@, group@, or @everyone * permissions, if they are inheritable, by * copying them to inherit_only ACEs. This * prevents inheritable permissions from being * altered along with the file mode. */ iflags |= ACE_INHERIT_ONLY_ACE; } /* * If this ACL has any inheritable ACEs, mark that in * the hints (which are later masked into the pflags) * so create knows to do inheritance. */ if (isdir && (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) aclp->z_hints |= ZFS_INHERIT_ACE; if ((type != ALLOW && type != DENY) || (iflags & ACE_INHERIT_ONLY_ACE)) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: aclp->z_hints |= ZFS_ACL_OBJ_ACE; break; } } else { /* * Limit permissions to be no greater than * group permissions. * The "aclinherit" and "aclmode" properties * affect policy for create and chmod(2), * respectively. */ if ((type == ALLOW) && trim) access_mask &= masks.group; } zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); ace_size = aclp->z_ops->ace_size(acep); zacep = (void *)((uintptr_t)zacep + ace_size); new_count++; new_bytes += ace_size; } zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); zacep = (void *)((uintptr_t)zacep + abstract_size); zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); new_count += 3; new_bytes += abstract_size * 3; zfs_acl_release_nodes(aclp); aclp->z_acl_count = new_count; aclp->z_acl_bytes = new_bytes; newnode->z_ace_count = new_count; newnode->z_size = new_bytes; list_insert_tail(&aclp->z_acl, newnode); } int zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { int error = 0; mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_DISCARD) *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); else error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); if (error == 0) { (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; zfs_acl_chmod(S_ISDIR(ZTOI(zp)->i_mode), mode, B_TRUE, (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); } mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); return (error); } /* * Should ACE be inherited? */ static int zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags) { int iflags = (acep_flags & 0xf); if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) return (1); else if (iflags & ACE_FILE_INHERIT_ACE) return (!(S_ISDIR(obj_mode) && (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); return (0); } /* * inherit inheritable ACEs from parent */ static zfs_acl_t * zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp, uint64_t mode, boolean_t *need_chmod) { void *pacep = NULL; void *acep; zfs_acl_node_t *aclnode; zfs_acl_t *aclp = NULL; uint64_t who; uint32_t access_mask; uint16_t iflags, newflags, type; size_t ace_size; void *data1, *data2; size_t data1sz, data2sz; uint_t aclinherit; boolean_t isdir = S_ISDIR(va_mode); boolean_t isreg = S_ISREG(va_mode); *need_chmod = B_TRUE; aclp = zfs_acl_alloc(paclp->z_version); aclinherit = zfsvfs->z_acl_inherit; if (aclinherit == ZFS_ACL_DISCARD || S_ISLNK(va_mode)) return (aclp); while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, &access_mask, &iflags, &type))) { /* * don't inherit bogus ACEs */ if (!zfs_acl_valid_ace_type(type, iflags)) continue; /* * Check if ACE is inheritable by this vnode */ if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || !zfs_ace_can_use(va_mode, iflags)) continue; /* * If owner@, group@, or everyone@ inheritable * then zfs_acl_chmod() isn't needed. */ if ((aclinherit == ZFS_ACL_PASSTHROUGH || aclinherit == ZFS_ACL_PASSTHROUGH_X) && ((iflags & (ACE_OWNER|ACE_EVERYONE)) || ((iflags & OWNING_GROUP) == OWNING_GROUP)) && (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) *need_chmod = B_FALSE; /* * Strip inherited execute permission from file if * not in mode */ if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { access_mask &= ~ACE_EXECUTE; } /* * Strip write_acl and write_owner from permissions * when inheriting an ACE */ if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { access_mask &= ~RESTRICTED_CLEAR; } ace_size = aclp->z_ops->ace_size(pacep); aclnode = zfs_acl_node_alloc(ace_size); list_insert_tail(&aclp->z_acl, aclnode); acep = aclnode->z_acldata; zfs_set_ace(aclp, acep, access_mask, type, who, iflags|ACE_INHERITED_ACE); /* * Copy special opaque data if any */ if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { VERIFY((data2sz = aclp->z_ops->ace_data(acep, &data2)) == data1sz); memcpy(data2, data1, data2sz); } aclp->z_acl_count++; aclnode->z_ace_count++; aclp->z_acl_bytes += aclnode->z_size; newflags = aclp->z_ops->ace_flags_get(acep); /* * If ACE is not to be inherited further, or if the vnode is * not a directory, remove all inheritance flags */ if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { newflags &= ~ALL_INHERIT; aclp->z_ops->ace_flags_set(acep, newflags|ACE_INHERITED_ACE); continue; } /* * This directory has an inheritable ACE */ aclp->z_hints |= ZFS_INHERIT_ACE; /* * If only FILE_INHERIT is set then turn on * inherit_only */ if ((iflags & (ACE_FILE_INHERIT_ACE | ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { newflags |= ACE_INHERIT_ONLY_ACE; aclp->z_ops->ace_flags_set(acep, newflags|ACE_INHERITED_ACE); } else { newflags &= ~ACE_INHERIT_ONLY_ACE; aclp->z_ops->ace_flags_set(acep, newflags|ACE_INHERITED_ACE); } } if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && aclp->z_acl_count != 0) { *need_chmod = B_FALSE; } return (aclp); } /* * Create file system object initial permissions * including inheritable ACEs. * Also, create FUIDs for owner and group. */ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids, zuserns_t *mnt_ns) { int error; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zfs_acl_t *paclp; gid_t gid = vap->va_gid; boolean_t need_chmod = B_TRUE; boolean_t trim = B_FALSE; boolean_t inherited = B_FALSE; memset(acl_ids, 0, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = vap->va_mode; if (vsecp) if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_mode, vsecp, cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) return (error); acl_ids->z_fuid = vap->va_uid; acl_ids->z_fgid = vap->va_gid; #ifdef HAVE_KSID /* * Determine uid and gid. */ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) { acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &acl_ids->z_fuidp); acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; } else { acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, cr, &acl_ids->z_fuidp); acl_ids->z_fgid = 0; if (vap->va_mask & AT_GID) { acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; if (acl_ids->z_fgid != KGID_TO_SGID(ZTOI(dzp)->i_gid) && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) acl_ids->z_fgid = 0; } if (acl_ids->z_fgid == 0) { if (dzp->z_mode & S_ISGID) { char *domain; uint32_t rid; acl_ids->z_fgid = KGID_TO_SGID( ZTOI(dzp)->i_gid); gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); if (zfsvfs->z_use_fuids && IS_EPHEMERAL(acl_ids->z_fgid)) { domain = zfs_fuid_idx_domain( &zfsvfs->z_fuid_idx, FUID_INDEX(acl_ids->z_fgid)); rid = FUID_RID(acl_ids->z_fgid); zfs_fuid_node_add(&acl_ids->z_fuidp, domain, rid, FUID_INDEX(acl_ids->z_fgid), acl_ids->z_fgid, ZFS_GROUP); } } else { acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, ZFS_GROUP, cr, &acl_ids->z_fuidp); gid = crgetgid(cr); } } } #endif /* HAVE_KSID */ /* * If we're creating a directory, and the parent directory has the * set-GID bit set, set in on the new directory. * Otherwise, if the user is neither privileged nor a member of the * file's new group, clear the file's set-GID bit. */ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && (S_ISDIR(vap->va_mode))) { acl_ids->z_mode |= S_ISGID; } else { if ((acl_ids->z_mode & S_ISGID) && secpolicy_vnode_setids_setgids(cr, gid, mnt_ns, zfs_i_user_ns(ZTOI(dzp))) != 0) { acl_ids->z_mode &= ~S_ISGID; } } if (acl_ids->z_aclp == NULL) { mutex_enter(&dzp->z_acl_lock); mutex_enter(&dzp->z_lock); if (!(flag & IS_ROOT_NODE) && (dzp->z_pflags & ZFS_INHERIT_ACE) && !(dzp->z_pflags & ZFS_XATTR)) { VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, &paclp, B_FALSE)); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_mode, paclp, acl_ids->z_mode, &need_chmod); inherited = B_TRUE; } else { acl_ids->z_aclp = zfs_acl_alloc(zfs_acl_version_zp(dzp)); acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } mutex_exit(&dzp->z_lock); mutex_exit(&dzp->z_acl_lock); if (need_chmod) { if (S_ISDIR(vap->va_mode)) acl_ids->z_aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) trim = B_TRUE; zfs_acl_chmod(vap->va_mode, acl_ids->z_mode, B_FALSE, trim, acl_ids->z_aclp); } } if (inherited || vsecp) { acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, acl_ids->z_fuid, acl_ids->z_fgid); if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } return (0); } /* * Free ACL and fuid_infop, but not the acl_ids structure */ void zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) { if (acl_ids->z_aclp) zfs_acl_free(acl_ids->z_aclp); if (acl_ids->z_fuidp) zfs_fuid_info_free(acl_ids->z_fuidp); acl_ids->z_aclp = NULL; acl_ids->z_fuidp = NULL; } boolean_t zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) { return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); } /* * Retrieve a file's ACL */ int zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfs_acl_t *aclp; ulong_t mask; int error; int count = 0; int largeace = 0; mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); if (mask == 0) return (SET_ERROR(ENOSYS)); if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr, kcred->user_ns))) return (error); mutex_enter(&zp->z_acl_lock); error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } /* * Scan ACL to determine number of ACEs */ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { void *zacep = NULL; uint64_t who; uint32_t access_mask; uint16_t type, iflags; while ((zacep = zfs_acl_next_ace(aclp, zacep, &who, &access_mask, &iflags, &type))) { switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: largeace++; continue; default: count++; } } vsecp->vsa_aclcnt = count; } else count = (int)aclp->z_acl_count; if (mask & VSA_ACECNT) { vsecp->vsa_aclcnt = count; } if (mask & VSA_ACE) { size_t aclsz; aclsz = count * sizeof (ace_t) + sizeof (ace_object_t) * largeace; vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); vsecp->vsa_aclentsz = aclsz; if (aclp->z_version == ZFS_ACL_VERSION_FUID) zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr, vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); else { zfs_acl_node_t *aclnode; void *start = vsecp->vsa_aclentp; for (aclnode = list_head(&aclp->z_acl); aclnode; aclnode = list_next(&aclp->z_acl, aclnode)) { memcpy(start, aclnode->z_acldata, aclnode->z_size); start = (caddr_t)start + aclnode->z_size; } ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == aclp->z_acl_bytes); } } if (mask & VSA_ACE_ACLFLAGS) { vsecp->vsa_aclflags = 0; if (zp->z_pflags & ZFS_ACL_DEFAULTED) vsecp->vsa_aclflags |= ACL_DEFAULTED; if (zp->z_pflags & ZFS_ACL_PROTECTED) vsecp->vsa_aclflags |= ACL_PROTECTED; if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; } mutex_exit(&zp->z_acl_lock); return (0); } int zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode, vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) { zfs_acl_t *aclp; zfs_acl_node_t *aclnode; int aclcnt = vsecp->vsa_aclcnt; int error; if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) return (SET_ERROR(EINVAL)); aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); aclp->z_hints = 0; aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp, (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, &aclnode->z_size)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); } } else { if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp, vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, &aclnode->z_size, fuidp, cr)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); } } aclp->z_acl_bytes = aclnode->z_size; aclnode->z_ace_count = aclcnt; aclp->z_acl_count = aclcnt; list_insert_head(&aclp->z_acl, aclnode); /* * If flags are being set then add them to z_hints */ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { if (vsecp->vsa_aclflags & ACL_PROTECTED) aclp->z_hints |= ZFS_ACL_PROTECTED; if (vsecp->vsa_aclflags & ACL_DEFAULTED) aclp->z_hints |= ZFS_ACL_DEFAULTED; if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; } *zaclp = aclp; return (0); } /* * Set a file's ACL */ int zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zilog_t *zilog = zfsvfs->z_log; ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); dmu_tx_t *tx; int error; zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; boolean_t fuid_dirtied; uint64_t acl_obj; if (mask == 0) return (SET_ERROR(ENOSYS)); if (zp->z_pflags & ZFS_IMMUTABLE) return (SET_ERROR(EPERM)); if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, kcred->user_ns))) return (error); error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp, &aclp); if (error) return (error); /* * If ACL wide flags aren't being set then preserve any * existing flags. */ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { aclp->z_hints |= (zp->z_pflags & V4_ACL_WIDE_FLAGS); } top: mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); /* * If old version and ACL won't fit in bonus and we aren't * upgrading then take out necessary DMU holds */ if ((acl_obj = zfs_external_acl(zp)) != 0) { if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); zfs_acl_free(aclp); return (error); } error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); ASSERT(zp->z_acl_cached == NULL); zp->z_acl_cached = aclp; if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); mutex_exit(&zp->z_lock); mutex_exit(&zp->z_acl_lock); return (error); } /* * Check accesses of interest (AoI) against attributes of the dataset * such as read-only. Returns zero if no AoI conflict with dataset * attributes, otherwise an appropriate errno is returned. */ static int zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) { if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) && (!Z_ISDEV(ZTOI(zp)->i_mode) || (v4_mode & WRITE_MASK_ATTRS))) { return (SET_ERROR(EROFS)); } /* * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common(). */ if ((v4_mode & WRITE_MASK_DATA) && (zp->z_pflags & ZFS_IMMUTABLE)) { return (SET_ERROR(EPERM)); } if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && (zp->z_pflags & ZFS_NOUNLINK)) { return (SET_ERROR(EPERM)); } if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && (zp->z_pflags & ZFS_AV_QUARANTINED))) { return (SET_ERROR(EACCES)); } return (0); } /* * The primary usage of this function is to loop through all of the * ACEs in the znode, determining what accesses of interest (AoI) to * the caller are allowed or denied. The AoI are expressed as bits in * the working_mode parameter. As each ACE is processed, bits covered * by that ACE are removed from the working_mode. This removal * facilitates two things. The first is that when the working mode is * empty (= 0), we know we've looked at all the AoI. The second is * that the ACE interpretation rules don't allow a later ACE to undo * something granted or denied by an earlier ACE. Removing the * discovered access or denial enforces this rule. At the end of * processing the ACEs, all AoI that were found to be denied are * placed into the working_mode, giving the caller a mask of denied * accesses. Returns: * 0 if all AoI granted * EACCES if the denied mask is non-zero * other error if abnormal failure (e.g., IO error) * * A secondary usage of the function is to determine if any of the * AoI are granted. If an ACE grants any access in * the working_mode, we immediately short circuit out of the function. * This mode is chosen by setting anyaccess to B_TRUE. The * working_mode is not a denied access mask upon exit if the function * is used in this manner. */ static int zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, boolean_t anyaccess, cred_t *cr, zuserns_t *mnt_ns) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_acl_t *aclp; int error; uid_t uid = crgetuid(cr); uint64_t who; uint16_t type, iflags; uint16_t entry_type; uint32_t access_mask; uint32_t deny_mask = 0; zfs_ace_hdr_t *acep = NULL; boolean_t checkit; uid_t gowner; uid_t fowner; if (mnt_ns) { fowner = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ZTOI(zp)), KUID_TO_SUID(ZTOI(zp)->i_uid)); gowner = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ZTOI(zp)), KGID_TO_SGID(ZTOI(zp)->i_gid)); } else zfs_fuid_map_ids(zp, cr, &fowner, &gowner); mutex_enter(&zp->z_acl_lock); error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } ASSERT(zp->z_acl_cached); while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type))) { uint32_t mask_matched; if (!zfs_acl_valid_ace_type(type, iflags)) continue; if (S_ISDIR(ZTOI(zp)->i_mode) && (iflags & ACE_INHERIT_ONLY_ACE)) continue; /* Skip ACE if it does not affect any AoI */ mask_matched = (access_mask & *working_mode); if (!mask_matched) continue; entry_type = (iflags & ACE_TYPE_FLAGS); checkit = B_FALSE; switch (entry_type) { case ACE_OWNER: if (uid == fowner) checkit = B_TRUE; break; case OWNING_GROUP: who = gowner; zfs_fallthrough; case ACE_IDENTIFIER_GROUP: checkit = zfs_groupmember(zfsvfs, who, cr); break; case ACE_EVERYONE: checkit = B_TRUE; break; /* USER Entry */ default: if (entry_type == 0) { uid_t newid; newid = zfs_fuid_map_id(zfsvfs, who, cr, ZFS_ACE_USER); if (newid != IDMAP_WK_CREATOR_OWNER_UID && uid == newid) checkit = B_TRUE; break; } else { mutex_exit(&zp->z_acl_lock); return (SET_ERROR(EIO)); } } if (checkit) { if (type == DENY) { DTRACE_PROBE3(zfs__ace__denies, znode_t *, zp, zfs_ace_hdr_t *, acep, uint32_t, mask_matched); deny_mask |= mask_matched; } else { DTRACE_PROBE3(zfs__ace__allows, znode_t *, zp, zfs_ace_hdr_t *, acep, uint32_t, mask_matched); if (anyaccess) { mutex_exit(&zp->z_acl_lock); return (0); } } *working_mode &= ~mask_matched; } /* Are we done? */ if (*working_mode == 0) break; } mutex_exit(&zp->z_acl_lock); /* Put the found 'denies' back on the working mode */ if (deny_mask) { *working_mode |= deny_mask; return (SET_ERROR(EACCES)); } else if (*working_mode) { return (-1); } return (0); } /* * Return true if any access whatsoever granted, we don't actually * care what access is granted. */ boolean_t zfs_has_access(znode_t *zp, cred_t *cr) { uint32_t have = ACE_ALL_PERMS; if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr, kcred->user_ns) != 0) { uid_t owner; owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER); return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0); } return (B_TRUE); } /* * Simplified access check for case where ACL is known to not contain * information beyond what is defined in the mode. In this case, we * can pass along to the kernel / vfs generic_permission() check, which * evaluates the mode and POSIX ACL. * * NFSv4 ACLs allow granting permissions that are usually relegated only * to the file owner or superuser. Examples are ACE_WRITE_OWNER (chown), * ACE_WRITE_ACL(chmod), and ACE_DELETE. ACE_DELETE requests must fail * because with conventional posix permissions, right to delete file * is determined by write bit on the parent dir. * * If unmappable perms are requested, then we must return EPERM * and include those bits in the working_mode so that the caller of * zfs_zaccess_common() can decide whether to perform additional * policy / capability checks. EACCES is used in zfs_zaccess_aces_check() * to indicate access check failed due to explicit DENY entry, and so * we want to avoid that here. */ static int zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr, zuserns_t *mnt_ns) { int err, mask; int unmapped = 0; ASSERT(zp->z_pflags & ZFS_ACL_TRIVIAL); mask = zfs_v4_to_unix(*working_mode, &unmapped); if (mask == 0 || unmapped) { *working_mode = unmapped; return (unmapped ? SET_ERROR(EPERM) : 0); } #if defined(HAVE_IOPS_PERMISSION_USERNS) if (mnt_ns) err = generic_permission(mnt_ns, ZTOI(zp), mask); else err = generic_permission(cr->user_ns, ZTOI(zp), mask); #else err = generic_permission(ZTOI(zp), mask); #endif if (err != 0) { return (SET_ERROR(EPERM)); } *working_mode = unmapped; return (0); } static int zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr, zuserns_t *mnt_ns) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int err; *working_mode = v4_mode; *check_privs = B_TRUE; /* * Short circuit empty requests */ if (v4_mode == 0 || zfsvfs->z_replay) { *working_mode = 0; return (0); } if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { *check_privs = B_FALSE; return (err); } /* * The caller requested that the ACL check be skipped. This * would only happen if the caller checked VOP_ACCESS() with a * 32 bit ACE mask and already had the appropriate permissions. */ if (skipaclchk) { *working_mode = 0; return (0); } /* * Note: ZFS_READONLY represents the "DOS R/O" attribute. * When that flag is set, we should behave as if write access * were not granted by anything in the ACL. In particular: * We _must_ allow writes after opening the file r/w, then * setting the DOS R/O attribute, and writing some more. * (Similar to how you can write after fchmod(fd, 0444).) * * Therefore ZFS_READONLY is ignored in the dataset check * above, and checked here as if part of the ACL check. * Also note: DOS R/O is ignored for directories. */ if ((v4_mode & WRITE_MASK_DATA) && S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_READONLY)) { return (SET_ERROR(EPERM)); } if (zp->z_pflags & ZFS_ACL_TRIVIAL) return (zfs_zaccess_trivial(zp, working_mode, cr, mnt_ns)); return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr, mnt_ns)); } static int zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, cred_t *cr, zuserns_t *mnt_ns) { if (*working_mode != ACE_WRITE_DATA) return (SET_ERROR(EACCES)); return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, check_privs, B_FALSE, cr, mnt_ns)); } int zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) { boolean_t owner = B_FALSE; boolean_t groupmbr = B_FALSE; boolean_t is_attr; uid_t uid = crgetuid(cr); int error; if (zdp->z_pflags & ZFS_AV_QUARANTINED) return (SET_ERROR(EACCES)); is_attr = ((zdp->z_pflags & ZFS_XATTR) && (S_ISDIR(ZTOI(zdp)->i_mode))); if (is_attr) goto slow; mutex_enter(&zdp->z_acl_lock); if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { mutex_exit(&zdp->z_acl_lock); return (0); } if (KUID_TO_SUID(ZTOI(zdp)->i_uid) != 0 || KGID_TO_SGID(ZTOI(zdp)->i_gid) != 0) { mutex_exit(&zdp->z_acl_lock); goto slow; } if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) { - owner = B_TRUE; if (zdp->z_mode & S_IXUSR) { mutex_exit(&zdp->z_acl_lock); return (0); } else { mutex_exit(&zdp->z_acl_lock); goto slow; } } if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) { - groupmbr = B_TRUE; if (zdp->z_mode & S_IXGRP) { mutex_exit(&zdp->z_acl_lock); return (0); } else { mutex_exit(&zdp->z_acl_lock); goto slow; } } if (!owner && !groupmbr) { if (zdp->z_mode & S_IXOTH) { mutex_exit(&zdp->z_acl_lock); return (0); } } mutex_exit(&zdp->z_acl_lock); slow: DTRACE_PROBE(zfs__fastpath__execute__access__miss); if ((error = zfs_enter(ZTOZSB(zdp), FTAG)) != 0) return (error); error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, kcred->user_ns); zfs_exit(ZTOZSB(zdp), FTAG); return (error); } /* * Determine whether Access should be granted/denied. * * The least priv subsystem is always consulted as a basic privilege * can define any form of access. */ int zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, zuserns_t *mnt_ns) { uint32_t working_mode; int error; int is_attr; boolean_t check_privs; znode_t *xzp; znode_t *check_zp = zp; mode_t needed_bits; uid_t owner; is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode)); /* * If attribute then validate against base file */ if (is_attr) { if ((error = zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, &xzp)) != 0) { return (error); } check_zp = xzp; /* * fixup mode to map to xattr perms */ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); mode |= ACE_WRITE_NAMED_ATTRS; } if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { mode &= ~(ACE_READ_DATA|ACE_EXECUTE); mode |= ACE_READ_NAMED_ATTRS; } } owner = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ZTOI(zp)), KUID_TO_SUID(ZTOI(zp)->i_uid)); owner = zfs_fuid_map_id(ZTOZSB(zp), owner, cr, ZFS_OWNER); /* * Map the bits required to the standard inode flags * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits * mapped by working_mode (currently missing) in missing_bits. * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), * needed_bits. */ needed_bits = 0; working_mode = mode; if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && owner == crgetuid(cr)) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) needed_bits |= S_IRUSR; if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) needed_bits |= S_IWUSR; if (working_mode & ACE_EXECUTE) needed_bits |= S_IXUSR; if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, &check_privs, skipaclchk, cr, mnt_ns)) == 0) { if (is_attr) zrele(xzp); return (secpolicy_vnode_access2(cr, ZTOI(zp), owner, needed_bits, needed_bits)); } if (error && !check_privs) { if (is_attr) zrele(xzp); return (error); } if (error && (flags & V_APPEND)) { error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr, mnt_ns); } if (error && check_privs) { mode_t checkmode = 0; /* * First check for implicit owner permission on * read_acl/read_attributes */ - error = 0; ASSERT(working_mode != 0); if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && owner == crgetuid(cr))) working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) checkmode |= S_IRUSR; if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) checkmode |= S_IWUSR; if (working_mode & ACE_EXECUTE) checkmode |= S_IXUSR; error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner, needed_bits & ~checkmode, needed_bits); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) error = secpolicy_vnode_chown(cr, owner); if (error == 0 && (working_mode & ACE_WRITE_ACL)) error = secpolicy_vnode_setdac(cr, owner); if (error == 0 && (working_mode & (ACE_DELETE|ACE_DELETE_CHILD))) error = secpolicy_vnode_remove(cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { error = secpolicy_vnode_chown(cr, owner); } if (error == 0) { /* * See if any bits other than those already checked * for are still present. If so then return EACCES */ if (working_mode & ~(ZFS_CHECKED_MASKS)) { error = SET_ERROR(EACCES); } } } else if (error == 0) { error = secpolicy_vnode_access2(cr, ZTOI(zp), owner, needed_bits, needed_bits); } if (is_attr) zrele(xzp); return (error); } /* * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into * NFSv4-style ZFS ACL format and call zfs_zaccess() */ int zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr, zuserns_t *mnt_ns) { return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr, mnt_ns)); } /* * Access function for secpolicy_vnode_setattr */ int zfs_zaccess_unix(void *zp, int mode, cred_t *cr) { int v4_mode = zfs_unix_to_v4(mode >> 6); return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr, kcred->user_ns)); } /* See zfs_zaccess_delete() */ static const boolean_t zfs_write_implies_delete_child = B_TRUE; /* * Determine whether delete access should be granted. * * The following chart outlines how we handle delete permissions which is * how recent versions of windows (Windows 2008) handles it. The efficiency * comes from not having to check the parent ACL where the object itself grants * delete: * * ------------------------------------------------------- * | Parent Dir | Target Object Permissions | * | permissions | | * ------------------------------------------------------- * | | ACL Allows | ACL Denies| Delete | * | | Delete | Delete | unspecified| * ------------------------------------------------------- * | ACL Allows | Permit | Deny * | Permit | * | DELETE_CHILD | | | | * ------------------------------------------------------- * | ACL Denies | Permit | Deny | Deny | * | DELETE_CHILD | | | | * ------------------------------------------------------- * | ACL specifies | | | | * | only allow | Permit | Deny * | Permit | * | write and | | | | * | execute | | | | * ------------------------------------------------------- * | ACL denies | | | | * | write and | Permit | Deny | Deny | * | execute | | | | * ------------------------------------------------------- * ^ * | * Re. execute permission on the directory: if that's missing, * the vnode lookup of the target will fail before we get here. * * Re [*] in the table above: NFSv4 would normally Permit delete for * these two cells of the matrix. * See acl.h for notes on which ACE_... flags should be checked for which * operations. Specifically, the NFSv4 committee recommendation is in * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs * should take precedence ahead of ALLOW ACEs. * * This implementation always consults the target object's ACL first. * If a DENY ACE is present on the target object that specifies ACE_DELETE, * delete access is denied. If an ALLOW ACE with ACE_DELETE is present on * the target object, access is allowed. If and only if no entries with * ACE_DELETE are present in the object's ACL, check the container's ACL * for entries with ACE_DELETE_CHILD. * * A summary of the logic implemented from the table above is as follows: * * First check for DENY ACEs that apply. * If either target or container has a deny, EACCES. * * Delete access can then be summarized as follows: * 1: The object to be deleted grants ACE_DELETE, or * 2: The containing directory grants ACE_DELETE_CHILD. * In a Windows system, that would be the end of the story. * In this system, (2) has some complications... * 2a: "sticky" bit on a directory adds restrictions, and * 2b: existing ACEs from previous versions of ZFS may * not carry ACE_DELETE_CHILD where they should, so we * also allow delete when ACE_WRITE_DATA is granted. * * Note: 2b is technically a work-around for a prior bug, * which hopefully can go away some day. For those who * no longer need the work around, and for testing, this * work-around is made conditional via the tunable: * zfs_write_implies_delete_child */ int zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zuserns_t *mnt_ns) { uint32_t wanted_dirperms; uint32_t dzp_working_mode = 0; uint32_t zp_working_mode = 0; int dzp_error, zp_error; boolean_t dzpcheck_privs; boolean_t zpcheck_privs; if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) return (SET_ERROR(EPERM)); /* * Case 1: * If target object grants ACE_DELETE then we are done. This is * indicated by a return value of 0. For this case we don't worry * about the sticky bit because sticky only applies to the parent * directory and this is the child access result. * * If we encounter a DENY ACE here, we're also done (EACCES). * Note that if we hit a DENY ACE here (on the target) it should * take precedence over a DENY ACE on the container, so that when * we have more complete auditing support we will be able to * report an access failure against the specific target. * (This is part of why we're checking the target first.) */ zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, &zpcheck_privs, B_FALSE, cr, mnt_ns); if (zp_error == EACCES) { /* We hit a DENY ACE. */ if (!zpcheck_privs) return (SET_ERROR(zp_error)); return (secpolicy_vnode_remove(cr)); } if (zp_error == 0) return (0); /* * Case 2: * If the containing directory grants ACE_DELETE_CHILD, * or we're in backward compatibility mode and the * containing directory has ACE_WRITE_DATA, allow. * Case 2b is handled with wanted_dirperms. */ wanted_dirperms = ACE_DELETE_CHILD; if (zfs_write_implies_delete_child) wanted_dirperms |= ACE_WRITE_DATA; dzp_error = zfs_zaccess_common(dzp, wanted_dirperms, &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr, mnt_ns); if (dzp_error == EACCES) { /* We hit a DENY ACE. */ if (!dzpcheck_privs) return (SET_ERROR(dzp_error)); return (secpolicy_vnode_remove(cr)); } /* * Cases 2a, 2b (continued) * * Note: dzp_working_mode now contains any permissions * that were NOT granted. Therefore, if any of the * wanted_dirperms WERE granted, we will have: * dzp_working_mode != wanted_dirperms * We're really asking if ANY of those permissions * were granted, and if so, grant delete access. */ if (dzp_working_mode != wanted_dirperms) dzp_error = 0; /* * dzp_error is 0 if the container granted us permissions to "modify". * If we do not have permission via one or more ACEs, our current * privileges may still permit us to modify the container. * * dzpcheck_privs is false when i.e. the FS is read-only. * Otherwise, do privilege checks for the container. */ if (dzp_error != 0 && dzpcheck_privs) { uid_t owner; /* * The secpolicy call needs the requested access and * the current access mode of the container, but it * only knows about Unix-style modes (VEXEC, VWRITE), * so this must condense the fine-grained ACE bits into * Unix modes. * * The VEXEC flag is easy, because we know that has * always been checked before we get here (during the * lookup of the target vnode). The container has not * granted us permissions to "modify", so we do not set * the VWRITE flag in the current access mode. */ owner = zfs_fuid_map_id(ZTOZSB(dzp), KUID_TO_SUID(ZTOI(dzp)->i_uid), cr, ZFS_OWNER); dzp_error = secpolicy_vnode_access2(cr, ZTOI(dzp), owner, S_IXUSR, S_IWUSR|S_IXUSR); } if (dzp_error != 0) { /* * Note: We may have dzp_error = -1 here (from * zfs_zacess_common). Don't return that. */ return (SET_ERROR(EACCES)); } /* * At this point, we know that the directory permissions allow * us to modify, but we still need to check for the additional * restrictions that apply when the "sticky bit" is set. * * Yes, zfs_sticky_remove_access() also checks this bit, but * checking it here and skipping the call below is nice when * you're watching all of this with dtrace. */ if ((dzp->z_mode & S_ISVTX) == 0) return (0); /* * zfs_sticky_remove_access will succeed if: * 1. The sticky bit is absent. * 2. We pass the sticky bit restrictions. * 3. We have privileges that always allow file removal. */ return (zfs_sticky_remove_access(dzp, zp, cr)); } int zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, znode_t *tzp, cred_t *cr, zuserns_t *mnt_ns) { int add_perm; int error; if (szp->z_pflags & ZFS_AV_QUARANTINED) return (SET_ERROR(EACCES)); add_perm = S_ISDIR(ZTOI(szp)->i_mode) ? ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; /* * Rename permissions are combination of delete permission + * add file/subdir permission. */ /* * first make sure we do the delete portion. * * If that succeeds then check for add_file/add_subdir permissions */ if ((error = zfs_zaccess_delete(sdzp, szp, cr, mnt_ns))) return (error); /* * If we have a tzp, see if we can delete it? */ if (tzp) { if ((error = zfs_zaccess_delete(tdzp, tzp, cr, mnt_ns))) return (error); } /* * Now check for add permissions */ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr, mnt_ns); return (error); } diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index a94af0ea3bd3..47f132a38abe 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1,4233 +1,4232 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using zfs_enter(zfsvfs). * A zfs_exit(zfsvfs) is needed before all returns. Any znodes * must be checked with zfs_verify_zp(zp). Both of these macros * can return EIO from the calling function. * * (2) zrele() should always be the last thing except for zil_commit() (if * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the * last reference, the vnode/znode can be freed, so the zp may point to * freed memory. Second, the last reference will call zfs_zinactive(), * which may induce a lot of work -- pushing cached pages (which acquires * range locks) and syncing out cached atime changes. Third, * zfs_zinactive() may require a new tx, which could deadlock the system * if you were already holding one. This deadlock occurs because the tx * currently being operated on prevents a txg from syncing, which * prevents the new tx from progressing, resulting in a deadlock. If you * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() * is a synonym for zrele(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * zfs_enter(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * zfs_exit(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * zil_commit(zilog, foid); // synchronous when necessary * zfs_exit(zfsvfs); // finished in zfs * return (error); // done, report error */ int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Honor ZFS_APPENDONLY file attribute */ if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* Keep a count of the synchronous opens in the znode */ if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); zfs_exit(zfsvfs, FTAG); return (0); } int zfs_close(struct inode *ip, int flag, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; uint64_t nbytes; int64_t off; void *pb; off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { nbytes = MIN(PAGE_SIZE - off, len); pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); (void) dmu_read(os, zp->z_id, start + off, nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); unlock_page(pp); put_page(pp); } len -= nbytes; off = 0; } } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; int64_t start, off; uint64_t bytes; int len = nbytes; int error = 0; void *pb; start = uio->uio_loffset; off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { bytes = MIN(PAGE_SIZE - off, len); pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { ASSERT(PageUptodate(pp)); unlock_page(pp); pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); put_page(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); } len -= bytes; off = 0; if (error) break; } return (error); } #endif /* _KERNEL */ static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* * Write the bytes to a file. * * IN: zp - znode of file to be written to * data - bytes to write * len - number of bytes to write * pos - offset to start writing at * * OUT: resid - remaining bytes to write * * RETURN: 0 if success * positive error code if failure. EIO is returned * for a short write when residp isn't provided. * * Timestamps: * zp - ctime|mtime updated if byte count > 0 */ int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *residp) { fstrans_cookie_t cookie; int error; struct iovec iov; iov.iov_base = (void *)data; iov.iov_len = len; zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); cookie = spl_fstrans_mark(); error = zfs_write(zp, &uio, 0, kcred); spl_fstrans_unmark(cookie); if (error == 0) { if (residp != NULL) *residp = zfs_uio_resid(&uio); else if (zfs_uio_resid(&uio) != 0) error = SET_ERROR(EIO); } return (error); } static void zfs_rele_async_task(void *arg) { iput(arg); } void zfs_zrele_async(znode_t *zp) { struct inode *ip = ZTOI(zp); objset_t *os = ITOZSB(ip)->z_os; ASSERT(atomic_read(&ip->i_count) > 0); ASSERT(os != NULL); /* * If decrementing the count would put us at 0, we can't do it inline * here, because that would be synchronous. Instead, dispatch an iput * to run later. * * For more information on the dangers of a synchronous iput, see the * header comment of this file. */ if (!atomic_add_unless(&ip->i_count, -1, 1)) { VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held inode reference for it. * * IN: zdp - znode of directory to search. * nm - name of entry to lookup. * flags - LOOKUP_XATTR set if looking for an attribute. * cr - credentials of caller. * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: zpp - znode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ int zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ZTOZSB(zdp); int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (!S_ISDIR(ZTOI(zdp)->i_mode)) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *zpp = zdp; zhold(*zpp); return (0); } return (error); } } if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) return (error); *zpp = NULL; if (flags & LOOKUP_XATTR) { /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { zfs_exit(zfsvfs, FTAG); return (error); } /* * Do we have permission to get into attribute directory? */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, B_TRUE, cr, kcred->user_ns))) { zrele(*zpp); *zpp = NULL; } zfs_exit(zfsvfs, FTAG); return (error); } if (!S_ISDIR(ZTOI(zdp)->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTDIR)); } /* * Check accessibility of directory. */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, kcred->user_ns))) { zfs_exit(zfsvfs, FTAG); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); if ((error == 0) && (*zpp)) zfs_znode_update_vfs(*zpp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the ip of the created or trunc'd file. * * IN: dzp - znode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated if new entry created * zp - ctime|mtime always, atime if new */ int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zuserns_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } top: *zpp = NULL; if (*name == '\0') { /* * Null component name refers to the directory itself. */ zhold(dzp); zp = dzp; dl = NULL; error = 0; } else { /* possible igrab(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); zfs_exit(zfsvfs, FTAG); return (error); } } if (zp == NULL) { uint64_t txtype; uint64_t projid = ZFS_DEFAULT_PROJID; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EINVAL); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { /* * Since, we failed to add the directory entry for it, * delete the newly created dnode. */ zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & O_APPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); - have_acl = B_FALSE; /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl) { error = SET_ERROR(EEXIST); goto out; } /* * Can't open a directory for writing. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, mnt_ns))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if (S_ISREG(ZTOI(zp)->i_mode) && (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ if (dl) { zfs_dirent_unlock(dl); dl = NULL; } error = zfs_freesp(zp, 0, 0, mode, TRUE); } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, zuserns_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp = NULL, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); objset_t *os; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; uint64_t projid = ZFS_DEFAULT_PROJID; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } top: *ipp = NULL; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* Add to unlinked set */ zp->z_unlinked = B_TRUE; zfs_unlinked_add(zp, tx); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); out: if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *ipp = ZTOI(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove an entry from a directory. * * IN: dzp - znode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime * ip - ctime (if nlink > 0) */ static uint64_t null_xattr = 0; int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) { znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; uint64_t links; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp))) { if (realnmp) pn_free(realnmp); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) { goto out; } /* * Need to use rmdir for removing directories. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EPERM); goto out; } mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && !(zp->z_is_mapped); mutex_exit(&zp->z_lock); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the inode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; } if (delete_now) { if (xattr_obj_unlinked) { ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; clear_nlink(ZTOI(xzp)); links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } /* * Add to the unlinked set because a new reference could be * taken concurrently resulting in a deferred destruction. */ zfs_unlinked_add(zp, tx); mutex_exit(&zp->z_lock); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); if (delete_now) zrele(zp); else zfs_zrele_async(zp); if (xzp) { zfs_znode_update_vfs(xzp); zfs_zrele_async(xzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new directory and insert it into dzp using the name * provided. Return a pointer to the inserted directory. * * IN: dzp - znode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * flags - case flags. * vsecp - ACL to be set * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime updated * zpp - ctime|mtime|atime updated */ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zuserns_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t waited = B_FALSE; ASSERT(S_ISDIR(vap->va_mode)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (dirname == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *zpp = NULL; if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); /* * Now put new name in parent dir. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (error != 0) { zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dzp - znode of directory to remove from. * name - name of directory to be removed. * cwd - inode of current working directory. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated */ int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) { goto out; } if (!S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } if (zp == cwd) { error = SET_ERROR(EINVAL); goto out; } /* * Grab a lock on the directory to make sure that no one is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); goto top; } dmu_tx_abort(tx); zrele(zp); zfs_exit(zfsvfs, FTAG); return (error); } error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); out: zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); zrele(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Read directory entries from the given directory cursor position and emit * name and position for each entry. * * IN: ip - inode of directory to read. * ctx - directory entry context. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; zap_cursor_t zc; zap_attribute_t zap; int error; uint8_t prefetch; uint8_t type; int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) goto out; /* * Quit if directory has been removed (posix) */ if (zp->z_unlinked) goto out; error = 0; os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Transform to file-system independent format */ while (!done) { uint64_t objnum; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, &zap))) { if (error == ENOENT) break; else goto update; } /* * Allow multiple entries provided the first entry is * the object id. Non-zpl consumers may safely make * use of the additional space. * * XXX: This should be a feature flag for compatibility */ if (zap.za_integer_length != 8 || zap.za_num_integers == 0) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld, " "length = %d, num = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset, zap.za_integer_length, (u_longlong_t)zap.za_num_integers); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); type = ZFS_DIRENT_TYPE(zap.za_first_integer); } done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), objnum, type); if (done) break; /* Prefetch znode */ if (prefetch) { dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); } /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } ctx->pos = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ update: zap_cursor_fini(&zc); if (error == ENOENT) error = 0; out: zfs_exit(zfsvfs, FTAG); return (error); } /* * Get the basic file attributes and place them in the provided kstat * structure. The inode is assumed to be the authoritative source * for most of the attributes. However, the znode currently has the * authoritative atime, blksize, and block count. * * IN: ip - inode of file. * * OUT: sp - kstat values. * * RETURN: 0 (always succeeds) */ int zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, struct kstat *sp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); zpl_generic_fillattr(user_ns, ip, sp); /* * +1 link count for root inode with visible '.zfs' directory. */ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) if (sp->nlink < ZFS_LINK_MAX) sp->nlink++; sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); sp->blksize = blksize; sp->blocks = nblocks; if (unlikely(zp->z_blksz == 0)) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ sp->blksize = zfsvfs->z_max_blksz; } mutex_exit(&zp->z_lock); /* * Required to prevent NFS client from detecting different inode * numbers of snapshot root dentry before and after snapshot mount. */ if (zfsvfs->z_issnap) { if (ip->i_sb->s_root->d_inode == ip) sp->ino = ZFSCTL_INO_SNAPDIRS - dmu_objset_id(zfsvfs->z_os); } zfs_exit(zfsvfs, FTAG); return (0); } /* * For the operation of changing file's user/group/project, we need to * handle not only the main object that is assigned to the file directly, * but also the ones that are used by the file via hidden xattr directory. * * Because the xattr directory may contains many EA entries, as to it may * be impossible to change all of them via the transaction of changing the * main object's user/group/project attributes. Then we have to change them * via other multiple independent transactions one by one. It may be not good * solution, but we have no better idea yet. */ static int zfs_setattr_dir(znode_t *dzp) { struct inode *dxip = ZTOI(dzp); struct inode *xip = NULL; zfsvfs_t *zfsvfs = ZTOZSB(dzp); objset_t *os = zfsvfs->z_os; zap_cursor_t zc; zap_attribute_t zap; zfs_dirlock_t *dl; znode_t *zp = NULL; dmu_tx_t *tx = NULL; uint64_t uid, gid; sa_bulk_attr_t bulk[4]; int count; int err; zap_cursor_init(&zc, os, dzp->z_id); while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { count = 0; if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { err = ENXIO; break; } err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, ZEXISTS, NULL, NULL); if (err == ENOENT) goto next; if (err) break; xip = ZTOI(zp); if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && zp->z_projid == dzp->z_projid) goto next; tx = dmu_tx_create(os); if (!(zp->z_pflags & ZFS_PROJID)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); err = dmu_tx_assign(tx, TXG_WAIT); if (err) break; mutex_enter(&dzp->z_lock); if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { xip->i_uid = dxip->i_uid; uid = zfs_uid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, sizeof (uid)); } if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { xip->i_gid = dxip->i_gid; gid = zfs_gid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, sizeof (gid)); } if (zp->z_projid != dzp->z_projid) { if (!(zp->z_pflags & ZFS_PROJID)) { zp->z_pflags |= ZFS_PROJID; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); } zp->z_projid = dzp->z_projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } tx = NULL; if (err != 0 && err != ENOENT) break; next: if (zp) { zrele(zp); zp = NULL; zfs_dirent_unlock(dl); } zap_cursor_advance(&zc); } if (tx) dmu_tx_abort(tx); if (zp) { zrele(zp); zfs_dirent_unlock(dl); } zap_cursor_fini(&zc); return (err == ENOENT ? 0 : err); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If ATTR_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * mnt_ns - user namespace of the mount * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime updated, mtime updated if size changed. */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zuserns_t *mnt_ns) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t *tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2], atime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2 = 0; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; boolean_t handle_eadir = B_FALSE; sa_bulk_attr_t *bulk, *xattr_bulk; int count = 0, xattr_count = 0, bulks = 8; if (mask == 0) return (0); if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ip = ZTOI(zp); /* * If this is a xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); if (xoap != NULL && (mask & ATTR_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } } zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); xva_init(tmpxvattr); bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { err = SET_ERROR(EPERM); goto out3; } if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { err = SET_ERROR(EPERM); goto out3; } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (ATTR_ATIME | ATTR_MTIME)) { if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { err = SET_ERROR(EOVERFLOW); goto out3; } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfs_is_readonly(zfsvfs)) { err = SET_ERROR(EROFS); goto out3; } /* * First validate permissions */ if (mask & ATTR_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, mnt_ns); if (err) goto out3; /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) goto out3; } if (mask & (ATTR_ATIME|ATTR_MTIME) || ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr, mnt_ns); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; uid_t uid; gid_t gid; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & ATTR_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ uid = zfs_uid_to_vfsuid((struct user_namespace *)mnt_ns, zfs_i_user_ns(ip), vap->va_uid); gid = zfs_gid_to_vfsgid((struct user_namespace *)mnt_ns, zfs_i_user_ns(ip), vap->va_gid); take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && zfs_groupmember(zfsvfs, gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (ATTR_UID|ATTR_GID)) && take_owner && take_group) || ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr, mnt_ns) == 0) { /* * Remove setuid/setgid for non-privileged users */ (void) secpolicy_setid_clear(vap, cr); trim_mask = (mask & (ATTR_UID|ATTR_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & ATTR_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((!S_ISREG(ip->i_mode) && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); err = SET_ERROR(EPERM); goto out3; } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, mnt_ns) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); if (err) goto out3; trim_mask |= ATTR_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, zfs_zaccess_unix, zp); if (err) goto out3; if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { handle_eadir = B_TRUE; err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); if (err) goto out2; } if (mask & ATTR_UID) { new_kuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_kuid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & ATTR_GID) { new_kgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_kgid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) zrele(attrzp); err = EDQUOT; goto out2; } } tx = dmu_tx_create(os); if (mask & ATTR_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = EPERM; goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. */ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (ATTR_UID|ATTR_GID)) { if (mask & ATTR_UID) { ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); new_uid = zfs_uid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); } } if (mask & ATTR_GID) { ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); new_gid = zfs_gid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); } } if (!(mask & ATTR_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & ATTR_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = ZTOI(zp)->i_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; ZFS_TIME_ENCODE(&ip->i_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( vap->va_mtime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } if (attrzp && mask) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & ATTR_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(S_ISREG(ip->i_mode)); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (attrzp) zrele(attrzp); if (err == ERESTART) goto top; } else { if (count > 0) err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) - err2 = zfs_setattr_dir(attrzp); + err = zfs_setattr_dir(attrzp); zrele(attrzp); } zfs_znode_update_vfs(zp); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); zfs_exit(zfsvfs, FTAG); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) zfs_zrele_async(zl->zl_znode); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = ZTOZSB(zp)->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (SET_ERROR(EINVAL)); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(ZTOZSB(zp), oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdzp - Source directory containing the "old entry". * snm - Old entry name. * tdzp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * flags - case flags * rflags - RENAME_* flags * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). * mnt_ns - user namespace of the mount * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdzp,tdzp - ctime|mtime updated */ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); zilog_t *zilog; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; boolean_t waited = B_FALSE; /* Needed for whiteout inode creation. */ boolean_t fuid_dirtied; zfs_acl_ids_t acl_ids; boolean_t have_acl = B_FALSE; znode_t *wzp = NULL; if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return (SET_ERROR(EINVAL)); /* Already checked by Linux VFS, but just to make sure. */ if (rflags & RENAME_EXCHANGE && (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) return (SET_ERROR(EINVAL)); /* * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the * right kind of vattr_t for the whiteout file. These are set * internally by ZFS so should never be incorrect. */ VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if ((error = zfs_verify_zp(tdzp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || zfsctl_is_node(ZTOI(tdzp))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ zfs_exit(zfsvfs, FTAG); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) zrele(tzp); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(snm, "..") == 0) serr = EINVAL; zfs_exit(zfsvfs, FTAG); return (serr); } if (terr) { zfs_dirent_unlock(sdl); zrele(szp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = EINVAL; zfs_exit(zfsvfs, FTAG); return (terr); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) goto out; } /* * Does target exist? */ if (tzp) { if (rflags & RENAME_NOREPLACE) { error = SET_ERROR(EEXIST); goto out; } /* * Source and target must be the same type (unless exchanging). */ if (!(rflags & RENAME_EXCHANGE)) { boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; if (s_is_dir != t_is_dir) { error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } else if (rflags & RENAME_EXCHANGE) { /* Target must exist for RENAME_EXCHANGE. */ error = SET_ERROR(ENOENT); goto out; } /* Set up inode creation for RENAME_WHITEOUT. */ if (rflags & RENAME_WHITEOUT) { /* * Whiteout files are not regular files or directories, so to * match zfs_create() we do not inherit the project id. */ uint64_t wo_projid = ZFS_DEFAULT_PROJID; error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); if (error) goto out; if (!have_acl) { error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, &acl_ids, mnt_ns); if (error) goto out; have_acl = B_TRUE; } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { error = SET_ERROR(EDQUOT); goto out; } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } if (rflags & RENAME_WHITEOUT) { dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); goto top; } dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Unlink the source. */ szp->z_pflags |= ZFS_AV_MODIFIED; if (tdzp->z_pflags & ZFS_PROJINHERIT) szp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); VERIFY0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error) goto commit; /* * Unlink the target. */ if (tzp) { int tzflg = zflg; if (rflags & RENAME_EXCHANGE) { /* This inode will be re-linked soon. */ tzflg |= ZRENAMING; tzp->z_pflags |= ZFS_AV_MODIFIED; if (sdzp->z_pflags & ZFS_PROJINHERIT) tzp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&tzp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); } error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); if (error) goto commit_link_szp; } /* * Create the new target links: * * We always link the target. * * RENAME_EXCHANGE: Link the old target to the source. * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. */ error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error) { /* * If we have removed the existing target, a subsequent call to * zfs_link_create() to add back the same entry, but with a new * dnode (szp), should not fail. */ ASSERT3P(tzp, ==, NULL); goto commit_link_tzp; } switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { case RENAME_EXCHANGE: error = zfs_link_create(sdl, tzp, tx, ZRENAMING); /* * The same argument as zfs_link_create() failing for * szp applies here, since the source directory must * have had an entry we are replacing. */ ASSERT0(error); if (error) goto commit_unlink_td_szp; break; case RENAME_WHITEOUT: zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); error = zfs_link_create(sdl, wzp, tx, ZNEW); if (error) { zfs_znode_delete(wzp, tx); remove_inode_hash(ZTOI(wzp)); goto commit_unlink_td_szp; } break; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { case RENAME_EXCHANGE: zfs_log_rename_exchange(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); break; case RENAME_WHITEOUT: zfs_log_rename_whiteout(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp, wzp); break; default: ASSERT0(rflags & ~RENAME_NOREPLACE); zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); break; } commit: dmu_tx_commit(tx); out: if (have_acl) zfs_acl_ids_free(&acl_ids); zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (sdzp != tdzp) zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zrele(szp); if (wzp) { zfs_znode_update_vfs(wzp); zrele(wzp); } if (tzp) { zfs_znode_update_vfs(tzp); zrele(tzp); } if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); /* * Clean-up path for broken link state. * * At this point we are in a (very) bad state, so we need to do our * best to correct the state. In particular, all of the nlinks are * wrong because we were destroying and creating links with ZRENAMING. * * In some form, all of these operations have to resolve the state: * * * link_destroy() *must* succeed. Fortunately, this is very likely * since we only just created it. * * * link_create()s are allowed to fail (though they shouldn't because * we only just unlinked them and are putting the entries back * during clean-up). But if they fail, we can just forcefully drop * the nlink value to (at the very least) avoid broken nlink values * -- though in the case of non-empty directories we will have to * panic (otherwise we'd have a leaked directory with a broken ..). */ commit_unlink_td_szp: VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); commit_link_tzp: if (tzp) { if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); } commit_link_szp: if (zfs_link_create(sdl, szp, tx, ZRENAMING)) VERIFY0(zfs_drop_nlink(szp, tx, NULL)); goto commit; } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dzp - Directory to contain new symbolic link. * name - Name of directory entry in dip. * vap - Attributes of new entry. * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags * mnt_ns - user namespace of the mount * * OUT: zpp - Znode for new symbolic link. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated */ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns) { znode_t *zp; zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; boolean_t waited = B_FALSE; ASSERT(S_ISLNK(vap->va_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } top: *zpp = NULL; /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datasets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); } else { zrele(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by ip. * * IN: ip - inode of symbolic link * uio - structure to contain the link path. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated */ int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); zfs_exit(zfsvfs, FTAG); return (error); } /* * Insert a new entry into directory tdzp referencing szp. * * IN: tdzp - Directory to contain new entry. * szp - znode of new entry. * name - name of new entry. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdzp - ctime|mtime updated * szp - ctime updated */ int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags) { struct inode *sip = ZTOI(szp); znode_t *tzp; zfsvfs_t *zfsvfs = ZTOZSB(tdzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zf = ZNEW; uint64_t parent; uid_t owner; boolean_t waited = B_FALSE; boolean_t is_tmpfile = 0; uint64_t txg; #ifdef HAVE_TMPFILE is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); #endif ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (S_ISDIR(sip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_verify_zp(szp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if (parent == zfsvfs->z_shares_dir) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, kcred->user_ns))) { zfs_exit(zfsvfs, FTAG); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); if (error) { zfs_exit(zfsvfs, FTAG); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); if (is_tmpfile) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ if (is_tmpfile) szp->z_unlinked = B_FALSE; error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; /* * tmpfile is created to be in z_unlinkedobj, so remove it. * Also, we don't log in ZIL, because all previous file * operation on the tmpfile are ignored by ZIL. Instead we * always wait for txg to sync to make sure all previous * operation are sync safe. */ if (is_tmpfile) { VERIFY(zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } } else if (is_tmpfile) { /* restore z_unlinked since when linking failed */ szp->z_unlinked = B_TRUE; } txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zfs_exit(zfsvfs, FTAG); return (error); } static void zfs_putpage_sync_commit_cb(void *arg) { struct page *pp = arg; ClearPageError(pp); end_page_writeback(pp); } static void zfs_putpage_async_commit_cb(void *arg) { struct page *pp = arg; znode_t *zp = ITOZ(pp->mapping->host); ClearPageError(pp); end_page_writeback(pp); atomic_dec_32(&zp->z_async_writes_cnt); } /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. * * IN: ip - page mapped for inode. * pp - page to push (page is locked) * wbc - writeback control data * for_sync - does the caller intend to wait synchronously for the * page writeback to complete? * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated */ int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, boolean_t for_sync) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t offset; loff_t pgoff; unsigned int pglen; dmu_tx_t *tx; caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ASSERT(PageLocked(pp)); pgoff = page_offset(pp); /* Page byte-offset in file */ offset = i_size_read(ip); /* File length in bytes */ pglen = MIN(PAGE_SIZE, /* Page length in bytes */ P2ROUNDUP(offset, PAGE_SIZE)-pgoff); /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); zfs_exit(zfsvfs, FTAG); return (0); } /* Truncate page length to end of file */ if (pgoff + pglen > offset) pglen = offset - pgoff; #if 0 /* * FIXME: Allow mmap writes past its quota. The correct fix * is to register a page_mkwrite() handler to count the page * against its quota when it is about to be dirtied. */ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, KUID_TO_SUID(ip->i_uid)) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, KGID_TO_SGID(ip->i_gid)) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { err = EDQUOT; } #endif /* * The ordering here is critical and must adhere to the following * rules in order to avoid deadlocking in either zfs_read() or * zfs_free_range() due to a lock inversion. * * 1) The page must be unlocked prior to acquiring the range lock. * This is critical because zfs_read() calls find_lock_page() * which may block on the page lock while holding the range lock. * * 2) Before setting or clearing write back on a page the range lock * must be held in order to prevent a lock inversion with the * zfs_free_range() function. * * This presents a problem because upon entering this function the * page lock is already held. To safely acquire the range lock the * page lock must be dropped. This creates a window where another * process could truncate, invalidate, dirty, or write out the page. * * Therefore, after successfully reacquiring the range and page locks * the current page state is checked. In the common case everything * will be as is expected and it can be written out. However, if * the page state has changed it must be handled accordingly. */ mapping = pp->mapping; redirty_page_for_writepage(wbc, pp); unlock_page(pp); zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, pgoff, pglen, RL_WRITER); lock_page(pp); /* Page mapping changed or it was no longer dirty, we're done */ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (0); } /* Another process started write block if required */ if (PageWriteback(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Speed up any non-sync page writebacks since * they may take several seconds to complete. * Refer to the comment in zpl_fsync() (when * HAVE_FSYNC_RANGE is defined) for details. */ if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { zil_commit(zfsvfs->z_log, zp->z_id); } if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); #else wait_on_page_bit(pp, PG_writeback); #endif } zfs_exit(zfsvfs, FTAG); return (0); } /* Clear the dirty flag the required locks are held */ if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (0); } /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; if (!for_sync) atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO filemap_dirty_folio(page_mapping(pp), page_folio(pp)); #else __set_page_dirty_nobuffers(pp); #endif ClearPageError(pp); end_page_writeback(pp); if (!for_sync) atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (err); } va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, for_sync ? zfs_putpage_sync_commit_cb : zfs_putpage_async_commit_cb, pp); dmu_tx_commit(tx); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Note that this is rarely called under writepages(), because * writepages() normally handles the entire commit for * performance reasons. */ zil_commit(zfsvfs->z_log, zp->z_id); } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { /* * If the caller does not intend to wait synchronously * for this page writeback to complete and there are active * synchronous calls on this file, do a commit so that * the latter don't accidentally end up waiting for * our writeback to complete. Refer to the comment in * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. */ zil_commit(zfsvfs->z_log, zp->z_id); } dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); zfs_exit(zfsvfs, FTAG); return (err); } /* * Update the system attributes when the inode has been dirtied. For the * moment we only update the mode, atime, mtime, and ctime. */ int zfs_dirty_inode(struct inode *ip, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); #ifdef I_DIRTY_TIME /* * This is the lazytime semantic introduced in Linux 4.0 * This flag will only be called from update_time when lazytime is set. * (Note, I_DIRTY_SYNC will also set if not lazytime) * Fortunately mtime and ctime are managed within ZFS itself, so we * only need to dirty atime. */ if (flags == I_DIRTY_TIME) { zp->z_atime_dirty = B_TRUE; goto out; } #endif tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } mutex_enter(&zp->z_lock); zp->z_atime_dirty = B_FALSE; SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_atime, atime); ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); mode = ip->i_mode; zp->z_mode = mode; error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); mutex_exit(&zp->z_lock); dmu_tx_commit(tx); out: zfs_exit(zfsvfs, FTAG); return (error); } void zfs_inactive(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. rollback */ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { need_unlock = 1; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); } if (zp->z_sa_hdl == NULL) { if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); return; } if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { ZFS_TIME_ENCODE(&ip->i_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); zp->z_atime_dirty = B_FALSE; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * Fill pages with data from the disk. */ static int zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; struct page *cur_pp; u_offset_t io_off, total; size_t io_len; loff_t i_size; unsigned page_idx; int err; os = zfsvfs->z_os; io_len = nr_pages << PAGE_SHIFT; i_size = i_size_read(ip); io_off = page_offset(pl[0]); if (io_off + io_len > i_size) io_len = i_size - io_off; /* * Iterate over list of pages and read each page individually. */ page_idx = 0; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { caddr_t va; cur_pp = pl[page_idx++]; va = kmap(cur_pp); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, DMU_READ_PREFETCH); kunmap(cur_pp); if (err) { /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } } return (0); } /* * Uses zfs_fillpage to read data from the file and fill the pages. * * IN: ip - inode of file to get data from. * pl - list of pages to read * nr_pages - number of pages to read * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int err; if (pl == NULL) return (0); if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); err = zfs_fillpage(ip, pl, nr_pages); dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE); zfs_exit(zfsvfs, FTAG); return (err); } /* * Check ZFS specific permissions to memory map a section of a file. * * IN: ip - inode of the file to mmap * off - file offset * addrp - start address in memory region * len - length of memory region * vm_flags- address flags * * RETURN: 0 if success * error code if failure */ int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags) { (void) addrp; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((vm_flags & VM_WRITE) && (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENXIO)); } zfs_exit(zfsvfs, FTAG); return (0); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: zp - znode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * zp - ctime|mtime updated */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { (void) offset; zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (cmd != F_FREESP) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, kcred->user_ns))) { zfs_exit(zfsvfs, FTAG); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } if ((error = zfs_verify_zp(zp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } gen = (uint32_t)gen64; size = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); EXPORT_SYMBOL(zfs_symlink); EXPORT_SYMBOL(zfs_readlink); EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); /* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); #endif diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index d673da38463b..395e2cf57e26 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -1,2266 +1,2265 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2007 Jeremy Teo */ #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL static kmem_cache_t *znode_cache = NULL; static kmem_cache_t *znode_hold_cache = NULL; unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; /* * This is used by the test suite so that it can delay znodes from being * freed in order to inspect the unlinked set. */ static int zfs_unlink_suspend_progress = 0; /* * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on * z_rangelock. It will modify the offset and length of the lock to reflect * znode-specific information, and convert RL_APPEND to RL_WRITER. This is * called with the rangelock_t's rl_lock held, which avoids races. */ static void zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; /* * If in append mode, convert to writer and lock starting at the * current end of file. */ if (new->lr_type == RL_APPEND) { new->lr_offset = zp->z_size; new->lr_type = RL_WRITER; } /* * If we need to grow the block size then lock the whole file range. */ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { new->lr_offset = 0; new->lr_length = UINT64_MAX; } } static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_t *zp = buf; inode_init_once(ZTOI(zp)); list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; return (0); } static void zfs_znode_cache_destructor(void *buf, void *arg) { (void) arg; znode_t *zp = buf; ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } static int zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_hold_t *zh = buf; mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); zh->zh_refcount = 0; return (0); } static void zfs_znode_hold_cache_destructor(void *buf, void *arg) { (void) arg; znode_hold_t *zh = buf; mutex_destroy(&zh->zh_lock); } void zfs_znode_init(void) { /* * Initialize zcache. The KMC_SLAB hint is used in order that it be * backed by kmalloc() when on the Linux slab in order that any * wait_on_bit() operations on the related inode operate properly. */ ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB); ASSERT(znode_hold_cache == NULL); znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); } void zfs_znode_fini(void) { /* * Cleanup zcache */ if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; if (znode_hold_cache) kmem_cache_destroy(znode_hold_cache); znode_hold_cache = NULL; } /* * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to * serialize access to a znode and its SA buffer while the object is being * created or destroyed. This kind of locking would normally reside in the * znode itself but in this case that's impossible because the znode and SA * buffer may not yet exist. Therefore the locking is handled externally * with an array of mutexes and AVLs trees which contain per-object locks. * * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted * in to the correct AVL tree and finally the per-object lock is held. In * zfs_znode_hold_exit() the process is reversed. The per-object lock is * released, removed from the AVL tree and destroyed if there are no waiters. * * This scheme has two important properties: * * 1) No memory allocations are performed while holding one of the z_hold_locks. * This ensures evict(), which can be called from direct memory reclaim, will * never block waiting on a z_hold_locks which just happens to have hashed * to the same index. * * 2) All locks used to serialize access to an object are per-object and never * shared. This minimizes lock contention without creating a large number * of dedicated locks. * * On the downside it does require znode_lock_t structures to be frequently * allocated and freed. However, because these are backed by a kmem cache * and very short lived this cost is minimal. */ int zfs_znode_hold_compare(const void *a, const void *b) { const znode_hold_t *zh_a = (const znode_hold_t *)a; const znode_hold_t *zh_b = (const znode_hold_t *)b; return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); } static boolean_t __maybe_unused zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t held; search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE; mutex_exit(&zfsvfs->z_hold_locks[i]); return (held); } static znode_hold_t * zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, *zh_new, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t found = B_FALSE; zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); if (likely(zh == NULL)) { zh = zh_new; zh->zh_obj = obj; avl_add(&zfsvfs->z_hold_trees[i], zh); } else { ASSERT3U(zh->zh_obj, ==, obj); found = B_TRUE; } zh->zh_refcount++; ASSERT3S(zh->zh_refcount, >, 0); mutex_exit(&zfsvfs->z_hold_locks[i]); if (found == B_TRUE) kmem_cache_free(znode_hold_cache, zh_new); ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); mutex_enter(&zh->zh_lock); return (zh); } static void zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) { int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); boolean_t remove = B_FALSE; ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); mutex_exit(&zh->zh_lock); mutex_enter(&zfsvfs->z_hold_locks[i]); ASSERT3S(zh->zh_refcount, >, 0); if (--zh->zh_refcount == 0) { avl_remove(&zfsvfs->z_hold_trees[i], zh); remove = B_TRUE; } mutex_exit(&zfsvfs->z_hold_locks[i]); if (remove == B_TRUE) kmem_cache_free(znode_hold_cache, zh); } dev_t zfs_cmpldev(uint64_t dev) { return (dev); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); mutex_enter(&zp->z_lock); ASSERT(zp->z_sa_hdl == NULL); ASSERT(zp->z_acl_cached == NULL); if (sa_hdl == NULL) { VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; mutex_exit(&zp->z_lock); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked || RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } /* * Called by new_inode() to allocate a new inode. */ int zfs_inode_alloc(struct super_block *sb, struct inode **ip) { znode_t *zp; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); *ip = ZTOI(zp); return (0); } /* * Called in multiple places when an inode should be destroyed. */ void zfs_inode_destroy(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); mutex_enter(&zfsvfs->z_znodes_lock); if (list_link_active(&zp->z_link_node)) { list_remove(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes--; } mutex_exit(&zfsvfs->z_znodes_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } kmem_cache_free(znode_cache, zp); } static void zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) { uint64_t rdev = 0; switch (ip->i_mode & S_IFMT) { case S_IFREG: ip->i_op = &zpl_inode_operations; ip->i_fop = &zpl_file_operations; ip->i_mapping->a_ops = &zpl_address_space_operations; break; case S_IFDIR: #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER ip->i_flags |= S_IOPS_WRAPPER; ip->i_op = &zpl_dir_inode_operations.ops; #else ip->i_op = &zpl_dir_inode_operations; #endif ip->i_fop = &zpl_dir_file_operations; ITOZ(ip)->z_zn_prefetch = B_TRUE; break; case S_IFLNK: ip->i_op = &zpl_symlink_inode_operations; break; /* * rdev is only stored in a SA only for device files. */ case S_IFCHR: case S_IFBLK: (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, sizeof (rdev)); zfs_fallthrough; case S_IFIFO: case S_IFSOCK: init_special_inode(ip, ip->i_mode, rdev); ip->i_op = &zpl_special_inode_operations; break; default: zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", (u_longlong_t)ip->i_ino, ip->i_mode); /* Assume the inode is a file and attempt to continue */ ip->i_mode = S_IFREG | 0644; ip->i_op = &zpl_inode_operations; ip->i_fop = &zpl_file_operations; ip->i_mapping->a_ops = &zpl_address_space_operations; break; } } static void zfs_set_inode_flags(znode_t *zp, struct inode *ip) { /* * Linux and Solaris have different sets of file attributes, so we * restrict this conversion to the intersection of the two. */ #ifdef HAVE_INODE_SET_FLAGS unsigned int flags = 0; if (zp->z_pflags & ZFS_IMMUTABLE) flags |= S_IMMUTABLE; if (zp->z_pflags & ZFS_APPENDONLY) flags |= S_APPEND; inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); #else if (zp->z_pflags & ZFS_IMMUTABLE) ip->i_flags |= S_IMMUTABLE; else ip->i_flags &= ~S_IMMUTABLE; if (zp->z_pflags & ZFS_APPENDONLY) ip->i_flags |= S_APPEND; else ip->i_flags &= ~S_APPEND; #endif } /* * Update the embedded inode given the znode. */ void zfs_znode_update_vfs(znode_t *zp) { - zfsvfs_t *zfsvfs; struct inode *ip; uint32_t blksize; u_longlong_t i_blocks; ASSERT(zp != NULL); - zfsvfs = ZTOZSB(zp); ip = ZTOI(zp); /* Skip .zfs control nodes which do not exist on disk. */ if (zfsctl_is_node(ip)) return; dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); spin_lock(&ip->i_lock); ip->i_mode = zp->z_mode; ip->i_blocks = i_blocks; i_size_write(ip, zp->z_size); spin_unlock(&ip->i_lock); } /* * Construct a znode+inode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; struct inode *ip; uint64_t mode; uint64_t parent; uint64_t tmp_gen; uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; ASSERT(zfsvfs != NULL); ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); zp = ITOZ(ip); ASSERT(zp->z_dirlocks == NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && (zp->z_pflags & ZFS_PROJID) && sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; goto error; } zp->z_projid = projid; zp->z_mode = ip->i_mode = mode; ip->i_generation = (uint32_t)tmp_gen; ip->i_blkbits = SPA_MINBLOCKSHIFT; set_nlink(ip, (uint32_t)links); zfs_uid_write(ip, z_uid); zfs_gid_write(ip, z_gid); zfs_set_inode_flags(zp, ip); /* Cache the xattr parent id */ if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; ZFS_TIME_DECODE(&ip->i_atime, atime); ZFS_TIME_DECODE(&ip->i_mtime, mtime); ZFS_TIME_DECODE(&ip->i_ctime, ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; zfs_znode_update_vfs(zp); zfs_inode_set_ops(zfsvfs, ip); /* * The only way insert_inode_locked() can fail is if the ip->i_ino * number is already hashed for this super block. This can never * happen because the inode numbers map 1:1 with the object numbers. * * Exceptions include rolling back a mounted file system, either * from the zfs rollback or zfs recv command. * * Active inodes are unhashed during the rollback, but since zrele * can happen asynchronously, we can't guarantee they've been * unhashed. This can cause hash collisions in unlinked drain * processing so do not hash unlinked znodes. */ if (links > 0) VERIFY3S(insert_inode_locked(ip), ==, 0); mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes++; mutex_exit(&zfsvfs->z_znodes_lock); if (links > 0) unlock_new_inode(ip); return (zp); error: iput(ip); return (NULL); } /* * Safely mark an inode dirty. Inodes which are part of a read-only * file system or snapshot may not be dirtied. */ void zfs_mark_inode_dirty(struct inode *ip) { zfsvfs_t *zfsvfs = ITOZSB(ip); if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return; mark_inode_dirty(ip); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_TMPFILE - new object is of O_TMPFILE * IS_XATTR - new object is an attribute * acl_ids - ACL related attributes * * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t projid = ZFS_DEFAULT_PROJID; uint64_t rdev = 0; zfsvfs_t *zfsvfs = ZTOZSB(dzp); dmu_buf_t *db; inode_timespec_t now; uint64_t gen, obj; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; znode_hold_t *zh; if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (S_ISDIR(vap->va_mode)) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } zh = zfs_znode_hold_enter(zfsvfs, obj); VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } /* * If parent is an xattr, so am I. */ if (dzp->z_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (S_ISDIR(vap->va_mode)) { size = 2; /* contents ("." and "..") */ links = 2; } else { size = 0; links = (flag & IS_TMPFILE) ? 0 : 1; } if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) rdev = vap->va_rdev; parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { /* * With ZFS_PROJID flag, we can easily know whether there is * project ID stored on disk or not. See zfs_space_delta_cb(). */ if (obj_type != DMU_OT_ZNODE && dmu_objset_projectquota_enabled(zfsvfs->z_os)) pflags |= ZFS_PROJID; /* * Inherit project ID from parent if required. */ projid = zfs_inherit_projid(dzp); if (dzp->z_pflags & ZFS_PROJINHERIT) pflags |= ZFS_PROJINHERIT; } /* * No execs denied will be determined when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & ATTR_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & ATTR_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && pflags & ZFS_PROJID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); } if (obj_type == DMU_OT_ZNODE || (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); if (!(flag & IS_ROOT_NODE)) { /* * The call to zfs_znode_alloc() may fail if memory is low * via the call path: alloc_inode() -> inode_init_always() -> * security_inode_alloc() -> inode_alloc_security(). Since * the existing code is written such that zfs_mknode() can * not fail retry until sufficient memory has been reclaimed. */ do { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); } while (*zpp == NULL); VERIFY(*zpp != NULL); VERIFY(dzp != NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; (*zpp)->z_dnodesize = dnodesize; (*zpp)->z_projid = projid; if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); zfs_znode_hold_exit(zfsvfs, zh); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; boolean_t update_inode = B_FALSE; xoap = xva_getxoptattr(xvap); ASSERT(xoap); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (update_inode) zfs_set_inode_flags(zp, ZTOI(zp)); } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; znode_hold_t *zh; int err; sa_handle_t *hdl; *zpp = NULL; again: zh = zfs_znode_hold_enter(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); /* * If zp->z_unlinked is set, the znode is already marked * for deletion and should not be discovered. Check this * after checking igrab() due to fsetxattr() & O_TMPFILE. * * If igrab() returns NULL the VFS has independently * determined the inode should be evicted and has * called iput_final() to start the eviction process. * The SA handle is still valid but because the VFS * requires that the eviction succeed we must drop * our locks and references to allow the eviction to * complete. The zfs_zget() may then be retried. * * This unlikely case could be optimized by registering * a sops->drop_inode() callback. The callback would * need to detect the active SA hold thereby informing * the VFS that this inode should not be evicted. */ if (igrab(ZTOI(zp)) == NULL) { if (zp->z_unlinked) err = SET_ERROR(ENOENT); else err = SET_ERROR(EAGAIN); } else { *zpp = zp; err = 0; } mutex_exit(&zp->z_lock); sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); if (err == EAGAIN) { /* inode might need this to finish evict */ cond_resched(); goto again; } return (err); } /* * Not found create new znode/vnode but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } zfs_znode_hold_exit(zfsvfs, zh); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_object_info_t doi; dmu_buf_t *db; uint64_t obj_num = zp->z_id; uint64_t mode; uint64_t links; sa_bulk_attr_t bulk[11]; int err; int count = 0; uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; /* * skip ctldir, otherwise they will always get invalidated. This will * cause funny behaviour for the mounted snapdirs. Especially for * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent * anyone automount it again as long as someone is still using the * detached mount. */ if (zp->z_is_ctldir) return (0); zh = zfs_znode_hold_enter(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } rw_exit(&zp->z_xattr_lock); ASSERT(zp->z_sa_hdl == NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, sizeof (z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, sizeof (z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8); if (err != 0 && err != ENOENT) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(err)); } } zp->z_projid = projid; zp->z_mode = ZTOI(zp)->i_mode = mode; zfs_uid_write(ZTOI(zp), z_uid); zfs_gid_write(ZTOI(zp), z_gid); ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } set_nlink(ZTOI(zp), (uint32_t)links); zfs_set_inode_flags(zp, ZTOI(zp)); zp->z_blksz = doi.doi_data_block_size; zp->z_atime_dirty = B_FALSE; zfs_znode_update_vfs(zp); /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatic removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. */ zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); if (zp->z_unlinked) zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); znode_hold_t *zh; zh = zfs_znode_hold_enter(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY(0 == dmu_object_free(os, acl_obj, tx)); } VERIFY(0 == dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t z_id = zp->z_id; znode_hold_t *zh; ASSERT(zp->z_sa_hdl); /* * Don't allow a zfs_zget() while were trying to release this znode. */ zh = zfs_znode_hold_enter(zfsvfs, z_id); mutex_enter(&zp->z_lock); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set. */ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { mutex_exit(&zp->z_lock); zfs_znode_hold_exit(zfsvfs, zh); zfs_rmnode(zp); return; } } mutex_exit(&zp->z_lock); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } #if defined(HAVE_INODE_TIMESPEC64_TIMES) #define zfs_compare_timespec timespec64_compare #else #define zfs_compare_timespec timespec_compare #endif /* * Determine whether the znode's atime must be updated. The logic mostly * duplicates the Linux kernel's relatime_need_update() functionality. * This function is only called if the underlying filesystem actually has * atime updates enabled. */ boolean_t zfs_relatime_need_update(const struct inode *ip) { inode_timespec_t now; gethrestime(&now); /* * In relatime mode, only update the atime if the previous atime * is earlier than either the ctime or mtime or if at least a day * has passed since the last update of atime. */ if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) return (B_TRUE); if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) return (B_TRUE); if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) return (B_TRUE); return (B_FALSE); } /* * Prepare to update znode time stamps. * * IN: zp - znode requiring timestamp update * flag - ATTR_MTIME, ATTR_CTIME flags * * OUT: zp - z_seq * mtime - new mtime * ctime - new ctime * * Note: We don't update atime here, because we rely on Linux VFS to do * atime updating. */ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { inode_timespec_t now; gethrestime(&now); zp->z_seq++; if (flag & ATTR_MTIME) { ZFS_TIME_ENCODE(&now, mtime); ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); if (ZTOZSB(zp)->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), &zp->z_size, sizeof (zp->z_size), tx)); zfs_rangelock_exit(lr); dmu_tx_commit(tx); return (0); } /* * zfs_zero_partial_page - Modeled after update_pages() but * with different arguments and semantics for use by zfs_freesp(). * * Zeroes a piece of a single page cache entry for zp at offset * start and length len. * * Caller must acquire a range lock on the file for the region * being zeroed in order that the ARC and page cache stay in sync. */ static void zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) { struct address_space *mp = ZTOI(zp)->i_mapping; struct page *pp; int64_t off; void *pb; ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); off = start & (PAGE_SIZE - 1); start &= PAGE_MASK; pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); memset(pb + off, 0, len); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); unlock_page(pp); put_page(pp); } } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); /* * Zero partial page cache entries. This must be done under a * range lock in order to keep the ARC and page cache in sync. */ if (zp->z_is_mapped) { loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; /* first possible full page in hole */ first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; /* last page of hole */ last_page = (off + len) >> PAGE_SHIFT; /* offset of first_page */ first_page_offset = first_page << PAGE_SHIFT; /* offset of last_page */ last_page_offset = last_page << PAGE_SHIFT; /* truncate whole pages */ if (last_page_offset > first_page_offset) { truncate_inode_pages_range(ZTOI(zp)->i_mapping, first_page_offset, last_page_offset - 1); } /* truncate sub-page ranges */ if (first_page > last_page) { /* entire punched area within a single page */ zfs_zero_partial_page(zp, off, len); } else { /* beginning of punched area at the end of a page */ page_len = first_page_offset - off; if (page_len > 0) zfs_zero_partial_page(zp, off, page_len); /* end of punched area at the beginning of a page */ page_len = off + len - last_page_offset; if (page_len > 0) zfs_zero_partial_page(zp, last_page_offset, page_len); } } zfs_rangelock_exit(lr); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); dmu_tx_commit(tx); zfs_rangelock_exit(lr); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(zp); zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; goto out; } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) goto out; log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); zfs_znode_update_vfs(zp); error = 0; out: /* * Truncate the page cache - for file truncate operations, use * the purpose-built API for truncations. For punching operations, * the truncation is handled under a range lock in zfs_free_range. */ if (len == 0) truncate_setsize(ZTOI(zp), off); return (error); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { struct super_block *sb; zfsvfs_t *zfsvfs; uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int size; int error; int i; znode_t *rootzp = NULL; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT(error == 0); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT(error == 0); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + ASSERT(error == 0); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT(error == 0); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* * Create root znode. Create minimal znode/inode/zfsvfs/sb * to allow zfs_mknode to work. */ vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); rootzp->z_unlinked = B_FALSE; rootzp->z_atime_dirty = B_FALSE; rootzp->z_is_sa = USE_SA(version, os); rootzp->z_pflags = 0; zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); sb->s_fs_info = zfsvfs; ZTOI(rootzp)->i_sb = sb; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT(error == 0); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); zfsvfs->z_hold_size = size; zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP); zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); for (i = 0; i != size; i++) { avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids, kcred->user_ns)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); zfs_acl_ids_free(&acl_ids); atomic_set(&ZTOI(rootzp)->i_count, 0); sa_handle_destroy(rootzp->z_sa_hdl); kmem_cache_free(znode_cache, rootzp); for (i = 0; i != size; i++) { avl_destroy(&zfsvfs->z_hold_trees[i]); mutex_destroy(&zfsvfs->z_hold_locks[i]); } mutex_destroy(&zfsvfs->z_znodes_lock); vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); kmem_free(sb, sizeof (struct super_block)); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } #endif /* _KERNEL */ static int zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) { uint64_t sa_obj = 0; int error; error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0 && error != ENOENT) return (error); error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); return (error); } static int zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, dmu_buf_t **db, const void *tag) { dmu_object_info_t doi; int error; if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) return (error); dmu_object_info_from_db(*db, &doi); if ((doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t))) { sa_buf_rele(*db, tag); return (SET_ERROR(ENOTSUP)); } error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); if (error != 0) { sa_buf_rele(*db, tag); return (error); } return (0); } static void zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag) { sa_handle_destroy(hdl); sa_buf_rele(db, tag); } /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. */ static int zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; uint64_t parent_mode; sa_bulk_attr_t bulk[3]; sa_handle_t *sa_hdl; dmu_buf_t *sa_db; int count = 0; int error; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, &parent, sizeof (parent)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, &pflags, sizeof (pflags)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &mode, sizeof (mode)); if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); /* * When a link is removed its parent pointer is not changed and will * be invalid. There are two cases where a link is removed but the * file stays around, when it goes to the delete queue and when there * are additional links. */ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); if (error != 0) return (error); error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); if (error != 0) return (error); *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); /* * Extended attributes can be applied to files, directories, etc. * Otherwise the parent must be a directory. */ if (!*is_xattrdir && !S_ISDIR(parent_mode)) return (SET_ERROR(EINVAL)); *pobjp = parent; return (0); } /* * Given an object number, return some zpl level statistics */ static int zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, zfs_stat_t *sb) { sa_bulk_attr_t bulk[4]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &sb->zs_mode, sizeof (sb->zs_mode)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, &sb->zs_gen, sizeof (sb->zs_gen)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, &sb->zs_links, sizeof (sb->zs_links)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, &sb->zs_ctime, sizeof (sb->zs_ctime)); return (sa_bulk_lookup(hdl, bulk, count)); } static int zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, sa_attr_type_t *sa_table, char *buf, int len) { sa_handle_t *sa_hdl; sa_handle_t *prevhdl = NULL; dmu_buf_t *prevdb = NULL; dmu_buf_t *sa_db = NULL; char *path = buf + len - 1; int error; *path = '\0'; sa_hdl = hdl; uint64_t deleteq_obj; VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); error = zap_lookup_int(osp, deleteq_obj, obj); if (error == 0) { return (ESTALE); } else if (error != ENOENT) { return (error); } for (;;) { uint64_t pobj = 0; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir = 0; if (prevdb) { ASSERT(prevhdl != NULL); zfs_release_sa_handle(prevhdl, prevdb, FTAG); } if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { strcpy(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT(path >= buf); memcpy(path, component, complen); obj = pobj; if (sa_hdl != hdl) { prevhdl = sa_hdl; prevdb = sa_db; } error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); if (error != 0) { sa_hdl = prevhdl; sa_db = prevdb; break; } } if (sa_hdl != NULL && sa_hdl != hdl) { ASSERT(sa_db != NULL); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, char *buf, int len) { char *path = buf + len - 1; sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; *path = '\0'; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_stats_impl(hdl, sa_table, sb); if (error != 0) { zfs_release_sa_handle(hdl, db, FTAG); return (error); } error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); /* CSTYLED */ module_param(zfs_object_mutex_size, uint, 0644); MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); module_param(zfs_unlink_suspend_progress, int, 0644); MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " "(debug - leaks space into the unlinked set)"); #endif diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 44c8f486f6d8..fa9b8447e983 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -1,992 +1,992 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (C) 2016 Gvozden NeÅ¡ković. All rights reserved. */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. */ /* * Copyright (c) 2016 by Delphix. All rights reserved. */ /* * Fletcher Checksums * ------------------ * * ZFS's 2nd and 4th order Fletcher checksums are defined by the following * recurrence relations: * * a = a + f * i i-1 i-1 * * b = b + a * i i-1 i * * c = c + b (fletcher-4 only) * i i-1 i * * d = d + c (fletcher-4 only) * i i-1 i * * Where * a_0 = b_0 = c_0 = d_0 = 0 * and * f_0 .. f_(n-1) are the input data. * * Using standard techniques, these translate into the following series: * * __n_ __n_ * \ | \ | * a = > f b = > i * f * n /___| n - i n /___| n - i * i = 1 i = 1 * * * __n_ __n_ * \ | i*(i+1) \ | i*(i+1)*(i+2) * c = > ------- f d = > ------------- f * n /___| 2 n - i n /___| 6 n - i * i = 1 i = 1 * * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. * Since the additions are done mod (2^64), errors in the high bits may not * be noticed. For this reason, fletcher-2 is deprecated. * * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. * A conservative estimate of how big the buffer can get before we overflow * can be estimated using f_i = 0xffffffff for all i: * * % bc * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 * 2264 * quit * % * * So blocks of up to 2k will not overflow. Our largest block size is * 128k, which has 32k 4-byte words, so we can compute the largest possible * accumulators, then divide by 2^64 to figure the max amount of overflow: * * % bc * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } * a/2^64;b/2^64;c/2^64;d/2^64 * 0 * 0 * 1365 * 11186858 * quit * % * * So a and b cannot overflow. To make sure each bit of input has some * effect on the contents of c and d, we can look at what the factors of * the coefficients in the equations for c_n and d_n are. The number of 2s * in the factors determines the lowest set bit in the multiplier. Running * through the cases for n*(n+1)/2 reveals that the highest power of 2 is * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow * the 64-bit accumulators, every bit of every f_i effects every accumulator, * even for 128k blocks. * * If we wanted to make a stronger version of fletcher4 (fletcher4c?), * we could do our calculations mod (2^32 - 1) by adding in the carries * periodically, and store the number of carries in the top 32-bits. * * -------------------- * Checksum Performance * -------------------- * * There are two interesting components to checksum performance: cached and * uncached performance. With cached data, fletcher-2 is about four times * faster than fletcher-4. With uncached data, the performance difference is * negligible, since the cost of a cache fill dominates the processing time. * Even though fletcher-4 is slower than fletcher-2, it is still a pretty * efficient pass over the data. * * In normal operation, the data which is being checksummed is in a buffer * which has been filled either by: * * 1. a compression step, which will be mostly cached, or * 2. a memcpy() or copyin(), which will be uncached * (because the copy is cache-bypassing). * * For both cached and uncached data, both fletcher checksums are much faster * than sha-256, and slower than 'off', which doesn't touch the data at all. */ #include #include #include #include #include #include #include #include #define FLETCHER_MIN_SIMD_SIZE 64 static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx); static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp); static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size); static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size); static boolean_t fletcher_4_scalar_valid(void); static const fletcher_4_ops_t fletcher_4_scalar_ops = { .init_native = fletcher_4_scalar_init, .fini_native = fletcher_4_scalar_fini, .compute_native = fletcher_4_scalar_native, .init_byteswap = fletcher_4_scalar_init, .fini_byteswap = fletcher_4_scalar_fini, .compute_byteswap = fletcher_4_scalar_byteswap, .valid = fletcher_4_scalar_valid, .name = "scalar" }; static fletcher_4_ops_t fletcher_4_fastest_impl = { .name = "fastest", .valid = fletcher_4_scalar_valid }; static const fletcher_4_ops_t *fletcher_4_impls[] = { &fletcher_4_scalar_ops, &fletcher_4_superscalar_ops, &fletcher_4_superscalar4_ops, #if defined(HAVE_SSE2) &fletcher_4_sse2_ops, #endif #if defined(HAVE_SSE2) && defined(HAVE_SSSE3) &fletcher_4_ssse3_ops, #endif #if defined(HAVE_AVX) && defined(HAVE_AVX2) &fletcher_4_avx2_ops, #endif #if defined(__x86_64) && defined(HAVE_AVX512F) &fletcher_4_avx512f_ops, #endif #if defined(__x86_64) && defined(HAVE_AVX512BW) &fletcher_4_avx512bw_ops, #endif #if defined(__aarch64__) && !defined(__FreeBSD__) &fletcher_4_aarch64_neon_ops, #endif }; /* Hold all supported implementations */ static uint32_t fletcher_4_supp_impls_cnt = 0; static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)]; /* Select fletcher4 implementation */ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX - 1) #define IMPL_SCALAR (0) static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST; #define IMPL_READ(i) (*(volatile uint32_t *) &(i)) static struct fletcher_4_impl_selector { const char *fis_name; uint32_t fis_sel; } fletcher_4_impl_selectors[] = { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, { "scalar", IMPL_SCALAR } }; #if defined(_KERNEL) static kstat_t *fletcher_4_kstat; static struct fletcher_4_kstat { uint64_t native; uint64_t byteswap; } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; #endif /* Indicate that benchmark has been completed */ static boolean_t fletcher_4_initialized = B_FALSE; void fletcher_init(zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } int fletcher_2_incremental_native(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; a0 = zcp->zc_word[0]; a1 = zcp->zc_word[1]; b0 = zcp->zc_word[2]; b1 = zcp->zc_word[3]; for (; ip < ipend; ip += 2) { a0 += ip[0]; a1 += ip[1]; b0 += a0; b1 += a1; } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); return (0); } void fletcher_2_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_init(zcp); (void) fletcher_2_incremental_native((void *) buf, size, zcp); } int fletcher_2_incremental_byteswap(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; a0 = zcp->zc_word[0]; a1 = zcp->zc_word[1]; b0 = zcp->zc_word[2]; b1 = zcp->zc_word[3]; for (; ip < ipend; ip += 2) { a0 += BSWAP_64(ip[0]); a1 += BSWAP_64(ip[1]); b0 += a0; b1 += a1; } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); return (0); } void fletcher_2_byteswap(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_init(zcp); (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); } ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx) { ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0); } ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t)); } ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; a = ctx->scalar.zc_word[0]; b = ctx->scalar.zc_word[1]; c = ctx->scalar.zc_word[2]; d = ctx->scalar.zc_word[3]; for (; ip < ipend; ip++) { a += ip[0]; b += a; c += b; d += c; } ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); } ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; a = ctx->scalar.zc_word[0]; b = ctx->scalar.zc_word[1]; c = ctx->scalar.zc_word[2]; d = ctx->scalar.zc_word[3]; for (; ip < ipend; ip++) { a += BSWAP_32(ip[0]); b += a; c += b; d += c; } ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); } static boolean_t fletcher_4_scalar_valid(void) { return (B_TRUE); } int fletcher_4_impl_set(const char *val) { int err = -EINVAL; uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); size_t i, val_len; val_len = strlen(val); while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */ val_len--; /* check mandatory implementations */ for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) { const char *name = fletcher_4_impl_selectors[i].fis_name; if (val_len == strlen(name) && strncmp(val, name, val_len) == 0) { impl = fletcher_4_impl_selectors[i].fis_sel; err = 0; break; } } if (err != 0 && fletcher_4_initialized) { /* check all supported implementations */ for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { const char *name = fletcher_4_supp_impls[i]->name; if (val_len == strlen(name) && strncmp(val, name, val_len) == 0) { impl = i; err = 0; break; } } } if (err == 0) { atomic_swap_32(&fletcher_4_impl_chosen, impl); membar_producer(); } return (err); } /* * Returns the Fletcher 4 operations for checksums. When a SIMD * implementation is not allowed in the current context, then fallback * to the fastest generic implementation. */ static inline const fletcher_4_ops_t * fletcher_4_impl_get(void) { if (!kfpu_allowed()) return (&fletcher_4_superscalar4_ops); const fletcher_4_ops_t *ops = NULL; uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); switch (impl) { case IMPL_FASTEST: ASSERT(fletcher_4_initialized); ops = &fletcher_4_fastest_impl; break; case IMPL_CYCLE: /* Cycle through supported implementations */ ASSERT(fletcher_4_initialized); ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); static uint32_t cycle_count = 0; uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; ops = fletcher_4_supp_impls[idx]; break; default: ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); ops = fletcher_4_supp_impls[impl]; break; } ASSERT3P(ops, !=, NULL); return (ops); } static inline void fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) { fletcher_4_ctx_t ctx; const fletcher_4_ops_t *ops = fletcher_4_impl_get(); ops->init_native(&ctx); ops->compute_native(&ctx, buf, size); ops->fini_native(&ctx, zcp); } void fletcher_4_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); if (size == 0 || p2size == 0) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); if (size > 0) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); } else { fletcher_4_native_impl(buf, p2size, zcp); if (p2size < size) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, (char *)buf + p2size, size - p2size); } } void fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); } static inline void fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) { fletcher_4_ctx_t ctx; const fletcher_4_ops_t *ops = fletcher_4_impl_get(); ops->init_byteswap(&ctx); ops->compute_byteswap(&ctx, buf, size); ops->fini_byteswap(&ctx, zcp); } void fletcher_4_byteswap(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); if (size == 0 || p2size == 0) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); if (size > 0) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); } else { fletcher_4_byteswap_impl(buf, p2size, zcp); if (p2size < size) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, (char *)buf + p2size, size - p2size); } } /* Incremental Fletcher 4 */ #define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20) static inline void fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size, const zio_cksum_t *nzcp) { const uint64_t c1 = size / sizeof (uint32_t); const uint64_t c2 = c1 * (c1 + 1) / 2; const uint64_t c3 = c2 * (c1 + 2) / 3; /* * Value of 'c3' overflows on buffer sizes close to 16MiB. For that * reason we split incremental fletcher4 computation of large buffers * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size. */ ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE); zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] + c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0]; zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] + c2 * zcp->zc_word[0]; zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0]; zcp->zc_word[0] += nzcp->zc_word[0]; } static inline void fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size, zio_cksum_t *zcp) { while (size > 0) { zio_cksum_t nzc; uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE); if (native) fletcher_4_native(buf, len, NULL, &nzc); else fletcher_4_byteswap(buf, len, NULL, &nzc); fletcher_4_incremental_combine(zcp, len, &nzc); size -= len; buf += len; } } int fletcher_4_incremental_native(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; /* Use scalar impl to directly update cksum of small blocks */ if (size < SPA_MINBLOCKSIZE) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); else fletcher_4_incremental_impl(B_TRUE, buf, size, zcp); return (0); } int fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; /* Use scalar impl to directly update cksum of small blocks */ if (size < SPA_MINBLOCKSIZE) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); else fletcher_4_incremental_impl(B_FALSE, buf, size, zcp); return (0); } #if defined(_KERNEL) /* * Fletcher 4 kstats */ static int fletcher_4_kstat_headers(char *buf, size_t size) { ssize_t off = 0; off += snprintf(buf + off, size, "%-17s", "implementation"); off += snprintf(buf + off, size - off, "%-15s", "native"); (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap"); return (0); } static int fletcher_4_kstat_data(char *buf, size_t size, void *data) { struct fletcher_4_kstat *fastest_stat = &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data; ssize_t off = 0; if (curr_stat == fastest_stat) { off += snprintf(buf + off, size - off, "%-17s", "fastest"); off += snprintf(buf + off, size - off, "%-15s", fletcher_4_supp_impls[fastest_stat->native]->name); - off += snprintf(buf + off, size - off, "%-15s\n", + (void) snprintf(buf + off, size - off, "%-15s\n", fletcher_4_supp_impls[fastest_stat->byteswap]->name); } else { ptrdiff_t id = curr_stat - fletcher_4_stat_data; off += snprintf(buf + off, size - off, "%-17s", fletcher_4_supp_impls[id]->name); off += snprintf(buf + off, size - off, "%-15llu", (u_longlong_t)curr_stat->native); - off += snprintf(buf + off, size - off, "%-15llu\n", + (void) snprintf(buf + off, size - off, "%-15llu\n", (u_longlong_t)curr_stat->byteswap); } return (0); } static void * fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) { if (n <= fletcher_4_supp_impls_cnt) ksp->ks_private = (void *) (fletcher_4_stat_data + n); else ksp->ks_private = NULL; return (ksp->ks_private); } #endif #define FLETCHER_4_FASTEST_FN_COPY(type, src) \ { \ fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \ fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \ fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \ } #define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */ typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, zio_cksum_t *); #if defined(_KERNEL) static void fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) { struct fletcher_4_kstat *fastest_stat = &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; hrtime_t start; uint64_t run_bw, run_time_ns, best_run = 0; zio_cksum_t zc; uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); fletcher_checksum_func_t *fletcher_4_test = native ? fletcher_4_native : fletcher_4_byteswap; for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i]; uint64_t run_count = 0; /* temporary set an implementation */ fletcher_4_impl_chosen = i; kpreempt_disable(); start = gethrtime(); do { for (l = 0; l < 32; l++, run_count++) fletcher_4_test(data, data_size, NULL, &zc); run_time_ns = gethrtime() - start; } while (run_time_ns < FLETCHER_4_BENCH_NS); kpreempt_enable(); run_bw = data_size * run_count * NANOSEC; run_bw /= run_time_ns; /* B/s */ if (native) stat->native = run_bw; else stat->byteswap = run_bw; if (run_bw > best_run) { best_run = run_bw; if (native) { fastest_stat->native = i; FLETCHER_4_FASTEST_FN_COPY(native, fletcher_4_supp_impls[i]); } else { fastest_stat->byteswap = i; FLETCHER_4_FASTEST_FN_COPY(byteswap, fletcher_4_supp_impls[i]); } } } /* restore original selection */ atomic_swap_32(&fletcher_4_impl_chosen, sel_save); } #endif /* _KERNEL */ /* * Initialize and benchmark all supported implementations. */ static void fletcher_4_benchmark(void) { fletcher_4_ops_t *curr_impl; int i, c; /* Move supported implementations into fletcher_4_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; if (curr_impl->valid && curr_impl->valid()) fletcher_4_supp_impls[c++] = curr_impl; } membar_producer(); /* complete fletcher_4_supp_impls[] init */ fletcher_4_supp_impls_cnt = c; /* number of supported impl */ #if defined(_KERNEL) static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ char *databuf = vmem_alloc(data_size, KM_SLEEP); for (i = 0; i < data_size / sizeof (uint64_t); i++) ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ fletcher_4_benchmark_impl(B_FALSE, databuf, data_size); fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); vmem_free(databuf, data_size); #else /* * Skip the benchmark in user space to avoid impacting libzpool * consumers (zdb, zhack, zinject, ztest). The last implementation * is assumed to be the fastest and used by default. */ memcpy(&fletcher_4_fastest_impl, fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1], sizeof (fletcher_4_fastest_impl)); fletcher_4_fastest_impl.name = "fastest"; membar_producer(); #endif /* _KERNEL */ } void fletcher_4_init(void) { /* Determine the fastest available implementation. */ fletcher_4_benchmark(); #if defined(_KERNEL) /* Install kstats for all implementations */ fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (fletcher_4_kstat != NULL) { fletcher_4_kstat->ks_data = NULL; fletcher_4_kstat->ks_ndata = UINT32_MAX; kstat_set_raw_ops(fletcher_4_kstat, fletcher_4_kstat_headers, fletcher_4_kstat_data, fletcher_4_kstat_addr); kstat_install(fletcher_4_kstat); } #endif /* Finish initialization */ fletcher_4_initialized = B_TRUE; } void fletcher_4_fini(void) { #if defined(_KERNEL) if (fletcher_4_kstat != NULL) { kstat_delete(fletcher_4_kstat); fletcher_4_kstat = NULL; } #endif } /* ABD adapters */ static void abd_fletcher_4_init(zio_abd_checksum_data_t *cdp) { const fletcher_4_ops_t *ops = fletcher_4_impl_get(); cdp->acd_private = (void *) ops; if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) ops->init_native(cdp->acd_ctx); else ops->init_byteswap(cdp->acd_ctx); } static void abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp) { fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; ASSERT(ops); if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) ops->fini_native(cdp->acd_ctx, cdp->acd_zcp); else ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp); } static void abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size, zio_abd_checksum_data_t *cdp) { zio_cksum_t *zcp = cdp->acd_zcp; ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE); abd_fletcher_4_fini(cdp); cdp->acd_private = (void *)&fletcher_4_scalar_ops; if (native) fletcher_4_incremental_native(data, size, zcp); else fletcher_4_incremental_byteswap(data, size, zcp); } static int abd_fletcher_4_iter(void *data, size_t size, void *private) { zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private; fletcher_4_ctx_t *ctx = cdp->acd_ctx; fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE; uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); if (asize > 0) { if (native) ops->compute_native(ctx, data, asize); else ops->compute_byteswap(ctx, data, asize); size -= asize; data = (char *)data + asize; } if (size > 0) { ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE); /* At this point we have to switch to scalar impl */ abd_fletcher_4_simd2scalar(native, data, size, cdp); } return (0); } zio_abd_checksum_func_t fletcher_4_abd_ops = { .acf_init = abd_fletcher_4_init, .acf_fini = abd_fletcher_4_fini, .acf_iter = abd_fletcher_4_iter }; #if defined(_KERNEL) #define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") #if defined(__linux__) static int fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused) { const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); char *fmt; int cnt = 0; /* list fastest */ fmt = IMPL_FMT(impl, IMPL_FASTEST); cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest"); /* list all supported implementations */ for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, fletcher_4_supp_impls[i]->name); } return (cnt); } static int fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused) { return (fletcher_4_impl_set(val)); } #else #include static int fletcher_4_param(ZFS_MODULE_PARAM_ARGS) { int err; if (req->newptr == NULL) { const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); const int init_buflen = 64; const char *fmt; struct sbuf *s; s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); /* list fastest */ fmt = IMPL_FMT(impl, IMPL_FASTEST); (void) sbuf_printf(s, fmt, "fastest"); /* list all supported implementations */ for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); (void) sbuf_printf(s, fmt, fletcher_4_supp_impls[i]->name); } err = sbuf_finish(s); sbuf_delete(s); return (err); } char buf[16]; err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err) return (err); return (-fletcher_4_impl_set(buf)); } #endif #undef IMPL_FMT /* * Choose a fletcher 4 implementation in ZFS. * Users can choose "cycle" to exercise all implementations, but this is * for testing purpose therefore it can only be set in user space. */ ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl, fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW, "Select fletcher 4 implementation."); EXPORT_SYMBOL(fletcher_init); EXPORT_SYMBOL(fletcher_2_incremental_native); EXPORT_SYMBOL(fletcher_2_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_init); EXPORT_SYMBOL(fletcher_4_fini); EXPORT_SYMBOL(fletcher_2_native); EXPORT_SYMBOL(fletcher_2_byteswap); EXPORT_SYMBOL(fletcher_4_native); EXPORT_SYMBOL(fletcher_4_native_varsize); EXPORT_SYMBOL(fletcher_4_byteswap); EXPORT_SYMBOL(fletcher_4_incremental_native); EXPORT_SYMBOL(fletcher_4_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_abd_ops); #endif diff --git a/module/zfs/fm.c b/module/zfs/fm.c index 3f05d759770b..52ea6262a29f 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -1,1375 +1,1374 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Fault Management Architecture (FMA) Resource and Protocol Support * * The routines contained herein provide services to support kernel subsystems * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089). * * Name-Value Pair Lists * * The embodiment of an FMA protocol element (event, fmri or authority) is a * name-value pair list (nvlist_t). FMA-specific nvlist constructor and * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used * to create an nvpair list using custom allocators. Callers may choose to * allocate either from the kernel memory allocator, or from a preallocated * buffer, useful in constrained contexts like high-level interrupt routines. * * Protocol Event and FMRI Construction * * Convenience routines are provided to construct nvlist events according to * the FMA Event Protocol and Naming Schema specification for ereports and * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes. * * ENA Manipulation * * Routines to generate ENA formats 0, 1 and 2 are available as well as * routines to increment formats 1 and 2. Individual fields within the * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(), * fm_ena_format_get() and fm_ena_gen_get(). */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include static uint_t zfs_zevent_len_max = 512; static uint_t zevent_len_cur = 0; static int zevent_waiters = 0; static int zevent_flags = 0; /* Num events rate limited since the last time zfs_zevent_next() was called */ static uint64_t ratelimit_dropped = 0; /* * The EID (Event IDentifier) is used to uniquely tag a zevent when it is * posted. The posted EIDs are monotonically increasing but not persistent. * They will be reset to the initial value (1) each time the kernel module is * loaded. */ static uint64_t zevent_eid = 0; static kmutex_t zevent_lock; static list_t zevent_list; static kcondvar_t zevent_cv; #endif /* _KERNEL */ /* * Common fault management kstats to record event generation failures */ struct erpt_kstat { kstat_named_t erpt_dropped; /* num erpts dropped on post */ kstat_named_t erpt_set_failed; /* num erpt set failures */ kstat_named_t fmri_set_failed; /* num fmri set failures */ kstat_named_t payload_set_failed; /* num payload set failures */ kstat_named_t erpt_duplicates; /* num duplicate erpts */ }; static struct erpt_kstat erpt_kstat_data = { { "erpt-dropped", KSTAT_DATA_UINT64 }, { "erpt-set-failed", KSTAT_DATA_UINT64 }, { "fmri-set-failed", KSTAT_DATA_UINT64 }, { "payload-set-failed", KSTAT_DATA_UINT64 }, { "erpt-duplicates", KSTAT_DATA_UINT64 } }; kstat_t *fm_ksp; #ifdef _KERNEL static zevent_t * zfs_zevent_alloc(void) { zevent_t *ev; ev = kmem_zalloc(sizeof (zevent_t), KM_SLEEP); list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t), offsetof(zfs_zevent_t, ze_node)); list_link_init(&ev->ev_node); return (ev); } static void zfs_zevent_free(zevent_t *ev) { /* Run provided cleanup callback */ ev->ev_cb(ev->ev_nvl, ev->ev_detector); list_destroy(&ev->ev_ze_list); kmem_free(ev, sizeof (zevent_t)); } static void zfs_zevent_drain(zevent_t *ev) { zfs_zevent_t *ze; ASSERT(MUTEX_HELD(&zevent_lock)); list_remove(&zevent_list, ev); /* Remove references to this event in all private file data */ while ((ze = list_head(&ev->ev_ze_list)) != NULL) { list_remove(&ev->ev_ze_list, ze); ze->ze_zevent = NULL; ze->ze_dropped++; } zfs_zevent_free(ev); } void zfs_zevent_drain_all(uint_t *count) { zevent_t *ev; mutex_enter(&zevent_lock); while ((ev = list_head(&zevent_list)) != NULL) zfs_zevent_drain(ev); *count = zevent_len_cur; zevent_len_cur = 0; mutex_exit(&zevent_lock); } /* * New zevents are inserted at the head. If the maximum queue * length is exceeded a zevent will be drained from the tail. * As part of this any user space processes which currently have * a reference to this zevent_t in their private data will have * this reference set to NULL. */ static void zfs_zevent_insert(zevent_t *ev) { ASSERT(MUTEX_HELD(&zevent_lock)); list_insert_head(&zevent_list, ev); if (zevent_len_cur >= zfs_zevent_len_max) zfs_zevent_drain(list_tail(&zevent_list)); else zevent_len_cur++; } /* * Post a zevent. The cb will be called when nvl and detector are no longer * needed, i.e.: * - An error happened and a zevent can't be posted. In this case, cb is called * before zfs_zevent_post() returns. * - The event is being drained and freed. */ int zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) { inode_timespec_t tv; int64_t tv_array[2]; uint64_t eid; size_t nvl_size = 0; zevent_t *ev; int error; ASSERT(cb != NULL); gethrestime(&tv); tv_array[0] = tv.tv_sec; tv_array[1] = tv.tv_nsec; error = nvlist_add_int64_array(nvl, FM_EREPORT_TIME, tv_array, 2); if (error) { atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); goto out; } eid = atomic_inc_64_nv(&zevent_eid); error = nvlist_add_uint64(nvl, FM_EREPORT_EID, eid); if (error) { atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); goto out; } error = nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE); if (error) { atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); goto out; } if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) { atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); error = EOVERFLOW; goto out; } ev = zfs_zevent_alloc(); if (ev == NULL) { atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); error = ENOMEM; goto out; } ev->ev_nvl = nvl; ev->ev_detector = detector; ev->ev_cb = cb; ev->ev_eid = eid; mutex_enter(&zevent_lock); zfs_zevent_insert(ev); cv_broadcast(&zevent_cv); mutex_exit(&zevent_lock); out: if (error) cb(nvl, detector); return (error); } void zfs_zevent_track_duplicate(void) { atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64); } static int zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) { *ze = zfsdev_get_state(minor, ZST_ZEVENT); if (*ze == NULL) return (SET_ERROR(EBADF)); return (0); } zfs_file_t * zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze) { zfs_file_t *fp = zfs_file_get(fd); if (fp == NULL) return (NULL); int error = zfsdev_getminor(fp, minorp); if (error == 0) error = zfs_zevent_minor_to_state(*minorp, ze); if (error) { zfs_zevent_fd_rele(fp); fp = NULL; } return (fp); } void zfs_zevent_fd_rele(zfs_file_t *fp) { zfs_file_put(fp); } /* * Get the next zevent in the stream and place a copy in 'event'. This * may fail with ENOMEM if the encoded nvlist size exceeds the passed * 'event_size'. In this case the stream pointer is not advanced and * and 'event_size' is set to the minimum required buffer size. */ int zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, uint64_t *dropped) { zevent_t *ev; size_t size; int error = 0; mutex_enter(&zevent_lock); if (ze->ze_zevent == NULL) { /* New stream start at the beginning/tail */ ev = list_tail(&zevent_list); if (ev == NULL) { error = ENOENT; goto out; } } else { /* * Existing stream continue with the next element and remove * ourselves from the wait queue for the previous element */ ev = list_prev(&zevent_list, ze->ze_zevent); if (ev == NULL) { error = ENOENT; goto out; } } VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0); if (size > *event_size) { *event_size = size; error = ENOMEM; goto out; } if (ze->ze_zevent) list_remove(&ze->ze_zevent->ev_ze_list, ze); ze->ze_zevent = ev; list_insert_head(&ev->ev_ze_list, ze); (void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP); *dropped = ze->ze_dropped; #ifdef _KERNEL /* Include events dropped due to rate limiting */ *dropped += atomic_swap_64(&ratelimit_dropped, 0); #endif ze->ze_dropped = 0; out: mutex_exit(&zevent_lock); return (error); } /* * Wait in an interruptible state for any new events. */ int zfs_zevent_wait(zfs_zevent_t *ze) { int error = EAGAIN; mutex_enter(&zevent_lock); zevent_waiters++; while (error == EAGAIN) { if (zevent_flags & ZEVENT_SHUTDOWN) { error = SET_ERROR(ESHUTDOWN); break; } - error = cv_wait_sig(&zevent_cv, &zevent_lock); - if (signal_pending(current)) { + if (cv_wait_sig(&zevent_cv, &zevent_lock) == 0) { error = SET_ERROR(EINTR); break; } else if (!list_is_empty(&zevent_list)) { error = 0; continue; } else { error = EAGAIN; } } zevent_waiters--; mutex_exit(&zevent_lock); return (error); } /* * The caller may seek to a specific EID by passing that EID. If the EID * is still available in the posted list of events the cursor is positioned * there. Otherwise ENOENT is returned and the cursor is not moved. * * There are two reserved EIDs which may be passed and will never fail. * ZEVENT_SEEK_START positions the cursor at the start of the list, and * ZEVENT_SEEK_END positions the cursor at the end of the list. */ int zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid) { zevent_t *ev; int error = 0; mutex_enter(&zevent_lock); if (eid == ZEVENT_SEEK_START) { if (ze->ze_zevent) list_remove(&ze->ze_zevent->ev_ze_list, ze); ze->ze_zevent = NULL; goto out; } if (eid == ZEVENT_SEEK_END) { if (ze->ze_zevent) list_remove(&ze->ze_zevent->ev_ze_list, ze); ev = list_head(&zevent_list); if (ev) { ze->ze_zevent = ev; list_insert_head(&ev->ev_ze_list, ze); } else { ze->ze_zevent = NULL; } goto out; } for (ev = list_tail(&zevent_list); ev != NULL; ev = list_prev(&zevent_list, ev)) { if (ev->ev_eid == eid) { if (ze->ze_zevent) list_remove(&ze->ze_zevent->ev_ze_list, ze); ze->ze_zevent = ev; list_insert_head(&ev->ev_ze_list, ze); break; } } if (ev == NULL) error = ENOENT; out: mutex_exit(&zevent_lock); return (error); } void zfs_zevent_init(zfs_zevent_t **zep) { zfs_zevent_t *ze; ze = *zep = kmem_zalloc(sizeof (zfs_zevent_t), KM_SLEEP); list_link_init(&ze->ze_node); } void zfs_zevent_destroy(zfs_zevent_t *ze) { mutex_enter(&zevent_lock); if (ze->ze_zevent) list_remove(&ze->ze_zevent->ev_ze_list, ze); mutex_exit(&zevent_lock); kmem_free(ze, sizeof (zfs_zevent_t)); } #endif /* _KERNEL */ /* * Wrappers for FM nvlist allocators */ static void * i_fm_alloc(nv_alloc_t *nva, size_t size) { (void) nva; return (kmem_alloc(size, KM_SLEEP)); } static void i_fm_free(nv_alloc_t *nva, void *buf, size_t size) { (void) nva; kmem_free(buf, size); } static const nv_alloc_ops_t fm_mem_alloc_ops = { .nv_ao_init = NULL, .nv_ao_fini = NULL, .nv_ao_alloc = i_fm_alloc, .nv_ao_free = i_fm_free, .nv_ao_reset = NULL }; /* * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer * to the newly allocated nv_alloc_t structure is returned upon success or NULL * is returned to indicate that the nv_alloc structure could not be created. */ nv_alloc_t * fm_nva_xcreate(char *buf, size_t bufsz) { nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) { kmem_free(nvhdl, sizeof (nv_alloc_t)); return (NULL); } return (nvhdl); } /* * Destroy a previously allocated nv_alloc structure. The fixed buffer * associated with nva must be freed by the caller. */ void fm_nva_xdestroy(nv_alloc_t *nva) { nv_alloc_fini(nva); kmem_free(nva, sizeof (nv_alloc_t)); } /* * Create a new nv list. A pointer to a new nv list structure is returned * upon success or NULL is returned to indicate that the structure could * not be created. The newly created nv list is created and managed by the * operations installed in nva. If nva is NULL, the default FMA nva * operations are installed and used. * * When called from the kernel and nva == NULL, this function must be called * from passive kernel context with no locks held that can prevent a * sleeping memory allocation from occurring. Otherwise, this function may * be called from other kernel contexts as long a valid nva created via * fm_nva_create() is supplied. */ nvlist_t * fm_nvlist_create(nv_alloc_t *nva) { int hdl_alloced = 0; nvlist_t *nvl; nv_alloc_t *nvhdl; if (nva == NULL) { nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) { kmem_free(nvhdl, sizeof (nv_alloc_t)); return (NULL); } hdl_alloced = 1; } else { nvhdl = nva; } if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) { if (hdl_alloced) { nv_alloc_fini(nvhdl); kmem_free(nvhdl, sizeof (nv_alloc_t)); } return (NULL); } return (nvl); } /* * Destroy a previously allocated nvlist structure. flag indicates whether * or not the associated nva structure should be freed (FM_NVA_FREE) or * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows * it to be re-used for future nvlist creation operations. */ void fm_nvlist_destroy(nvlist_t *nvl, int flag) { nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl); nvlist_free(nvl); if (nva != NULL) { if (flag == FM_NVA_FREE) fm_nva_xdestroy(nva); } } int i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) { int nelem, ret = 0; data_type_t type; while (ret == 0 && name != NULL) { type = va_arg(ap, data_type_t); switch (type) { case DATA_TYPE_BYTE: ret = nvlist_add_byte(payload, name, va_arg(ap, uint_t)); break; case DATA_TYPE_BYTE_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_byte_array(payload, name, va_arg(ap, uchar_t *), nelem); break; case DATA_TYPE_BOOLEAN_VALUE: ret = nvlist_add_boolean_value(payload, name, va_arg(ap, boolean_t)); break; case DATA_TYPE_BOOLEAN_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_boolean_array(payload, name, va_arg(ap, boolean_t *), nelem); break; case DATA_TYPE_INT8: ret = nvlist_add_int8(payload, name, va_arg(ap, int)); break; case DATA_TYPE_INT8_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_int8_array(payload, name, va_arg(ap, int8_t *), nelem); break; case DATA_TYPE_UINT8: ret = nvlist_add_uint8(payload, name, va_arg(ap, uint_t)); break; case DATA_TYPE_UINT8_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_uint8_array(payload, name, va_arg(ap, uint8_t *), nelem); break; case DATA_TYPE_INT16: ret = nvlist_add_int16(payload, name, va_arg(ap, int)); break; case DATA_TYPE_INT16_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_int16_array(payload, name, va_arg(ap, int16_t *), nelem); break; case DATA_TYPE_UINT16: ret = nvlist_add_uint16(payload, name, va_arg(ap, uint_t)); break; case DATA_TYPE_UINT16_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_uint16_array(payload, name, va_arg(ap, uint16_t *), nelem); break; case DATA_TYPE_INT32: ret = nvlist_add_int32(payload, name, va_arg(ap, int32_t)); break; case DATA_TYPE_INT32_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_int32_array(payload, name, va_arg(ap, int32_t *), nelem); break; case DATA_TYPE_UINT32: ret = nvlist_add_uint32(payload, name, va_arg(ap, uint32_t)); break; case DATA_TYPE_UINT32_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_uint32_array(payload, name, va_arg(ap, uint32_t *), nelem); break; case DATA_TYPE_INT64: ret = nvlist_add_int64(payload, name, va_arg(ap, int64_t)); break; case DATA_TYPE_INT64_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_int64_array(payload, name, va_arg(ap, int64_t *), nelem); break; case DATA_TYPE_UINT64: ret = nvlist_add_uint64(payload, name, va_arg(ap, uint64_t)); break; case DATA_TYPE_UINT64_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_uint64_array(payload, name, va_arg(ap, uint64_t *), nelem); break; case DATA_TYPE_STRING: ret = nvlist_add_string(payload, name, va_arg(ap, char *)); break; case DATA_TYPE_STRING_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_string_array(payload, name, va_arg(ap, const char **), nelem); break; case DATA_TYPE_NVLIST: ret = nvlist_add_nvlist(payload, name, va_arg(ap, nvlist_t *)); break; case DATA_TYPE_NVLIST_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_nvlist_array(payload, name, va_arg(ap, const nvlist_t **), nelem); break; default: ret = EINVAL; } name = va_arg(ap, char *); } return (ret); } void fm_payload_set(nvlist_t *payload, ...) { int ret; const char *name; va_list ap; va_start(ap, payload); name = va_arg(ap, char *); ret = i_fm_payload_set(payload, name, ap); va_end(ap); if (ret) atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64); } /* * Set-up and validate the members of an ereport event according to: * * Member name Type Value * ==================================================== * class string ereport * version uint8_t 0 * ena uint64_t * detector nvlist_t * ereport-payload nvlist_t * * We don't actually add a 'version' member to the payload. Really, * the version quoted to us by our caller is that of the category 1 * "ereport" event class (and we require FM_EREPORT_VERS0) but * the payload version of the actual leaf class event under construction * may be something else. Callers should supply a version in the varargs, * or (better) we could take two version arguments - one for the * ereport category 1 classification (expect FM_EREPORT_VERS0) and one * for the leaf class. */ void fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class, uint64_t ena, const nvlist_t *detector, ...) { char ereport_class[FM_MAX_CLASS]; const char *name; va_list ap; int ret; if (version != FM_EREPORT_VERS0) { atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); return; } (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s", FM_EREPORT_CLASS, erpt_class); if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) { atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); return; } if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) { atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); } if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR, (nvlist_t *)detector) != 0) { atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); } va_start(ap, detector); name = va_arg(ap, const char *); ret = i_fm_payload_set(ereport, name, ap); va_end(ap); if (ret) atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); } /* * Set-up and validate the members of an hc fmri according to; * * Member name Type Value * =================================================== * version uint8_t 0 * auth nvlist_t * hc-name string * hc-id string * * Note that auth and hc-id are optional members. */ #define HC_MAXPAIRS 20 #define HC_MAXNAMELEN 50 static int fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth) { if (version != FM_HC_SCHEME_VERSION) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return (0); } if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 || nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return (0); } if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, (nvlist_t *)auth) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return (0); } return (1); } void fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth, nvlist_t *snvl, int npairs, ...) { nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); nvlist_t *pairs[HC_MAXPAIRS]; va_list ap; int i; if (!fm_fmri_hc_set_common(fmri, version, auth)) return; npairs = MIN(npairs, HC_MAXPAIRS); va_start(ap, npairs); for (i = 0; i < npairs; i++) { const char *name = va_arg(ap, const char *); uint32_t id = va_arg(ap, uint32_t); char idstr[11]; (void) snprintf(idstr, sizeof (idstr), "%u", id); pairs[i] = fm_nvlist_create(nva); if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } } va_end(ap); if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, (const nvlist_t **)pairs, npairs) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); } for (i = 0; i < npairs; i++) fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); if (snvl != NULL) { if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } } } void fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, nvlist_t *snvl, nvlist_t *bboard, int npairs, ...) { nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); nvlist_t *pairs[HC_MAXPAIRS]; nvlist_t **hcl; uint_t n; int i, j; va_list ap; char *hcname, *hcid; if (!fm_fmri_hc_set_common(fmri, version, auth)) return; /* * copy the bboard nvpairs to the pairs array */ if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } for (i = 0; i < n; i++) { if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME, &hcname) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); return; } pairs[i] = fm_nvlist_create(nva); if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 || nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) { for (j = 0; j <= i; j++) { if (pairs[j] != NULL) fm_nvlist_destroy(pairs[j], FM_NVA_RETAIN); } atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); return; } } /* * create the pairs from passed in pairs */ npairs = MIN(npairs, HC_MAXPAIRS); va_start(ap, npairs); for (i = n; i < npairs + n; i++) { const char *name = va_arg(ap, const char *); uint32_t id = va_arg(ap, uint32_t); char idstr[11]; (void) snprintf(idstr, sizeof (idstr), "%u", id); pairs[i] = fm_nvlist_create(nva); if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { for (j = 0; j <= i; j++) { if (pairs[j] != NULL) fm_nvlist_destroy(pairs[j], FM_NVA_RETAIN); } atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); va_end(ap); return; } } va_end(ap); /* * Create the fmri hc list */ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, (const nvlist_t **)pairs, npairs + n) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } for (i = 0; i < npairs + n; i++) { fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); } if (snvl != NULL) { if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); return; } } } /* * Set-up and validate the members of an dev fmri according to: * * Member name Type Value * ==================================================== * version uint8_t 0 * auth nvlist_t * devpath string * [devid] string * [target-port-l0id] string * * Note that auth and devid are optional members. */ void fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth, const char *devpath, const char *devid, const char *tpl0) { int err = 0; if (version != DEV_SCHEME_VERSION0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version); err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV); if (auth != NULL) { err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY, (nvlist_t *)auth); } err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath); if (devid != NULL) err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid); if (tpl0 != NULL) err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0); if (err) atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); } /* * Set-up and validate the members of an cpu fmri according to: * * Member name Type Value * ==================================================== * version uint8_t 0 * auth nvlist_t * cpuid uint32_t * cpumask uint8_t * serial uint64_t * * Note that auth, cpumask, serial are optional members. * */ void fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth, uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp) { uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64; if (version < CPU_SCHEME_VERSION1) { atomic_inc_64(failedp); return; } if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) { atomic_inc_64(failedp); return; } if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME, FM_FMRI_SCHEME_CPU) != 0) { atomic_inc_64(failedp); return; } if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY, (nvlist_t *)auth) != 0) atomic_inc_64(failedp); if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0) atomic_inc_64(failedp); if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK, *cpu_maskp) != 0) atomic_inc_64(failedp); if (serial_idp == NULL || nvlist_add_string(fmri_cpu, FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0) atomic_inc_64(failedp); } /* * Set-up and validate the members of a mem according to: * * Member name Type Value * ==================================================== * version uint8_t 0 * auth nvlist_t [optional] * unum string * serial string [optional*] * offset uint64_t [optional] * * * serial is required if offset is present */ void fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, const char *unum, const char *serial, uint64_t offset) { if (version != MEM_SCHEME_VERSION0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (!serial && (offset != (uint64_t)-1)) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (auth != NULL) { if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, (nvlist_t *)auth) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } } if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); } if (serial != NULL) { if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID, (const char **)&serial, 1) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri, FM_FMRI_MEM_OFFSET, offset) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } } } void fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid, uint64_t vdev_guid) { if (version != ZFS_SCHEME_VERSION0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); } if (vdev_guid != 0) { if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } } } uint64_t fm_ena_increment(uint64_t ena) { uint64_t new_ena; switch (ENA_FORMAT(ena)) { case FM_ENA_FMT1: new_ena = ena + (1 << ENA_FMT1_GEN_SHFT); break; case FM_ENA_FMT2: new_ena = ena + (1 << ENA_FMT2_GEN_SHFT); break; default: new_ena = 0; } return (new_ena); } uint64_t fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format) { uint64_t ena = 0; switch (format) { case FM_ENA_FMT1: if (timestamp) { ena = (uint64_t)((format & ENA_FORMAT_MASK) | ((cpuid << ENA_FMT1_CPUID_SHFT) & ENA_FMT1_CPUID_MASK) | ((timestamp << ENA_FMT1_TIME_SHFT) & ENA_FMT1_TIME_MASK)); } else { ena = (uint64_t)((format & ENA_FORMAT_MASK) | ((cpuid << ENA_FMT1_CPUID_SHFT) & ENA_FMT1_CPUID_MASK) | ((gethrtime() << ENA_FMT1_TIME_SHFT) & ENA_FMT1_TIME_MASK)); } break; case FM_ENA_FMT2: ena = (uint64_t)((format & ENA_FORMAT_MASK) | ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK)); break; default: break; } return (ena); } uint64_t fm_ena_generate(uint64_t timestamp, uchar_t format) { uint64_t ena; kpreempt_disable(); ena = fm_ena_generate_cpu(timestamp, getcpuid(), format); kpreempt_enable(); return (ena); } uint64_t fm_ena_generation_get(uint64_t ena) { uint64_t gen; switch (ENA_FORMAT(ena)) { case FM_ENA_FMT1: gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT; break; case FM_ENA_FMT2: gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT; break; default: gen = 0; break; } return (gen); } uchar_t fm_ena_format_get(uint64_t ena) { return (ENA_FORMAT(ena)); } uint64_t fm_ena_id_get(uint64_t ena) { uint64_t id; switch (ENA_FORMAT(ena)) { case FM_ENA_FMT1: id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT; break; case FM_ENA_FMT2: id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT; break; default: id = 0; } return (id); } uint64_t fm_ena_time_get(uint64_t ena) { uint64_t time; switch (ENA_FORMAT(ena)) { case FM_ENA_FMT1: time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT; break; case FM_ENA_FMT2: time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT; break; default: time = 0; } return (time); } #ifdef _KERNEL /* * Helper function to increment ereport dropped count. Used by the event * rate limiting code to give feedback to the user about how many events were * rate limited by including them in the 'dropped' count. */ void fm_erpt_dropped_increment(void) { atomic_inc_64(&ratelimit_dropped); } void fm_init(void) { zevent_len_cur = 0; zevent_flags = 0; /* Initialize zevent allocation and generation kstats */ fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED, sizeof (struct erpt_kstat) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (fm_ksp != NULL) { fm_ksp->ks_data = &erpt_kstat_data; kstat_install(fm_ksp); } else { cmn_err(CE_NOTE, "failed to create fm/misc kstat\n"); } mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zevent_list, sizeof (zevent_t), offsetof(zevent_t, ev_node)); cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); zfs_ereport_init(); } void fm_fini(void) { uint_t count; zfs_ereport_fini(); zfs_zevent_drain_all(&count); mutex_enter(&zevent_lock); cv_broadcast(&zevent_cv); zevent_flags |= ZEVENT_SHUTDOWN; while (zevent_waiters > 0) { mutex_exit(&zevent_lock); kpreempt(KPREEMPT_SYNC); mutex_enter(&zevent_lock); } mutex_exit(&zevent_lock); cv_destroy(&zevent_cv); list_destroy(&zevent_list); mutex_destroy(&zevent_lock); if (fm_ksp != NULL) { kstat_delete(fm_ksp); fm_ksp = NULL; } } #endif /* _KERNEL */ ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, UINT, ZMOD_RW, "Max event queue length"); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index f913c3509b61..a1717d4d6038 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1,7885 +1,7883 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Portions Copyright 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, 2021, Klara Inc. * Copyright (c) 2019, Allan Jude */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * We want newer versions of libzfs and libzfs_core to run against * existing zfs kernel modules (i.e. a deferred reboot after an update). * Therefore the ioctl numbers cannot change from release to release. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * zfs_ioc_key_t *nvl_keys * The list of expected/allowable innvl input keys. This list is used * to validate the nvlist input to the ioctl. * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. * * IOCTL Interface Errors * * The following ioctl input errors can be returned: * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include #include #include kmutex_t zfsdev_state_lock; static zfsdev_state_t *zfsdev_state_list; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ uint64_t zfs_max_nvlist_src_size = 0; /* * When logging the output nvlist of an ioctl in the on-disk history, limit * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. * This applies primarily to zfs_ioc_channel_program(). */ static uint64_t zfs_history_output_max = 1024 * 1024; uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. */ #define DATA_TYPE_ANY DATA_TYPE_UNKNOWN typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; const zfs_ioc_key_t *zvec_nvl_keys; size_t zvec_nvl_key_count; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_PERM_PROJECTOBJQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl, (void) cr; if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. */ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs_t; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static const char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, B_TRUE, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); #else return (SET_ERROR(ENOTSUP)); #endif /* HAVE_MLSLABEL */ } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { default: break; case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ (void) innvl; return (zfs_dozonecheck(zc->zc_name, cr)); } static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; dsl_pool_t *dp; dsl_dataset_t *ds; const char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. */ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strlcpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvlist_t *snaps; int error = 0; nvpair_t *pair; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each bookmark in the nvlist. */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl; if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; int error; - if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) + if (secpolicy_sys_config(cr, B_FALSE) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl; return (secpolicy_zinject(cr)); } static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA || zc->zc_objset_type == ZFS_PROP_USEROBJUSED || zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED || zc->zc_objset_type == ZFS_PROP_GROUPQUOTA || zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED || zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) { if (groupmember(zc->zc_guid, cr)) return (0); } /* else is for project quota/used */ } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair; nvlist_t *holds; int error; holds = fnvlist_lookup_nvlist(innvl, "holds"); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. */ int error; - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_DIFF, cr)) == 0) + if (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_DIFF, cr) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (innvl != NULL) { if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); } return (error); } static int zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_LOAD_KEY, cr)); } static int zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CHANGE_KEY, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = vmem_alloc(size, KM_SLEEP); - if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, - iflag)) != 0) { + if (ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { vmem_free(packed, size); return (error); } vmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { error = SET_ERROR(ENOMEM); } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) { int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); /* bump s_active only when non-zero to prevent umount race */ error = zfs_vfs_ref(zfvp); mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, zfvp); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all inode ops from running. */ static int zfsvfs_hold(const char *name, const void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, B_FALSE, zfvp); if (error == 0) { if (writer) ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag); else ZFS_TEARDOWN_ENTER_READ(*zfvp, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ ZFS_TEARDOWN_EXIT(*zfvp, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, const void *tag) { ZFS_TEARDOWN_EXIT(zfsvfs, tag); if (zfs_vfs_held(zfsvfs)) { zfs_vfs_rele(zfsvfs); } else { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; dsl_crypto_params_t *dcp = NULL; const char *spa_name = zc->zc_name; boolean_t unload_wkey = B_TRUE; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, &hidden_args); error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, rootprops, hidden_args, &dcp); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) spa_name = tname; } error = spa_create(zc->zc_name, config, props, zplprops, dcp); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) { (void) spa_destroy(spa_name); unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */ } pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); dsl_crypto_params_free(dcp, unload_wkey && !!error); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = vmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); vmem_free(hist_buf, size); return (error); } static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_guid guid of vdev to remove * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); if (zc->zc_cookie != 0) { error = spa_vdev_remove_cancel(spa); } else { error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL && zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_REMOVED: error = vdev_remove_wanted(spa, zc->zc_guid); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config; int replacing = zc->zc_cookie; int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, rebuild); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; const char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; const char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (!zc->zc_simple && zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading without owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) { nvlist_free(nv); return (error); } VERIFY0(error); } if (error == 0) error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. */ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os))) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. */ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. */ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_src iteration range nvlist * zc_nvlist_src_size size of iteration range nvlist * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { int error; objset_t *os, *ossnap; dsl_dataset_t *ds; uint64_t min_txg = 0, max_txg = 0; if (zc->zc_nvlist_src_size != 0) { nvlist_t *props = NULL; error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props); if (error != 0) return (error); (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, &min_txg); (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, &max_txg); nvlist_free(props); } error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? SET_ERROR(ESRCH) : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } while (error == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { error = SET_ERROR(EINTR); break; } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == ENOENT) { error = SET_ERROR(ESRCH); break; } else if (error != 0) { break; } error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj, FTAG, &ds); if (error != 0) break; if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) || (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) { dsl_dataset_rele(ds, FTAG); /* undo snapshot name append */ *(strchr(zc->zc_name, '@') + 1) = '\0'; /* skip snapshot */ continue; } if (zc->zc_simple) { dsl_dataset_fast_stat(ds, &zc->zc_objset_stats); dsl_dataset_rele(ds, FTAG); break; } if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } dsl_dataset_rele(ds, FTAG); break; } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *dash, *domain; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; int err = -1; if (prop == ZPROP_USERPROP) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* all special properties are numeric except for keylocation */ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { strval = fnvpair_value_string(pair); } else { intval = fnvpair_value_uint64(pair); } switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_KEYLOCATION: err = dsl_crypto_can_set_keylocation(dsname, strval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_COMPRESSION: err = dsl_dataset_set_compression(dsname, source, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_SNAPDEV: err = zvol_set_snapdev(dsname, source, intval); break; case ZFS_PROP_VOLMODE: err = zvol_set_volmode(dsname, source, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dsname, sizeof (zc->zc_name)); (void) zfs_ioc_userspace_upgrade(zc); (void) zfs_ioc_id_quota_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_SETUID: case ZFS_PROP_READONLY: case ZFS_PROP_XATTR: case ZFS_PROP_NBMAND: return (B_TRUE); default: return (B_FALSE); } } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; int err; uint64_t intval; const char *strval; boolean_t should_update_mount_cache = B_FALSE; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && source == ZPROP_SRC_INHERITED) { /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); } else if (err == 0 && prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(ZFS_ERR_BADPROP); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { if (source == ZPROP_SRC_INHERITED) err = -1; /* does not need special handling */ else err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } if (zfs_is_namespace_prop(prop)) should_update_mount_cache = B_TRUE; } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (nvlist_empty(genericnvl)) goto out; /* * Try to set them all in one batch. */ err = dsl_props_set(dsname, source, genericnvl); if (err == 0) goto out; /* * If batching fails, we still want to set as many properties as we * can, so try setting them individually. */ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); - err = 0; propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { err = dsl_prop_inherit(dsname, propname, source); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } out: if (should_update_mount_cache) zfs_ioctl_update_mount_cache(dsname); nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (SET_ERROR(E2BIG)); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; if (!received) { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ dummy = fnvlist_alloc(); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: err = SET_ERROR(EINVAL); goto errout; } pair = nvlist_next_nvpair(dummy, NULL); if (pair == NULL) { err = SET_ERROR(EINVAL); } else { err = zfs_prop_set_special(zc->zc_name, source, pair); if (err == -1) /* property is not "special", needs handling */ err = dsl_prop_inherit(zc->zc_name, zc->zc_value, source); } errout: nvlist_free(dummy); return (err); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. */ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * innvl: { * "vdevprops_set_vdev" -> guid * "vdevprops_set_props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_vdev_set_props[] = { {ZPOOL_VDEV_PROPS_SET_VDEV, DATA_TYPE_UINT64, 0}, {ZPOOL_VDEV_PROPS_SET_PROPS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; vdev_t *vd; uint64_t vdev_guid; /* Early validation */ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (outnvl == NULL) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); ASSERT(spa_writeable(spa)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_set(vd, innvl, outnvl); spa_close(spa, FTAG); return (error); } /* * innvl: { * "vdevprops_get_vdev" -> guid * (optional) "vdevprops_get_props" -> { propname -> propid } * } * * outnvl: propname -> value */ static const zfs_ioc_key_t zfs_keys_vdev_get_props[] = { {ZPOOL_VDEV_PROPS_GET_VDEV, DATA_TYPE_UINT64, 0}, {ZPOOL_VDEV_PROPS_GET_PROPS, DATA_TYPE_NVLIST, ZK_OPTIONAL} }; static int zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; vdev_t *vd; uint64_t vdev_guid; /* Early validation */ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (outnvl == NULL) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_get(vd, innvl, outnvl); spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ - if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { + if (zfs_deleg_verify_nvlist(fsaclnv) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; int error; ASSERT(zplprops != NULL); /* parent dataset must be a filesystem */ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; spa_t *spa; uint64_t spa_vers; int error; zfs_get_parent(dataset, parentname, sizeof (parentname)); if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. */ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_create[] = { {"type", DATA_TYPE_INT32, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; dsl_crypto_params_t *dcp = NULL; type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize(fsname, volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, hidden_args, &dcp); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); nvlist_free(zct.zct_zplprops); dsl_crypto_params_free(dcp, !!error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) { spa_t *spa; int error2; /* * Volumes will return EBUSY and cannot be destroyed * until all asynchronous minor handling (e.g. from * setting the volmode property) has completed. Wait for * the spa_zvol_taskq to drain then retry. */ error2 = dsl_destroy_head(fsname); while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { error2 = spa_open(fsname, &spa, FTAG); if (error2 == 0) { taskq_wait(spa->spa_zvol_taskq); spa_close(spa, FTAG); } error2 = dsl_destroy_head(fsname); } } } return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outputs: * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_clone[] = { {"origin", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; const char *origin_name; origin_name = fnvlist_lookup_string(innvl, "origin"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. */ (void) fsname, (void) innvl, (void) outnvl; return (0); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_snapshot[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if ((error = zfs_check_userprops(props)) != 0) return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* * Check for permission to set the properties on the fs. */ if (!nvlist_empty(props)) { *cp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_USERPROP, CRED()); *cp = '@'; if (error != 0) return (error); } /* This must be the only snap of this fs. */ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { (void) unused, (void) outnvl; const char *message; char *poolname; spa_t *spa; int error; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); if (poolname == NULL) return (SET_ERROR(EINVAL)); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); kmem_strfree(poolname); if (error != 0) return (error); message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * This ioctl is used to set the bootenv configuration on the current * pool. This configuration is stored in the second padding area of the label, * and it is used by the bootloader(s) to store the bootloader and/or system * specific data. * The data is stored as nvlist data stream, and is protected by * an embedded checksum. * The version can have two possible values: * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. * VB_NVLIST: nvlist with arbitrary pairs. */ static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { {"version", DATA_TYPE_UINT64, 0}, {"", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, }; static int zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int error; spa_t *spa; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { /* no nvl keys */ }; static int zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { if (strchr(snapname, '@') == NULL) return; (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } static int zfs_unmount_snap_cb(const char *snapname, void *arg) { (void) arg; zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; spa_t *spa; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); zfs_unmount_snap(nvpair_name(pair)); if (spa_open(name, &spa, FTAG) == 0) { zvol_remove_minors(spa, name, B_TRUE); spa_close(spa, FTAG); } } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. The bookmark names are of the form #. * All bookmarks and snapshots must be in the same pool. * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail. * * innvl: { * new_bookmark1 -> existing_snapshot, * new_bookmark2 -> existing_bookmark, * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) poolname; return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, }; static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl is not used. * * outnvl: { * property 1, property 2, ... * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; bmname = strchr(bookmark, '#'); if (bmname == NULL) return (SET_ERROR(EINVAL)); bmname++; (void) strlcpy(fsname, bookmark, sizeof (fsname)); *(strchr(fsname, '#')) = '\0'; return (dsl_get_bookmark_props(fsname, bmname, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, }; static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static const zfs_ioc_key_t zfs_keys_channel_program[] = { {"program", DATA_TYPE_STRING, 0}, {"arg", DATA_TYPE_ANY, 0}, {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (SET_ERROR(EINVAL)); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (SET_ERROR(EINVAL)); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; return (spa_checkpoint_discard(poolname)); } /* * inputs: * zc_name name of dataset to destroy * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; int err; err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); if (ost == DMU_OST_ZFS) zfs_unmount_snap(zc->zc_name); if (strchr(zc->zc_name, '@')) { err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); } else { err = dsl_destroy_head(zc->zc_name); if (err == EEXIST) { /* * It is possible that the given DS may have * hidden child (%recv) datasets - "leftovers" * resulting from the previously interrupted * 'zfs receive'. * * 6 extra bytes for /%recv */ char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6]; if (snprintf(namebuf, sizeof (namebuf), "%s/%s", zc->zc_name, recv_clone_name) >= sizeof (namebuf)) return (SET_ERROR(EINVAL)); /* * Try to remove the hidden child (%recv) and after * that try to remove the target dataset. * If the hidden child (%recv) does not exist * the original error (EEXIST) will be returned */ err = dsl_destroy_head(namebuf); if (err == 0) err = dsl_destroy_head(zc->zc_name); else if (err == ENOENT) err = SET_ERROR(EEXIST); } } return (err); } /* * innvl: { * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64) * "initialize_vdevs": { -> guids to initialize (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * } * * outnvl: { * "initialize_vdevs": { -> initialization errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. */ static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, &cmd_type) != 0) { return (SET_ERROR(EINVAL)); } if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || cmd_type == POOL_INITIALIZE_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, &vdev_guids) != 0) { return (SET_ERROR(EINVAL)); } for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) { fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, vdev_errlist); } fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } /* * innvl: { * "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64) * "trim_vdevs": { -> guids to TRIM (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * "trim_rate" -> Target TRIM rate in bytes/sec. * "trim_secure" -> Set to request a secure TRIM. * } * * outnvl: { * "trim_vdevs": { -> TRIM errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. */ static const zfs_ioc_key_t zfs_keys_pool_trim[] = { {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0}, {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL}, {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0) return (SET_ERROR(EINVAL)); if (!(cmd_type == POOL_TRIM_CANCEL || cmd_type == POOL_TRIM_START || cmd_type == POOL_TRIM_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0) return (SET_ERROR(EINVAL)); for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } /* Optional, defaults to maximum rate when not provided */ uint64_t rate; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0) rate = 0; /* Optional, defaults to standard TRIM when not provided */ boolean_t secure; if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE, &secure) != 0) { secure = B_FALSE; } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type, rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist); fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a tag is provided, it identifies a particular instance of an activity to * wait for. Currently, this is only valid for use with 'initialize', because * that is the only activity for which there can be multiple instances running * concurrently. In the case of 'initialize', the tag corresponds to the guid of * the vdev on which to wait. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * (optional) "wait_tag" -> uint64_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_pool_wait[] = { {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; uint64_t tag; boolean_t waited; int error; if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) error = spa_wait_tag(name, activity, tag, &waited); else error = spa_wait(name, activity, &waited); if (error == 0) fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); return (error); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_fs_wait[] = { {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; boolean_t waited = B_FALSE; int error; dsl_pool_t *dp; dsl_dir_t *dd; dsl_dataset_t *ds; if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) return (SET_ERROR(EINVAL)); if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) return (SET_ERROR(EINVAL)); if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) return (error); if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { dsl_pool_rele(dp, FTAG); return (error); } dd = ds->ds_dir; mutex_enter(&dd->dd_activity_lock); dd->dd_activity_waiters++; /* * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t * aren't evicted while we're waiting. Normally this is prevented by * holding the pool, but we can't do that while we're waiting since * that would prevent TXGs from syncing out. Some of the functionality * of long-holds (e.g. preventing deletion) is unnecessary for this * case, since we would cancel the waiters before proceeding with a * deletion. An alternative mechanism for keeping the dataset around * could be developed but this is simpler. */ dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, FTAG); error = dsl_dir_wait(dd, ds, activity, &waited); dsl_dataset_long_rele(ds, FTAG); dd->dd_activity_waiters--; if (dd->dd_activity_waiters == 0) cv_signal(&dd->dd_activity_cv); mutex_exit(&dd->dd_activity_lock); dsl_dataset_rele(ds, FTAG); if (error == 0) fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); return (error); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ static const zfs_ioc_key_t zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(fsname)) != NULL) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char *fullname; fullname = kmem_asprintf("%s@%s", fsname, snapname); zfs_unmount_snap(fullname); kmem_strfree(fullname); return (0); } /* * * snapname is the snapshot to redact. * innvl: { * "bookname" -> (string) * shortname of the redaction bookmark to generate * "snapnv" -> (nvlist, values ignored) * snapshots to redact snapname with respect to * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; nvlist_t *redactnvl = NULL; char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); if (fnvlist_num_pairs(redactnvl) == 0) return (SET_ERROR(ENXIO)); if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) return (SET_ERROR(EINVAL)); return (dmu_redact_snap(snapname, redactnvl, redactbook)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; boolean_t nounmount = !!(zc->zc_cookie & 2); char *at; int err; /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (ost == DMU_OST_ZFS && !nounmount) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval, compval; int err; if (prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; const char *uiq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA]; const char *giq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA]; const char *pq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA]; const char *piq_prefix = zfs_userquota_prop_prefixes[\ ZFS_PROP_PROJECTOBJQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, uiq_prefix, strlen(uiq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USEROBJQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else if (strncmp(propname, giq_prefix, strlen(giq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPOBJQUOTA; } else if (strncmp(propname, pq_prefix, strlen(pq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTQUOTA; } else if (strncmp(propname, piq_prefix, strlen(piq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA; } else { /* {USER|GROUP|PROJECT}USED are read-only */ return (SET_ERROR(EINVAL)); } if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. */ if (nvpair_value_uint64(pair, &intval) == 0) { compval = ZIO_COMPRESS_ALGO(intval); if (compval >= ZIO_COMPRESS_GZIP_1 && compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } if (compval == ZIO_COMPRESS_ZSTD) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_DNODESIZE: /* Dnode sizes above 512 need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval != ZFS_DNSIZE_LEGACY) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SPECIAL_SMALL_BLOCKS: /* * This property could require the allocation classes * feature to be active for setting, however we allow * it so that tests of settable properties succeed. * The CLI will issue a warning in this case. */ break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; int err; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { /* check prop value is enabled in features */ feature = zio_checksum_to_feature( intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; } default: break; } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strlcpy(zc->zc_value, nvpair_name(pair), sizeof (zc->zc_value)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, /* * Setting ZFS_PROP_SHARESMB requires the objset type to be * known, which is not possible prior to receipt of raw sends. */ ZFS_PROP_SHARESMB, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; if (poolname != NULL) kmem_strfree(poolname); } #ifdef ZFS_DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * nvlist 'errors' is always allocated. It will contain descriptions of * encountered errors, if any. It's the callers responsibility to free. */ static int zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, boolean_t heal, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; int props_error = 0; offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; boolean_t tofs_was_redacted; zfs_file_t *input_fp; *read_bytes = 0; *errflags = 0; *errors = fnvlist_alloc(); off = 0; if ((input_fp = zfs_file_get(input_fd)) == NULL) return (SET_ERROR(EBADF)); noff = off = zfs_file_off(input_fp); error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) goto out; tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (recvprops != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, * so stash away the existing ones. */ if (dsl_prop_get_received(tofs, &origrecvd) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. */ if (!first_recvd_props) props_reduce(recvprops, origrecvd); if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0) (void) nvlist_merge(*errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origrecvd, first_recvd_props ? NULL : recvprops) != 0) *errflags |= ZPROP_ERR_NOCLEAR; } else { *errflags |= ZPROP_ERR_NOCLEAR; } } /* * Stash away existing properties so we can restore them on error unless * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which * case "origrecvd" will take care of that. */ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) { objset_t *os; if (dmu_objset_hold(tofs, FTAG, &os) == 0) { if (dsl_prop_get_all(os, &origprops) != 0) { *errflags |= ZPROP_ERR_NOCLEAR; } dmu_objset_rele(os, FTAG); } else { *errflags |= ZPROP_ERR_NOCLEAR; } } if (recvprops != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { recv_delayprops = extract_delay_props(recvprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recvprops, *errors); } } if (localprops != NULL) { nvlist_t *oprops = fnvlist_alloc(); nvlist_t *xprops = fnvlist_alloc(); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) { /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); if (prop != ZPROP_USERPROP) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) continue; fnvlist_add_boolean(xprops, name); } else { /* -o property=value */ fnvlist_add_nvpair(oprops, nvp); } } local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); nvlist_free(oprops); nvlist_free(xprops); } error = dmu_recv_stream(&drc, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; zvol_state_handle_t *zv = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( begin_record->drr_u.drr_begin. drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); /* * If the dataset was not redacted, but we received a * redacted stream onto it, we need to unmount the * dataset. Otherwise, resume the filesystem. */ if (error == 0 && !drc.drc_newfs && stream_is_redacted && !tofs_was_redacted) { error = zfs_end_fs(zfsvfs, ds); } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(tofs)) != NULL) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. */ if (recv_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recv_delayprops, *errors); } if (local_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } if (inherited_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inherited_delayprops, *errors); } } /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ if (recv_delayprops != NULL) { ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0); nvlist_free(recv_delayprops); } if (local_delayprops != NULL) { ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } if (inherited_delayprops != NULL) { ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); nvlist_free(inherited_delayprops); } *read_bytes = off - noff; #ifdef ZFS_DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif /* * On error, restore the original props. */ if (error != 0 && recvprops != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, recvprops, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ *errflags |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origrecvd == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explicitly if we're restoring local properties cleared in the * first new-style receive. */ if (origrecvd != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origrecvd, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ *errflags |= ZPROP_ERR_NORESTORE; } } if (error != 0 && localprops != NULL && !drc.drc_newfs && !first_recvd_props) { nvlist_t *setprops; nvlist_t *inheritprops; nvpair_t *nvp; if (origprops == NULL) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; goto out; } /* Restore original props */ setprops = fnvlist_alloc(); inheritprops = fnvlist_alloc(); nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); const char *source; nvlist_t *attrs; if (!nvlist_exists(origprops, name)) { /* * Property was not present or was explicitly * inherited before the receive, restore this. */ fnvlist_add_boolean(inheritprops, name); continue; } attrs = fnvlist_lookup_nvlist(origprops, name); source = fnvlist_lookup_string(attrs, ZPROP_SOURCE); /* Skip received properties */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) continue; if (strcmp(source, tofs) == 0) { /* Property was locally set */ fnvlist_add_nvlist(setprops, name, attrs); } else { /* Property was implicitly inherited */ fnvlist_add_boolean(inheritprops, name); } } if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; nvlist_free(setprops); nvlist_free(inheritprops); } out: zfs_file_put(input_fp); nvlist_free(origrecvd); nvlist_free(origprops); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of containing filesystem (unused) * zc_nvlist_src{_size} nvlist of properties to apply * zc_nvlist_conf{_size} nvlist of properties to exclude * (DATA_TYPE_BOOLEAN) and override (everything else) * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * * outputs: * zc_cookie number of bytes read * zc_obj zprop_errflags_t * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_recv(zfs_cmd_t *zc) { dmu_replay_record_t begin_record; nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &recvdprops)) != 0) return (error); if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &localprops)) != 0) return (error); if (zc->zc_string[0]) origin = zc->zc_string; begin_record.drr_type = DRR_BEGIN; begin_record.drr_payloadlen = 0; begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); nvlist_free(recvdprops); nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && errors != NULL && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. */ error = SET_ERROR(EINVAL); } nvlist_free(errors); return (error); } /* * innvl: { * "snapname" -> full name of the snapshot to create * (optional) "props" -> received properties to set (nvlist) * (optional) "localprops" -> override and exclude properties (nvlist) * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "heal" -> use send stream to heal data corruption * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused * (optional) "hidden_args" -> { "wkeydata" -> value } * } * * outnvl: { * "read_bytes" -> number of bytes read * "error_flags" -> zprop_errflags_t * "errors" -> error for each unapplied received property (nvlist) * } */ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"snapname", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"origin", DATA_TYPE_STRING, ZK_OPTIONAL}, {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { dmu_replay_record_t *begin_record; uint_t begin_record_size; nvlist_t *errors = NULL; nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; char *snapname; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; boolean_t heal; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; int input_fd = -1; int error; snapname = fnvlist_lookup_string(innvl, "snapname"); if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || strchr(snapname, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; error = nvlist_lookup_string(innvl, "origin", &origin); if (error && error != ENOENT) return (error); error = nvlist_lookup_byte_array(innvl, "begin_record", (uchar_t **)&begin_record, &begin_record_size); if (error != 0 || begin_record_size != sizeof (*begin_record)) return (SET_ERROR(EINVAL)); input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); heal = nvlist_exists(innvl, "heal"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) return (error); error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, hidden_args, force, heal, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); return (error); } typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; int dbi_len; int dbi_err; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; zfs_file_t *fp; caddr_t buf; fp = dbi->dbi_fp; buf = dbi->dbi_buf; dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { dump_bytes_io_t dbi; dbi.dbi_fp = arg; dbi.dbi_buf = buf; dbi.dbi_len = len; #if defined(HAVE_LARGE_STACKS) dump_bytes_cb(&dbi); #else /* * The vn_rdwr() call is performed in a taskq to ensure that there is * always enough stack space to write safely to the target filesystem. * The ZIO_TYPE_FREE threads are used because there can be a lot of * them and they are used in vdev_file.c for a similar purpose. */ spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); #endif /* HAVE_LARGE_STACKS */ return (dbi.dbi_err); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set * * NOTE: This is no longer the preferred interface, any new functionality * should be added to zfs_ioc_send_new() instead. */ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); boolean_t rawok = (zc->zc_flags & 0x8); boolean_t savedok = (zc->zc_flags & 0x10); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate_fast(tosnap, fromsnap, NULL, compressok || rawok, savedok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { zfs_file_t *fp; dmu_send_outparams_t out = {0}; if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); zfs_file_put(fp); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far * zc_objset_type logical size of data traversed by send thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dss_outfd == zc->zc_cookie && zfs_proc_is_caller(dsp->dss_proc)) break; } if (dsp != NULL) { zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, 0, 0); /* This is the closest thing we have to atomic_read_64. */ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); } else { error = SET_ERROR(ENOENT); } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &zc->zc_nvlist_dst_size); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); /* * If multihost is enabled, resuming I/O is unsafe as another * host may have imported the pool. */ if (spa_multihost(spa) && spa_suspended(spa)) return (SET_ERROR(EINVAL)); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { error = SET_ERROR(ENODEV); (void) spa_vdev_state_exit(spa, NULL, error); spa_close(spa, FTAG); return (error); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, spa_suspended(spa) ? NULL : spa->spa_root_vdev, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } /* * Reopen all the vdevs associated with the pool. * * innvl: { * "scrub_restart" -> when true and scrub is running, allow to restart * scrub as the side effect of the reopen (boolean). * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; if (innvl) { error = nvlist_lookup_boolean_value(innvl, "scrub_restart", &rc); if (error == 0) scrub_restart = rc; } error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If the scrub_restart flag is B_FALSE and a scrub is already * in progress then set spa_scrub_reopen flag to B_TRUE so that * we don't restart the scrub as a side effect of the reopen. * Otherwise, let vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = (!scrub_restart && dsl_scan_scrubbing(spa->spa_dsl_pool)); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group|project}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. "S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = vmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } vmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, B_TRUE, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) { mutex_enter(&zfsvfs->z_os->os_upgrade_lock); if (zfsvfs->z_os->os_upgrade_id == 0) { /* clear potential error code and retry */ zfsvfs->z_os->os_upgrade_status = 0; mutex_exit(&zfsvfs->z_os->os_upgrade_lock); dsl_pool_config_enter( dmu_objset_pool(zfsvfs->z_os), FTAG); dmu_objset_userspace_upgrade(zfsvfs->z_os); dsl_pool_config_exit( dmu_objset_pool(zfsvfs->z_os), FTAG); } else { mutex_exit(&zfsvfs->z_os->os_upgrade_lock); } taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq, zfsvfs->z_os->os_upgrade_id); error = zfsvfs->z_os->os_upgrade_status; } zfs_vfs_rele(zfsvfs); } else { objset_t *os; /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_userspace_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); if (dmu_objset_userobjspace_upgradable(os) || dmu_objset_projectquota_upgradable(os)) { mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_id_quota_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; } else { dsl_pool_rele(dmu_objset_pool(os), FTAG); } dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } static int zfs_ioc_share(zfs_cmd_t *zc) { return (SET_ERROR(ENOSYS)); } /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; minor_t minor; zfs_file_t *fp = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (fp == NULL) return (SET_ERROR(EBADF)); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); int error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strlcpy(zc->zc_value, snap_name, sizeof (zc->zc_value)); kmem_strfree(snap_name); kmem_strfree(hold_name); zfs_onexit_fd_rele(fp); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { zfs_file_t *fp; offset_t off; int error; if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); zfs_file_put(fp); return (error); } static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { return (SET_ERROR(ENOTSUP)); } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_hold[] = { {"holds", DATA_TYPE_NVLIST, 0}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { (void) pool; nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; zfs_file_t *fp = NULL; holds = fnvlist_lookup_nvlist(args, "holds"); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { fp = zfs_onexit_fd_hold(cleanup_fd, &minor); if (fp == NULL) return (SET_ERROR(EBADF)); } error = dsl_dataset_user_hold(holds, minor, errlist); if (fp != NULL) { ASSERT3U(minor, !=, 0); zfs_onexit_fd_rele(fp); } return (SET_ERROR(error)); } /* * innvl is not used. * * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { (void) args; return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_release[] = { {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { (void) pool; return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_guid flags (ZEVENT_NONBLOCK) * zc_cleanup_fd zevent file descriptor * * outputs: * zc_nvlist_dst next nvlist event * zc_cookie dropped events since last get */ static int zfs_ioc_events_next(zfs_cmd_t *zc) { zfs_zevent_t *ze; nvlist_t *event = NULL; minor_t minor; uint64_t dropped = 0; int error; zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (fp == NULL) return (SET_ERROR(EBADF)); do { error = zfs_zevent_next(ze, &event, &zc->zc_nvlist_dst_size, &dropped); if (event != NULL) { zc->zc_cookie = dropped; error = put_nvlist(zc, event); nvlist_free(event); } if (zc->zc_guid & ZEVENT_NONBLOCK) break; if ((error == 0) || (error != ENOENT)) break; error = zfs_zevent_wait(ze); if (error != 0) break; } while (1); zfs_zevent_fd_rele(fp); return (error); } /* * outputs: * zc_cookie cleared events count */ static int zfs_ioc_events_clear(zfs_cmd_t *zc) { uint_t count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; return (0); } /* * inputs: * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END * zc_cleanup zevent file descriptor */ static int zfs_ioc_events_seek(zfs_cmd_t *zc) { zfs_zevent_t *ze; minor_t minor; int error; zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (fp == NULL) return (SET_ERROR(EBADF)); error = zfs_zevent_seek(ze, zc->zc_guid); zfs_zevent_fd_rele(fp); return (error); } /* * inputs: * zc_name name of later filesystem or snapshot * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (strchr(zc->zc_value, '#') != NULL) { zfs_bookmark_phys_t bmp; error = dsl_bookmark_lookup(dp, zc->zc_value, new, &bmp); if (error == 0) { error = dsl_dataset_space_written_bookmark(&bmp, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); } } else { dsl_dataset_t *old; error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error == 0) { error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); } } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static const zfs_ioc_key_t zfs_keys_space_snaps[] = { {"firstsnap", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "savedok" -> (value ignored) * presence indicates we should send a partially received snapshot * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "redactbook" -> (string) * if present, use this bookmark's redaction list to generate a redacted * send stream * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"fd", DATA_TYPE_INT32, 0}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int error; offset_t off; char *fromname = NULL; int fd; zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); if ((fp = zfs_file_get(fd)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); dmu_send_outparams_t out = {0}; out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); zfs_file_put(fp); return (error); } static int send_space_sum(objset_t *os, void *buf, int len, void *arg) { (void) os, (void) buf; uint64_t *size = arg; *size += len; return (0); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "fd" -> file descriptor to use as a cookie for progress * tracking (int32) * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; char *fromname = NULL; char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t space = 0; boolean_t full_estimate = B_FALSE; uint64_t resumeobj = 0; uint64_t resumeoff = 0; uint64_t resume_bytes = 0; int32_t fd = -1; zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } (void) nvlist_lookup_int32(innvl, "fd", &fd); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", &redactlist_book) == 0); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); if (altbook) { full_estimate = B_TRUE; } else if (from) { if (strchr(fromname, '#')) { error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); /* * dsl_bookmark_lookup() will fail with EXDEV if * the from-bookmark and tosnap are at the same txg. * However, it's valid to do a send (and therefore, * a send estimate) from and to the same time point, * if the bookmark is redacted (the incremental send * can change what's redacted on the target). In * this case, dsl_bookmark_lookup() fills in zbm * but returns EXDEV. Ignore this error. */ if (error == EXDEV && zbm.zbm_redaction_obj != 0 && zbm.zbm_guid == dsl_dataset_phys(tosnap)->ds_guid) error = 0; if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & ZBM_FLAG_HAS_FBN)) { full_estimate = B_TRUE; } } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { full_estimate = B_TRUE; dsl_dataset_rele(fromsnap, FTAG); } } else { /* * from is not properly formatted as a snapshot or * bookmark */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } } if (full_estimate) { dmu_send_outparams_t out = {0}; offset_t off = 0; out.dso_outfunc = send_space_sum; out.dso_arg = &space; out.dso_dryrun = B_TRUE; /* * We have to release these holds so dmu_send can take them. It * will do all the error checking we need. */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactlist_book, fd, &off, &out); } else { error = dmu_send_estimate_fast(tosnap, fromsnap, (from && strchr(fromname, '#') != NULL ? &zbm : NULL), compressok || rawok, savedok, &space); space -= resume_bytes; if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); return (error); } /* * Sync the currently open TXG to disk for the specified pool. * This is somewhat similar to 'zfs_sync()'. * For cases that do not result in error this ioctl will wait for * the currently open TXG to commit before returning back to the caller. * * innvl: { * "force" -> when true, force uberblock update even if there is no dirty data. * In addition this will cause the vdev configuration to be written * out including updating the zpool cache file. (boolean_t) * } * * onvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { (void) onvl; int err; boolean_t rc, force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); if (innvl) { err = nvlist_lookup_boolean_value(innvl, "force", &rc); if (err == 0) force = rc; } if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } txg_wait_synced(spa_get_dsl(spa), 0); spa_close(spa, FTAG); return (0); } /* * Load a user's wrapping key into the kernel. * innvl: { * "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * (optional) "noop" -> (value ignored) * presence indicated key should only be verified, not loaded * } */ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"hidden_args", DATA_TYPE_NVLIST, 0}, {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; boolean_t noop = nvlist_exists(innvl, "noop"); if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = SET_ERROR(EINVAL); goto error; } hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS); ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_load_wkey(dsname, dcp, noop); if (ret != 0) goto error; dsl_crypto_params_free(dcp, noop); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } /* * Unload a user's wrapping key from the kernel. * Both innvl and outnvl are unused. */ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto out; } ret = spa_keystore_unload_wkey(dsname); if (ret != 0) goto out; out: return (ret); } /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. * * innvl: { * "hidden_args" (optional) -> { "wkeydata" -> value } * raw uint8_t array of new encryption wrapping key data (32 bytes) * "props" (optional) -> { prop -> value } * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; nvlist_t *args = NULL, *hidden_args = NULL; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto error; } (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); (void) nvlist_lookup_nvlist(innvl, "props", &args); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_change_key(dsname, dcp); if (ret != 0) goto error; dsl_crypto_params_free(dcp, B_FALSE); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of this file for details on * each argument to this function. */ void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; vec->zvec_nvl_keys = nvl_keys; vec->zvec_nvl_key_count = num_keys; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space)); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, ARRAY_SIZE(zfs_keys_get_bookmark_props)); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_bookmarks, ARRAY_SIZE(zfs_keys_destroy_bookmarks)); zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, zfs_ioc_recv_new, zfs_secpolicy_recv, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, zfs_ioc_load_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key)); zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, zfs_ioc_unload_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key)); zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, zfs_ioc_change_key, zfs_secpolicy_change_key, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_change_key, ARRAY_SIZE(zfs_keys_change_key)); zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program)); zfs_ioctl_register("redact", ZFS_IOC_REDACT, zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM, zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); zfs_ioctl_register("wait", ZFS_IOC_WAIT, zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); zfs_ioctl_register("zpool_vdev_get_props", ZFS_IOC_VDEV_GET_PROPS, zfs_ioc_vdev_get_props, zfs_secpolicy_read, POOL_NAME, POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_vdev_get_props, ARRAY_SIZE(zfs_keys_vdev_get_props)); zfs_ioctl_register("zpool_vdev_set_props", ZFS_IOC_VDEV_SET_PROPS, zfs_ioc_vdev_set_props, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props)); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_init_os(); } /* * Verify that for non-legacy ioctls the input nvlist * pairs match against the expected input. * * Possible errors are: * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair */ static int zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) { const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; boolean_t required_keys_found = B_FALSE; /* * examine each input pair */ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); data_type_t type = nvpair_type(pair); boolean_t identified = B_FALSE; /* * check pair against the documented names and type */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { /* if not a wild card name, check for an exact match */ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && strcmp(nvl_keys[k].zkey_name, name) != 0) continue; identified = B_TRUE; if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && nvl_keys[k].zkey_type != type) { return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); } if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; required_keys_found = B_TRUE; break; } /* allow an 'optional' key, everything else is invalid */ if (!identified && (strcmp(name, "optional") != 0 || type != DATA_TYPE_NVLIST)) { return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); } } /* verify that all required keys were found */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { /* at least one non-optional key is expected here */ if (!required_keys_found) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); continue; } if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); } return (0); } static int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME || type == ENTITY_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } int zfsdev_getminor(zfs_file_t *fp, minor_t *minorp) { zfsdev_state_t *zs, *fpd; ASSERT(!MUTEX_HELD(&zfsdev_state_lock)); fpd = zfs_file_private(fp); if (fpd == NULL) return (SET_ERROR(EBADF)); mutex_enter(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) continue; if (fpd == zs) { *minorp = fpd->zs_minor; mutex_exit(&zfsdev_state_lock); return (0); } } mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EBADF)); } void * zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) { zfsdev_state_t *zs; for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { membar_consumer(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); case ZST_ZEVENT: return (zs->zs_zevent); case ZST_ALL: return (zs); } } } return (NULL); } /* * Find a free minor number. The zfsdev_state_list is expected to * be short since it is only a list of currently open file handles. */ static minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (zfsdev_get_state(m, ZST_ALL) == NULL) { last_minor = m; return (m); } } return (0); } int zfsdev_state_init(void *priv) { zfsdev_state_t *zs, *zsprev = NULL; minor_t minor; boolean_t newzs = B_FALSE; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) break; zsprev = zs; } if (!zs) { zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); newzs = B_TRUE; } zfsdev_private_set_state(priv, zs); zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); /* * In order to provide for lock-free concurrent read access * to the minor list in zfsdev_get_state(), new entries * must be completely written before linking them into the * list whereas existing entries are already linked; the last * operation must be updating zs_minor (from -1 to the new * value). */ if (newzs) { zs->zs_minor = minor; membar_producer(); zsprev->zs_next = zs; } else { membar_producer(); zs->zs_minor = minor; } return (0); } void zfsdev_state_destroy(void *priv) { zfsdev_state_t *zs = zfsdev_private_get_state(priv); ASSERT(zs != NULL); ASSERT3S(zs->zs_minor, >, 0); /* * The last reference to this zfsdev file descriptor is being dropped. * We don't have to worry about lookup grabbing this state object, and * zfsdev_state_init() will not try to reuse this object until it is * invalidated by setting zs_minor to -1. Invalidation must be done * last, with a memory barrier to ensure ordering. This lets us avoid * taking the global zfsdev state lock around destruction. */ zfs_onexit_destroy(zs->zs_onexit); zfs_zevent_destroy(zs->zs_zevent); zs->zs_onexit = NULL; zs->zs_zevent = NULL; membar_producer(); zs->zs_minor = -1; } long zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) { int error, cmd; const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; uint64_t max_nvlist_src_size; size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; hrtime_t start_time = gethrtime(); cmd = vecnum; error = 0; if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); vec = &zfs_ioc_vec[vecnum]; /* * The registered ioctl list may be sparse, verify that either * a normal or legacy handler are registered. */ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); zc->zc_iflags = flag & FKIOCTL; max_nvlist_src_size = zfs_max_nvlist_src_size_os(); if (zc->zc_nvlist_src_size > max_nvlist_src_size) { /* * Make sure the user doesn't pass in an insane value for * zc_nvlist_src_size. We have to check, since we will end * up allocating that much memory inside of get_nvlist(). This * prevents a nefarious user from allocating tons of kernel * memory. * * Also, we return EINVAL instead of ENOMEM here. The reason * being that returning ENOMEM from an ioctl() has a special * connotation; that the user's size value is too small and * needs to be expanded to hold the nvlist. See * zcmd_expand_dst_nvlist() for details. */ error = SET_ERROR(EINVAL); /* User's size too big */ } else if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case ENTITY_NAME: if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { error = SET_ERROR(EINVAL); } else { error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); } break; case NO_NAME: break; } /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. * * The vectored functions can use fnvlist_lookup_{type} for any * required pairs since zfs_check_input_nvpairs() confirmed that * they exist and are of the correct type. */ if (error == 0 && vec->zvec_func != NULL) { error = zfs_check_input_nvpairs(innvl, vec); if (error != 0) goto out; } if (error == 0) { cookie = spl_fstrans_mark(); error = vec->zvec_secpolicy(zc, innvl, CRED()); spl_fstrans_unmark(cookie); } if (error != 0) goto out; /* legacy ioctls can modify zc_name */ /* * Can't use kmem_strdup() as we might truncate the string and * kmem_strfree() would then free with incorrect size. */ saved_poolname_len = strlen(zc->zc_name) + 1; saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); cookie = spl_fstrans_mark(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); spl_fstrans_unmark(cookie); /* * Some commands can partially execute, modify state, and still * return an error. In these cases, attempt to record what * was modified. */ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { size_t out_size = fnvlist_size(outnvl); if (out_size > zfs_history_output_max) { fnvlist_add_int64(lognv, ZPOOL_HIST_OUTPUT_SIZE, out_size); } else { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS, gethrtime() - start_time); (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { cookie = spl_fstrans_mark(); error = vec->zvec_legacy_func(zc); spl_fstrans_unmark(cookie); } out: nvlist_free(innvl); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) kmem_strfree(s); (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); } if (saved_poolname != NULL) kmem_free(saved_poolname, saved_poolname_len); return (error); } int zfs_kmod_init(void) { int error; if ((error = zvol_init()) != 0) return (error); spa_init(SPA_MODE_READ | SPA_MODE_WRITE); zfs_init(); zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); zfsdev_state_list->zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); return (0); out: zfs_fini(); spa_fini(); zvol_fini(); return (error); } void zfs_kmod_fini(void) { zfsdev_state_t *zs, *zsnext = NULL; zfsdev_detach(); mutex_destroy(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, U64, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, U64, ZMOD_RW, "Maximum size in bytes of ZFS ioctl output that will be logged"); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 1371e5187516..1511f763fd77 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1,1739 +1,1739 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev// * * Volumes are persistent through reboot and module load. No user command * needs to be run before opening and using a device. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ /* * Note on locking of zvol state structures. * * These structures are used to maintain internal state used to emulate block * devices on top of zvols. In particular, management of device minor number * operations - create, remove, rename, and set_snapdev - involves access to * these structures. The zvol_state_lock is primarily used to protect the * zvol_state_list. The zv->zv_state_lock is used to protect the contents * of the zvol_state_t structures, as well as to make sure that when the * time comes to remove the structure from the list, it is not in use, and * therefore, it can be taken off zvol_state_list and freed. * * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, * e.g. for the duration of receive and rollback operations. This lock can be * held for significant periods of time. Given that it is undesirable to hold * mutexes for long periods of time, the following lock ordering applies: * - take zvol_state_lock if necessary, to protect zvol_state_list * - take zv_suspend_lock if necessary, by the code path in question * - take zv_state_lock to protect zvol_state_t * * The minor operations are issued to spa->spa_zvol_taskq queues, that are * single-threaded (to preserve order of minor operations), and are executed * through the zvol_task_cb that dispatches the specific operations. Therefore, * these operations are serialized per pool. Consequently, we can be certain * that for a given zvol, there is only one operation at a time in progress. * That is why one can be sure that first, zvol_state_t for a given zvol is * allocated and placed on zvol_state_list, and then other minor operations * for this zvol are going to proceed in the order of issue. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, ZVOL_ASYNC_SET_VOLMODE, ZVOL_ASYNC_MAX } zvol_async_op_t; typedef struct { zvol_async_op_t op; char name1[MAXNAMELEN]; char name2[MAXNAMELEN]; uint64_t value; } zvol_task_t; uint64_t zvol_name_hash(const char *name) { int i; uint64_t crc = -1ULL; const uint8_t *p = (const uint8_t *)name; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; } return (crc); } /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ zvol_state_t * zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) { zvol_state_t *zv; struct hlist_node *p = NULL; rw_enter(&zvol_state_lock, RW_READER); hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); mutex_enter(&zv->zv_state_lock); if (zv->zv_hash == hash && strncmp(zv->zv_name, name, MAXNAMELEN) == 0) { /* * this is the right zvol, take the locks in the * right order */ if (mode != RW_NONE && !rw_tryenter(&zv->zv_suspend_lock, mode)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, mode); mutex_enter(&zv->zv_state_lock); /* * zvol cannot be renamed as we continue * to hold zvol_state_lock */ ASSERT(zv->zv_hash == hash && strncmp(zv->zv_name, name, MAXNAMELEN) == 0); } rw_exit(&zvol_state_lock); return (zv); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (NULL); } /* * Find a zvol_state_t given the name. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ static zvol_state_t * zvol_find_by_name(const char *name, int mode) { return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); } /* * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. */ void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; nvlist_t *nvprops = zct->zct_props; int error; uint64_t volblocksize, volsize; VERIFY(nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); /* * These properties must be removed from the list so the generic * property setting step won't apply to them. */ VERIFY(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ASSERT(error == 0); } /* * ZFS_IOC_OBJSET_STATS entry point. */ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; dmu_object_info_t *doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (SET_ERROR(error)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, doi->doi_data_block_size); } kmem_free(doi, sizeof (dmu_object_info_t)); return (SET_ERROR(error)); } /* * Sanity check volume size. */ int zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { if (volsize == 0) return (SET_ERROR(EINVAL)); if (volsize % blocksize != 0) return (SET_ERROR(EINVAL)); #ifdef _ILP32 if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); } /* * Ensure the zap is flushed then inform the VFS of the capacity change. */ static int zvol_update_volsize(uint64_t volsize, objset_t *os) { dmu_tx_t *tx; int error; uint64_t txg; tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (SET_ERROR(error)); } txg = dmu_tx_get_txg(tx); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); return (error); } /* * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume * size will result in a udev "change" event being generated. */ int zvol_set_volsize(const char *name, uint64_t volsize) { objset_t *os = NULL; uint64_t readonly; int error; boolean_t owned = B_FALSE; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) return (SET_ERROR(error)); if (readonly) return (SET_ERROR(EROFS)); zvol_state_t *zv = zvol_find_by_name(name, RW_READER); ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) && RW_READ_HELD(&zv->zv_suspend_lock))); if (zv == NULL || zv->zv_objset == NULL) { if (zv != NULL) rw_exit(&zv->zv_suspend_lock); if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE, FTAG, &os)) != 0) { if (zv != NULL) mutex_exit(&zv->zv_state_lock); return (SET_ERROR(error)); } owned = B_TRUE; if (zv != NULL) zv->zv_objset = os; } else { os = zv->zv_objset; } dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP); if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) goto out; error = zvol_update_volsize(volsize, os); if (error == 0 && zv != NULL) { zv->zv_volsize = volsize; zv->zv_changed = 1; } out: kmem_free(doi, sizeof (dmu_object_info_t)); if (owned) { dmu_objset_disown(os, B_TRUE, FTAG); if (zv != NULL) zv->zv_objset = NULL; } else { rw_exit(&zv->zv_suspend_lock); } if (zv != NULL) mutex_exit(&zv->zv_state_lock); if (error == 0 && zv != NULL) zvol_os_update_volsize(zv, volsize); return (SET_ERROR(error)); } /* * Sanity check volume block size. */ int zvol_check_volblocksize(const char *name, uint64_t volblocksize) { /* Record sizes above 128k need the feature to be enabled */ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (volblocksize > zfs_max_recordsize) return (SET_ERROR(EDOM)); spa_close(spa, FTAG); } if (volblocksize < SPA_MINBLOCKSIZE || volblocksize > SPA_MAXBLOCKSIZE || !ISP2(volblocksize)) return (SET_ERROR(EDOM)); return (0); } /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. */ static int zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_truncate_t *lr = arg2; uint64_t offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); int error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); } return (error); } /* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_write_t *lr = arg2; objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t offset, length; dmu_tx_t *tx; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } return (error); } static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } /* * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ zvol_replay_err, /* TX_MKXATTR */ zvol_replay_err, /* TX_SYMLINK */ zvol_replay_err, /* TX_REMOVE */ zvol_replay_err, /* TX_RMDIR */ zvol_replay_err, /* TX_LINK */ zvol_replay_err, /* TX_RENAME */ zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ }; /* * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. * * We store data in the log buffers if it's small enough. * Otherwise we will later flush the data out via dmu_sync(). */ static const ssize_t zvol_immediate_write_sz = 32768; void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, int sync) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; uint64_t sz = size; if (zil_replaying(zilog, tx)) return; if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && size >= blocksize && blocksize > zvol_immediate_write_sz) write_state = WR_INDIRECT; else if (sync) write_state = WR_COPIED; else write_state = WR_NEED_COPY; while (size) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = size; if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(offset, blocksize), size); itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } itx->itx_wr_state = wr_state; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = offset; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; itx->itx_sync = sync; (void) zil_itx_assign(zilog, itx, tx); offset += len; size -= len; } if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg); } } /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, boolean_t sync) { itx_t *itx; lr_truncate_t *lr; zilog_t *zilog = zv->zv_zilog; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; itx->itx_sync = sync; zil_itx_assign(zilog, itx, tx); } static void zvol_get_done(zgd_t *zgd, int error) { (void) error; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } /* * Get data to generate a TX_WRITE intent log record. */ int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change * the data. Contrarily to zfs_get_data we need not re-check * blocksize after we get the lock because it cannot be changed. */ size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db != NULL); ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (SET_ERROR(error)); } /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. */ void zvol_insert(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_insert_head(&zvol_state_list, zv); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from to list of zvols. */ static void zvol_remove(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_remove(&zvol_state_list, zv); hlist_del(&zv->zv_hlink); } /* * Setup zv after we just own the zv->objset */ static int zvol_setup_zv(zvol_state_t *zv) { uint64_t volsize; int error; uint64_t ro; objset_t *os = zv->zv_objset; ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock)); zv->zv_zilog = NULL; zv->zv_flags &= ~ZVOL_WRITTEN_TO; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (SET_ERROR(error)); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) return (SET_ERROR(error)); error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error) return (SET_ERROR(error)); zvol_os_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); } /* * Shutdown every zv_objset related stuff except zv_objset itself. * The is the reverse of zvol_setup_zv. */ static void zvol_shutdown_zv(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock) && RW_LOCK_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_WRITTEN_TO) { ASSERT(zv->zv_zilog != NULL); zil_close(zv->zv_zilog); } zv->zv_zilog = NULL; dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ if (zv->zv_flags & ZVOL_WRITTEN_TO) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); (void) dmu_objset_evict_dbufs(zv->zv_objset); } /* * return the proper tag for rollback and recv */ void * zvol_tag(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); return (zv->zv_open_count > 0 ? zv : NULL); } /* * Suspend the zvol for recv and rollback. */ zvol_state_t * zvol_suspend(const char *name) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) return (NULL); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) zvol_shutdown_zv(zv); /* * do not hold zv_state_lock across suspend/resume to * avoid locking up zvol lookups */ mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ return (zv); } int zvol_resume(zvol_state_t *zv) { int error = 0; ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); mutex_enter(&zv->zv_state_lock); if (zv->zv_open_count > 0) { VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); dmu_objset_rele(zv->zv_objset, zv); error = zvol_setup_zv(zv); } mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); /* * We need this because we don't hold zvol_state_lock while releasing * zv_suspend_lock. zvol_remove_minors_impl thus cannot check * zv_suspend_lock to determine it is safe to free because rwlock is * not inherent atomic. */ atomic_dec(&zv->zv_suspend_ref); return (SET_ERROR(error)); } int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; int error; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(mutex_owned(&spa_namespace_lock)); boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) return (SET_ERROR(error)); zv->zv_objset = os; error = zvol_setup_zv(zv); if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } return (error); } void zvol_last_close(zvol_state_t *zv) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, 1, zv); zv->zv_objset = NULL; } typedef struct minors_job { list_t *list; list_node_t link; /* input */ char *name; /* output */ int error; } minors_job_t; /* * Prefetch zvol dnodes for the minors_job */ static void zvol_prefetch_minors_impl(void *arg) { minors_job_t *job = arg; char *dsname = job->name; objset_t *os = NULL; job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { minors_job_t *j = arg; list_t *minors_list = j->list; const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); /* skip the designated dataset */ if (name && strcmp(dsname, name) == 0) return (0); /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " "%s is not a snapshot name\n", dsname); } else { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); } return (0); } /* * If spa_keystore_load_wkey() is called for an encrypted zvol, * we need to look for any clones also using the key. This function * is "best effort" - so we just skip over it if there are failures. */ static void zvol_add_clones(const char *dsname, list_t *minors_list) { /* Also check if it has clones */ dsl_dir_t *dd = NULL; dsl_pool_t *dp = NULL; if (dsl_pool_hold(dsname, FTAG, &dp) != 0) return; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) goto out; if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0) goto out; if (dsl_dir_phys(dd)->dd_clones == 0) goto out; zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); objset_t *mos = dd->dd_pool->dp_meta_objset; for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dataset_t *clone; minors_job_t *job; if (dsl_dataset_hold_obj(dd->dd_pool, za->za_first_integer, FTAG, &clone) == 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(clone, name); char *n = kmem_strdup(name); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); dsl_dataset_rele(clone, FTAG); } } zap_cursor_fini(zc); kmem_free(za, sizeof (zap_attribute_t)); kmem_free(zc, sizeof (zap_cursor_t)); out: if (dd != NULL) dsl_dir_rele(dd, FTAG); dsl_pool_rele(dp, FTAG); } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_minors_cb(const char *dsname, void *arg) { uint64_t snapdev; int error; list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); if (error) return (0); /* * Given the name and the 'snapdev' property, create device minor nodes * with the linkages to zvols/snapshots as needed. * If the name represents a zvol, create a minor node for the zvol, then * check if its snapshots are 'visible', and if so, iterate over the * snapshots and create device minor nodes for those. */ if (strchr(dsname, '@') == 0) { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); zvol_add_clones(dsname, minors_list); if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ - error = dmu_objset_find(dsname, + (void) dmu_objset_find(dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", dsname); } return (0); } /* * Create minors for the specified dataset, including children and snapshots. * Pay attention to the 'snapdev' property and iterate over the snapshots * only if they are 'visible'. This approach allows one to assure that the * snapshot metadata is read from disk only if it is needed. * * The name can represent a dataset to be recursively scanned for zvols and * their snapshots, or a single zvol snapshot. If the name represents a * dataset, the scan is performed in two nested stages: * - scan the dataset for zvols, and * - for each zvol, create a minor node, then check if the zvol's snapshots * are 'visible', and only then iterate over the snapshots if needed * * If the name represents a snapshot, a check is performed if the snapshot is * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ void zvol_create_minors_recursive(const char *name) { list_t minors_list; minors_job_t *job; if (zvol_inhibit_dev) return; /* * This is the list for prefetch jobs. Whenever we found a match * during dmu_objset_find, we insert a minors_job to the list and do * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need * any lock because all list operation is done on the current thread. * * We will use this list to do zvol_os_create_minor after prefetch * so we don't have to traverse using dmu_objset_find again. */ list_create(&minors_list, sizeof (minors_job_t), offsetof(minors_job_t, link)); if (strchr(name, '@') != NULL) { uint64_t snapdev; int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) (void) zvol_os_create_minor(name); } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } taskq_wait_outstanding(system_taskq, 0); /* * Prefetch is completed, we can do zvol_os_create_minor * sequentially. */ while ((job = list_head(&minors_list)) != NULL) { list_remove(&minors_list, job); if (!job->error) (void) zvol_os_create_minor(job->name); kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); } void zvol_create_minor(const char *name) { /* * Note: the dsl_pool_config_lock must not be held. * Minor node creation needs to obtain the zvol_state_lock. * zvol_open() obtains the zvol_state_lock and then the dsl pool * config lock. Therefore, we can't have the config lock now if * we are going to wait for the zvol_state_lock, because it * would be a lock order inversion which could lead to deadlock. */ if (zvol_inhibit_dev) return; if (strchr(name, '@') != NULL) { uint64_t snapdev; int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) (void) zvol_os_create_minor(name); } else { (void) zvol_os_create_minor(name); } } /* * Remove minors for specified dataset including children and snapshots. */ static void zvol_free_task(void *arg) { zvol_os_free(arg); } void zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); taskqid_t t; list_t free_list; if (zvol_inhibit_dev) return; list_create(&free_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (name == NULL || strcmp(zv->zv_name, name) == 0 || (strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ zvol_os_clear_private(zv); /* Drop zv_state_lock before zvol_free() */ mutex_exit(&zv->zv_state_lock); /* Try parallel zv_free, if failed do it in place */ t = taskq_dispatch(system_taskq, zvol_free_task, zv, TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); } else { mutex_exit(&zv->zv_state_lock); } } rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ while ((zv = list_head(&free_list)) != NULL) { list_remove(&free_list, zv); zvol_os_free(zv); } } /* Remove minor for this specific volume only */ static void zvol_remove_minor_impl(const char *name) { zvol_state_t *zv = NULL, *zv_next; if (zvol_inhibit_dev) return; rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, name) == 0) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); zvol_os_clear_private(zv); mutex_exit(&zv->zv_state_lock); break; } else { mutex_exit(&zv->zv_state_lock); } } /* Drop zvol_state_lock before calling zvol_free() */ rw_exit(&zvol_state_lock); if (zv != NULL) zvol_os_free(zv); } /* * Rename minors for specified dataset including children and snapshots. */ static void zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; int oldnamelen; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { zvol_os_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); zvol_os_rename_minor(zv, name); kmem_strfree(name); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); } typedef struct zvol_snapdev_cb_arg { uint64_t snapdev; } zvol_snapdev_cb_arg_t; static int zvol_set_snapdev_cb(const char *dsname, void *param) { zvol_snapdev_cb_arg_t *arg = param; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: (void) zvol_os_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: (void) zvol_remove_minor_impl(dsname); break; } return (0); } static void zvol_set_snapdev_impl(char *name, uint64_t snapdev) { zvol_snapdev_cb_arg_t arg = {snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * The zvol_set_snapdev_sync() sets snapdev appropriately * in the dataset hierarchy. Here, we only scan snapshots. */ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); spl_fstrans_unmark(cookie); } static void zvol_set_volmode_impl(char *name, uint64_t volmode) { fstrans_cookie_t cookie; uint64_t old_volmode; zvol_state_t *zv; if (strchr(name, '@') != NULL) return; /* * It's unfortunate we need to remove minors before we create new ones: * this is necessary because our backing gendisk (zvol_state->zv_disk) * could be different when we set, for instance, volmode from "geom" * to "dev" (or vice versa). */ zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL && volmode == ZFS_VOLMODE_NONE) return; if (zv != NULL) { old_volmode = zv->zv_volmode; mutex_exit(&zv->zv_state_lock); if (old_volmode == volmode) return; zvol_wait_close(zv); } cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: (void) zvol_remove_minor_impl(name); break; case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: (void) zvol_remove_minor_impl(name); (void) zvol_os_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: (void) zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ (void) zvol_os_create_minor(name); break; } spl_fstrans_unmark(cookie); } static zvol_task_t * zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, uint64_t value) { zvol_task_t *task; /* Never allow tasks on hidden names. */ if (name1[0] == '$') return (NULL); task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->op = op; task->value = value; strlcpy(task->name1, name1, MAXNAMELEN); if (name2 != NULL) strlcpy(task->name2, name2, MAXNAMELEN); return (task); } static void zvol_task_free(zvol_task_t *task) { kmem_free(task, sizeof (zvol_task_t)); } /* * The worker thread function performed asynchronously. */ static void zvol_task_cb(void *arg) { zvol_task_t *task = arg; switch (task->op) { case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task->name1); break; case ZVOL_ASYNC_RENAME_MINORS: zvol_rename_minors_impl(task->name1, task->name2); break; case ZVOL_ASYNC_SET_SNAPDEV: zvol_set_snapdev_impl(task->name1, task->value); break; case ZVOL_ASYNC_SET_VOLMODE: zvol_set_volmode_impl(task->name1, task->value); break; default: VERIFY(0); break; } zvol_task_free(task); } typedef struct zvol_set_prop_int_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; dmu_tx_t *zsda_tx; } zvol_set_prop_int_arg_t; /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } static int zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) arg; char dsname[MAXNAMELEN]; zvol_task_t *task; uint64_t snapdev; dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0) return (0); task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev); if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply snapdev appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "snapdev" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); zsda->zsda_tx = tx; error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, zsda->zsda_tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = snapdev; return (dsl_sync_task(ddname, zvol_set_snapdev_check, zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_volmode_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } static int zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) arg; char dsname[MAXNAMELEN]; zvol_task_t *task; uint64_t volmode; dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0) return (0); task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode); if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply volmode appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "volmode" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void zvol_set_volmode_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); zsda->zsda_tx = tx; error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, zsda->zsda_tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = volmode; return (dsl_sync_task(ddname, zvol_set_volmode_check, zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } boolean_t zvol_is_zvol(const char *name) { return (zvol_os_is_zvol(name)); } int zvol_init_impl(void) { int i; list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), KM_SLEEP); for (i = 0; i < ZVOL_HT_SIZE; i++) INIT_HLIST_HEAD(&zvol_htable[i]); return (0); } void zvol_fini_impl(void) { zvol_remove_minors_impl(NULL); /* * The call to "zvol_remove_minors_impl" may dispatch entries to * the system_taskq, but it doesn't wait for those entries to * complete before it returns. Thus, we must wait for all of the * removals to finish, before we can continue. */ taskq_wait_outstanding(system_taskq, 0); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); } diff --git a/module/zstd/include/zstd_compat_wrapper.h b/module/zstd/include/zstd_compat_wrapper.h index 2c4baad27d4e..4e6561f31a68 100644 --- a/module/zstd/include/zstd_compat_wrapper.h +++ b/module/zstd/include/zstd_compat_wrapper.h @@ -1,424 +1,422 @@ /* * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2020, Sebastian Gottschall */ /* * This wrapper fixes a problem, in case the ZFS filesystem driver, is compiled * statically into the kernel. * This will cause a symbol collision with the older in-kernel zstd library. * * On update, truncate this file at the scissor line, rebuild the module, * and make gen-zstd-symbols. */ #define MEM_MODULE #define XXH_NAMESPACE ZSTD_ #define XXH_PRIVATE_API #define XXH_INLINE_ALL #define ZSTD_LEGACY_SUPPORT 0 #define ZSTD_LIB_DICTBUILDER 0 #define ZSTD_LIB_DEPRECATED 0 #define ZSTD_NOBENCH #define DEBUGLEVEL 0 #ifdef _KERNEL #define ZSTD_DEPS_ASSERT #endif /* -- >8 -- */ /* lib/common/entropy_common.o: */ #define FSE_getErrorName zfs_FSE_getErrorName #define FSE_isError zfs_FSE_isError #define FSE_readNCount zfs_FSE_readNCount #define FSE_versionNumber zfs_FSE_versionNumber #define HUF_getErrorName zfs_HUF_getErrorName #define HUF_isError zfs_HUF_isError #define HUF_readStats zfs_HUF_readStats /* lib/common/error_private.o: */ #define ERR_getErrorString zfs_ERR_getErrorString /* lib/common/fse_decompress.o: */ #define FSE_buildDTable_raw zfs_FSE_buildDTable_raw #define FSE_buildDTable_rle zfs_FSE_buildDTable_rle #define FSE_buildDTable zfs_FSE_buildDTable -#define FSE_createDTable zfs_FSE_createDTable #define FSE_decompress_usingDTable zfs_FSE_decompress_usingDTable #define FSE_decompress_wksp zfs_FSE_decompress_wksp #define FSE_decompress zfs_FSE_decompress -#define FSE_freeDTable zfs_FSE_freeDTable /* lib/common/pool.o: */ #define POOL_add zfs_POOL_add #define POOL_create_advanced zfs_POOL_create_advanced #define POOL_create zfs_POOL_create #define POOL_free zfs_POOL_free #define POOL_resize zfs_POOL_resize #define POOL_sizeof zfs_POOL_sizeof #define POOL_tryAdd zfs_POOL_tryAdd /* lib/common/zstd_common.o: */ #define ZSTD_calloc zfs_ZSTD_calloc #define ZSTD_free zfs_ZSTD_free #define ZSTD_getErrorCode zfs_ZSTD_getErrorCode #define ZSTD_getErrorName zfs_ZSTD_getErrorName #define ZSTD_getErrorString zfs_ZSTD_getErrorString #define ZSTD_isError zfs_ZSTD_isError #define ZSTD_malloc zfs_ZSTD_malloc #define ZSTD_versionNumber zfs_ZSTD_versionNumber #define ZSTD_versionString zfs_ZSTD_versionString /* lib/compress/fse_compress.o: */ #define FSE_buildCTable_raw zfs_FSE_buildCTable_raw #define FSE_buildCTable_rle zfs_FSE_buildCTable_rle #define FSE_buildCTable_wksp zfs_FSE_buildCTable_wksp #define FSE_buildCTable zfs_FSE_buildCTable #define FSE_compress2 zfs_FSE_compress2 #define FSE_compressBound zfs_FSE_compressBound #define FSE_compress_usingCTable zfs_FSE_compress_usingCTable #define FSE_compress_wksp zfs_FSE_compress_wksp #define FSE_compress zfs_FSE_compress #define FSE_createCTable zfs_FSE_createCTable #define FSE_freeCTable zfs_FSE_freeCTable #define FSE_NCountWriteBound zfs_FSE_NCountWriteBound #define FSE_normalizeCount zfs_FSE_normalizeCount #define FSE_optimalTableLog_internal zfs_FSE_optimalTableLog_internal #define FSE_optimalTableLog zfs_FSE_optimalTableLog #define FSE_writeNCount zfs_FSE_writeNCount /* lib/compress/hist.o: */ #define HIST_countFast_wksp zfs_HIST_countFast_wksp #define HIST_countFast zfs_HIST_countFast #define HIST_count_simple zfs_HIST_count_simple #define HIST_count_wksp zfs_HIST_count_wksp #define HIST_count zfs_HIST_count #define HIST_isError zfs_HIST_isError /* lib/compress/huf_compress.o: */ #define HUF_buildCTable_wksp zfs_HUF_buildCTable_wksp #define HUF_buildCTable zfs_HUF_buildCTable #define HUF_compress1X_repeat zfs_HUF_compress1X_repeat #define HUF_compress1X_usingCTable zfs_HUF_compress1X_usingCTable #define HUF_compress1X_wksp zfs_HUF_compress1X_wksp #define HUF_compress1X zfs_HUF_compress1X #define HUF_compress2 zfs_HUF_compress2 #define HUF_compress4X_repeat zfs_HUF_compress4X_repeat #define HUF_compress4X_usingCTable zfs_HUF_compress4X_usingCTable #define HUF_compress4X_wksp zfs_HUF_compress4X_wksp #define HUF_compressBound zfs_HUF_compressBound #define HUF_compress zfs_HUF_compress #define HUF_estimateCompressedSize zfs_HUF_estimateCompressedSize #define HUF_getNbBits zfs_HUF_getNbBits #define HUF_optimalTableLog zfs_HUF_optimalTableLog #define HUF_readCTable zfs_HUF_readCTable #define HUF_validateCTable zfs_HUF_validateCTable #define HUF_writeCTable zfs_HUF_writeCTable /* lib/compress/zstd_compress_literals.o: */ #define ZSTD_compressLiterals zfs_ZSTD_compressLiterals #define ZSTD_compressRleLiteralsBlock zfs_ZSTD_compressRleLiteralsBlock #define ZSTD_noCompressLiterals zfs_ZSTD_noCompressLiterals /* lib/compress/zstd_compress_sequences.o: */ #define ZSTD_buildCTable zfs_ZSTD_buildCTable #define ZSTD_crossEntropyCost zfs_ZSTD_crossEntropyCost #define ZSTD_encodeSequences zfs_ZSTD_encodeSequences #define ZSTD_fseBitCost zfs_ZSTD_fseBitCost #define ZSTD_selectEncodingType zfs_ZSTD_selectEncodingType /* lib/compress/zstd_compress_superblock.o: */ #define ZSTD_compressSuperBlock zfs_ZSTD_compressSuperBlock /* lib/compress/zstd_compress.o: */ #define ZSTD_adjustCParams zfs_ZSTD_adjustCParams #define ZSTD_CCtx_getParameter zfs_ZSTD_CCtx_getParameter #define ZSTD_CCtx_loadDictionary_advanced zfs_ZSTD_CCtx_loadDictionary_advanced #define ZSTD_CCtx_loadDictionary_byReference zfs_ZSTD_CCtx_loadDictionary_byReference #define ZSTD_CCtx_loadDictionary zfs_ZSTD_CCtx_loadDictionary #define ZSTD_CCtxParams_getParameter zfs_ZSTD_CCtxParams_getParameter #define ZSTD_CCtxParams_init_advanced zfs_ZSTD_CCtxParams_init_advanced #define ZSTD_CCtxParams_init zfs_ZSTD_CCtxParams_init #define ZSTD_CCtxParams_reset zfs_ZSTD_CCtxParams_reset #define ZSTD_CCtxParams_setParameter zfs_ZSTD_CCtxParams_setParameter #define ZSTD_CCtx_refCDict zfs_ZSTD_CCtx_refCDict #define ZSTD_CCtx_refPrefix_advanced zfs_ZSTD_CCtx_refPrefix_advanced #define ZSTD_CCtx_refPrefix zfs_ZSTD_CCtx_refPrefix #define ZSTD_CCtx_reset zfs_ZSTD_CCtx_reset #define ZSTD_CCtx_setParametersUsingCCtxParams zfs_ZSTD_CCtx_setParametersUsingCCtxParams #define ZSTD_CCtx_setParameter zfs_ZSTD_CCtx_setParameter #define ZSTD_CCtx_setPledgedSrcSize zfs_ZSTD_CCtx_setPledgedSrcSize #define ZSTD_checkCParams zfs_ZSTD_checkCParams #define ZSTD_compress2 zfs_ZSTD_compress2 #define ZSTD_compress_advanced_internal zfs_ZSTD_compress_advanced_internal #define ZSTD_compress_advanced zfs_ZSTD_compress_advanced #define ZSTD_compressBegin_advanced_internal zfs_ZSTD_compressBegin_advanced_internal #define ZSTD_compressBegin_advanced zfs_ZSTD_compressBegin_advanced #define ZSTD_compressBegin_usingCDict_advanced zfs_ZSTD_compressBegin_usingCDict_advanced #define ZSTD_compressBegin_usingCDict zfs_ZSTD_compressBegin_usingCDict #define ZSTD_compressBegin_usingDict zfs_ZSTD_compressBegin_usingDict #define ZSTD_compressBegin zfs_ZSTD_compressBegin #define ZSTD_compressBlock zfs_ZSTD_compressBlock #define ZSTD_compressBound zfs_ZSTD_compressBound #define ZSTD_compressCCtx zfs_ZSTD_compressCCtx #define ZSTD_compressContinue zfs_ZSTD_compressContinue #define ZSTD_compressEnd zfs_ZSTD_compressEnd #define ZSTD_compressStream2_simpleArgs zfs_ZSTD_compressStream2_simpleArgs #define ZSTD_compressStream2 zfs_ZSTD_compressStream2 #define ZSTD_compressStream zfs_ZSTD_compressStream #define ZSTD_compress_usingCDict_advanced zfs_ZSTD_compress_usingCDict_advanced #define ZSTD_compress_usingCDict zfs_ZSTD_compress_usingCDict #define ZSTD_compress_usingDict zfs_ZSTD_compress_usingDict #define ZSTD_compress zfs_ZSTD_compress #define ZSTD_copyCCtx zfs_ZSTD_copyCCtx #define ZSTD_cParam_getBounds zfs_ZSTD_cParam_getBounds #define ZSTD_createCCtx_advanced zfs_ZSTD_createCCtx_advanced #define ZSTD_createCCtxParams zfs_ZSTD_createCCtxParams #define ZSTD_createCCtx zfs_ZSTD_createCCtx #define ZSTD_createCDict_advanced zfs_ZSTD_createCDict_advanced #define ZSTD_createCDict_byReference zfs_ZSTD_createCDict_byReference #define ZSTD_createCDict zfs_ZSTD_createCDict #define ZSTD_createCStream_advanced zfs_ZSTD_createCStream_advanced #define ZSTD_createCStream zfs_ZSTD_createCStream #define ZSTD_CStreamInSize zfs_ZSTD_CStreamInSize #define ZSTD_CStreamOutSize zfs_ZSTD_CStreamOutSize #define ZSTD_cycleLog zfs_ZSTD_cycleLog #define ZSTD_endStream zfs_ZSTD_endStream #define ZSTD_estimateCCtxSize_usingCCtxParams zfs_ZSTD_estimateCCtxSize_usingCCtxParams #define ZSTD_estimateCCtxSize_usingCParams zfs_ZSTD_estimateCCtxSize_usingCParams #define ZSTD_estimateCCtxSize zfs_ZSTD_estimateCCtxSize #define ZSTD_estimateCDictSize_advanced zfs_ZSTD_estimateCDictSize_advanced #define ZSTD_estimateCDictSize zfs_ZSTD_estimateCDictSize #define ZSTD_estimateCStreamSize_usingCCtxParams zfs_ZSTD_estimateCStreamSize_usingCCtxParams #define ZSTD_estimateCStreamSize_usingCParams zfs_ZSTD_estimateCStreamSize_usingCParams #define ZSTD_estimateCStreamSize zfs_ZSTD_estimateCStreamSize #define ZSTD_flushStream zfs_ZSTD_flushStream #define ZSTD_freeCCtxParams zfs_ZSTD_freeCCtxParams #define ZSTD_freeCCtx zfs_ZSTD_freeCCtx #define ZSTD_freeCDict zfs_ZSTD_freeCDict #define ZSTD_freeCStream zfs_ZSTD_freeCStream #define ZSTD_getBlockSize zfs_ZSTD_getBlockSize #define ZSTD_getCParamsFromCCtxParams zfs_ZSTD_getCParamsFromCCtxParams #define ZSTD_getCParamsFromCDict zfs_ZSTD_getCParamsFromCDict #define ZSTD_getCParams zfs_ZSTD_getCParams #define ZSTD_getFrameProgression zfs_ZSTD_getFrameProgression #define ZSTD_getParams zfs_ZSTD_getParams #define ZSTD_getSeqStore zfs_ZSTD_getSeqStore #define ZSTD_getSequences zfs_ZSTD_getSequences #define ZSTD_initCStream_advanced zfs_ZSTD_initCStream_advanced #define ZSTD_initCStream_internal zfs_ZSTD_initCStream_internal #define ZSTD_initCStream_srcSize zfs_ZSTD_initCStream_srcSize #define ZSTD_initCStream_usingCDict_advanced zfs_ZSTD_initCStream_usingCDict_advanced #define ZSTD_initCStream_usingCDict zfs_ZSTD_initCStream_usingCDict #define ZSTD_initCStream_usingDict zfs_ZSTD_initCStream_usingDict #define ZSTD_initCStream zfs_ZSTD_initCStream #define ZSTD_initStaticCCtx zfs_ZSTD_initStaticCCtx #define ZSTD_initStaticCDict zfs_ZSTD_initStaticCDict #define ZSTD_initStaticCStream zfs_ZSTD_initStaticCStream #define ZSTD_invalidateRepCodes zfs_ZSTD_invalidateRepCodes #define ZSTD_loadCEntropy zfs_ZSTD_loadCEntropy #define ZSTD_maxCLevel zfs_ZSTD_maxCLevel #define ZSTD_minCLevel zfs_ZSTD_minCLevel #define ZSTD_referenceExternalSequences zfs_ZSTD_referenceExternalSequences #define ZSTD_reset_compressedBlockState zfs_ZSTD_reset_compressedBlockState #define ZSTD_resetCStream zfs_ZSTD_resetCStream #define ZSTD_resetSeqStore zfs_ZSTD_resetSeqStore #define ZSTD_selectBlockCompressor zfs_ZSTD_selectBlockCompressor #define ZSTD_seqToCodes zfs_ZSTD_seqToCodes #define ZSTD_sizeof_CCtx zfs_ZSTD_sizeof_CCtx #define ZSTD_sizeof_CDict zfs_ZSTD_sizeof_CDict #define ZSTD_sizeof_CStream zfs_ZSTD_sizeof_CStream #define ZSTD_toFlushNow zfs_ZSTD_toFlushNow #define ZSTD_writeLastEmptyBlock zfs_ZSTD_writeLastEmptyBlock /* lib/compress/zstd_double_fast.o: */ #define ZSTD_compressBlock_doubleFast_dictMatchState zfs_ZSTD_compressBlock_doubleFast_dictMatchState #define ZSTD_compressBlock_doubleFast_extDict zfs_ZSTD_compressBlock_doubleFast_extDict #define ZSTD_compressBlock_doubleFast zfs_ZSTD_compressBlock_doubleFast #define ZSTD_fillDoubleHashTable zfs_ZSTD_fillDoubleHashTable /* lib/compress/zstd_fast.o: */ #define ZSTD_compressBlock_fast_dictMatchState zfs_ZSTD_compressBlock_fast_dictMatchState #define ZSTD_compressBlock_fast_extDict zfs_ZSTD_compressBlock_fast_extDict #define ZSTD_compressBlock_fast zfs_ZSTD_compressBlock_fast #define ZSTD_fillHashTable zfs_ZSTD_fillHashTable /* lib/compress/zstd_lazy.o: */ #define ZSTD_compressBlock_btlazy2_dictMatchState zfs_ZSTD_compressBlock_btlazy2_dictMatchState #define ZSTD_compressBlock_btlazy2_extDict zfs_ZSTD_compressBlock_btlazy2_extDict #define ZSTD_compressBlock_btlazy2 zfs_ZSTD_compressBlock_btlazy2 #define ZSTD_compressBlock_greedy_dictMatchState zfs_ZSTD_compressBlock_greedy_dictMatchState #define ZSTD_compressBlock_greedy_extDict zfs_ZSTD_compressBlock_greedy_extDict #define ZSTD_compressBlock_greedy zfs_ZSTD_compressBlock_greedy #define ZSTD_compressBlock_lazy2_dictMatchState zfs_ZSTD_compressBlock_lazy2_dictMatchState #define ZSTD_compressBlock_lazy2_extDict zfs_ZSTD_compressBlock_lazy2_extDict #define ZSTD_compressBlock_lazy2 zfs_ZSTD_compressBlock_lazy2 #define ZSTD_compressBlock_lazy_dictMatchState zfs_ZSTD_compressBlock_lazy_dictMatchState #define ZSTD_compressBlock_lazy_extDict zfs_ZSTD_compressBlock_lazy_extDict #define ZSTD_compressBlock_lazy zfs_ZSTD_compressBlock_lazy #define ZSTD_insertAndFindFirstIndex zfs_ZSTD_insertAndFindFirstIndex /* lib/compress/zstd_ldm.o: */ #define ZSTD_ldm_adjustParameters zfs_ZSTD_ldm_adjustParameters #define ZSTD_ldm_blockCompress zfs_ZSTD_ldm_blockCompress #define ZSTD_ldm_fillHashTable zfs_ZSTD_ldm_fillHashTable #define ZSTD_ldm_generateSequences zfs_ZSTD_ldm_generateSequences #define ZSTD_ldm_getMaxNbSeq zfs_ZSTD_ldm_getMaxNbSeq #define ZSTD_ldm_getTableSize zfs_ZSTD_ldm_getTableSize #define ZSTD_ldm_skipSequences zfs_ZSTD_ldm_skipSequences /* lib/compress/zstd_opt.o: */ #define ZSTD_compressBlock_btopt_dictMatchState zfs_ZSTD_compressBlock_btopt_dictMatchState #define ZSTD_compressBlock_btopt_extDict zfs_ZSTD_compressBlock_btopt_extDict #define ZSTD_compressBlock_btopt zfs_ZSTD_compressBlock_btopt #define ZSTD_compressBlock_btultra2 zfs_ZSTD_compressBlock_btultra2 #define ZSTD_compressBlock_btultra_dictMatchState zfs_ZSTD_compressBlock_btultra_dictMatchState #define ZSTD_compressBlock_btultra_extDict zfs_ZSTD_compressBlock_btultra_extDict #define ZSTD_compressBlock_btultra zfs_ZSTD_compressBlock_btultra #define ZSTD_updateTree zfs_ZSTD_updateTree /* lib/decompress/huf_decompress.o: */ #define HUF_decompress1X1_DCtx_wksp_bmi2 zfs_HUF_decompress1X1_DCtx_wksp_bmi2 #define HUF_decompress1X1_DCtx_wksp zfs_HUF_decompress1X1_DCtx_wksp #define HUF_decompress1X1_DCtx zfs_HUF_decompress1X1_DCtx #define HUF_decompress1X1_usingDTable zfs_HUF_decompress1X1_usingDTable #define HUF_decompress1X1 zfs_HUF_decompress1X1 #define HUF_decompress1X2_DCtx_wksp zfs_HUF_decompress1X2_DCtx_wksp #define HUF_decompress1X2_DCtx zfs_HUF_decompress1X2_DCtx #define HUF_decompress1X2_usingDTable zfs_HUF_decompress1X2_usingDTable #define HUF_decompress1X2 zfs_HUF_decompress1X2 #define HUF_decompress1X_DCtx_wksp zfs_HUF_decompress1X_DCtx_wksp #define HUF_decompress1X_DCtx zfs_HUF_decompress1X_DCtx #define HUF_decompress1X_usingDTable_bmi2 zfs_HUF_decompress1X_usingDTable_bmi2 #define HUF_decompress1X_usingDTable zfs_HUF_decompress1X_usingDTable #define HUF_decompress4X1_DCtx_wksp zfs_HUF_decompress4X1_DCtx_wksp #define HUF_decompress4X1_DCtx zfs_HUF_decompress4X1_DCtx #define HUF_decompress4X1_usingDTable zfs_HUF_decompress4X1_usingDTable #define HUF_decompress4X1 zfs_HUF_decompress4X1 #define HUF_decompress4X2_DCtx_wksp zfs_HUF_decompress4X2_DCtx_wksp #define HUF_decompress4X2_DCtx zfs_HUF_decompress4X2_DCtx #define HUF_decompress4X2_usingDTable zfs_HUF_decompress4X2_usingDTable #define HUF_decompress4X2 zfs_HUF_decompress4X2 #define HUF_decompress4X_DCtx zfs_HUF_decompress4X_DCtx #define HUF_decompress4X_hufOnly_wksp_bmi2 zfs_HUF_decompress4X_hufOnly_wksp_bmi2 #define HUF_decompress4X_hufOnly_wksp zfs_HUF_decompress4X_hufOnly_wksp #define HUF_decompress4X_hufOnly zfs_HUF_decompress4X_hufOnly #define HUF_decompress4X_usingDTable_bmi2 zfs_HUF_decompress4X_usingDTable_bmi2 #define HUF_decompress4X_usingDTable zfs_HUF_decompress4X_usingDTable #define HUF_decompress zfs_HUF_decompress #define HUF_readDTableX1_wksp zfs_HUF_readDTableX1_wksp #define HUF_readDTableX1 zfs_HUF_readDTableX1 #define HUF_readDTableX2_wksp zfs_HUF_readDTableX2_wksp #define HUF_readDTableX2 zfs_HUF_readDTableX2 #define HUF_selectDecoder zfs_HUF_selectDecoder /* lib/decompress/zstd_ddict.o: */ #define ZSTD_copyDDictParameters zfs_ZSTD_copyDDictParameters #define ZSTD_createDDict_advanced zfs_ZSTD_createDDict_advanced #define ZSTD_createDDict_byReference zfs_ZSTD_createDDict_byReference #define ZSTD_createDDict zfs_ZSTD_createDDict #define ZSTD_DDict_dictContent zfs_ZSTD_DDict_dictContent #define ZSTD_DDict_dictSize zfs_ZSTD_DDict_dictSize #define ZSTD_estimateDDictSize zfs_ZSTD_estimateDDictSize #define ZSTD_freeDDict zfs_ZSTD_freeDDict #define ZSTD_getDictID_fromDDict zfs_ZSTD_getDictID_fromDDict #define ZSTD_initStaticDDict zfs_ZSTD_initStaticDDict #define ZSTD_sizeof_DDict zfs_ZSTD_sizeof_DDict /* lib/decompress/zstd_decompress.o: */ #define ZSTD_copyDCtx zfs_ZSTD_copyDCtx #define ZSTD_createDCtx_advanced zfs_ZSTD_createDCtx_advanced #define ZSTD_createDCtx zfs_ZSTD_createDCtx #define ZSTD_createDStream_advanced zfs_ZSTD_createDStream_advanced #define ZSTD_createDStream zfs_ZSTD_createDStream #define ZSTD_DCtx_loadDictionary_advanced zfs_ZSTD_DCtx_loadDictionary_advanced #define ZSTD_DCtx_loadDictionary_byReference zfs_ZSTD_DCtx_loadDictionary_byReference #define ZSTD_DCtx_loadDictionary zfs_ZSTD_DCtx_loadDictionary #define ZSTD_DCtx_refDDict zfs_ZSTD_DCtx_refDDict #define ZSTD_DCtx_refPrefix_advanced zfs_ZSTD_DCtx_refPrefix_advanced #define ZSTD_DCtx_refPrefix zfs_ZSTD_DCtx_refPrefix #define ZSTD_DCtx_reset zfs_ZSTD_DCtx_reset #define ZSTD_DCtx_setFormat zfs_ZSTD_DCtx_setFormat #define ZSTD_DCtx_setMaxWindowSize zfs_ZSTD_DCtx_setMaxWindowSize #define ZSTD_DCtx_setParameter zfs_ZSTD_DCtx_setParameter #define ZSTD_decodingBufferSize_min zfs_ZSTD_decodingBufferSize_min #define ZSTD_decompressBegin_usingDDict zfs_ZSTD_decompressBegin_usingDDict #define ZSTD_decompressBegin_usingDict zfs_ZSTD_decompressBegin_usingDict #define ZSTD_decompressBegin zfs_ZSTD_decompressBegin #define ZSTD_decompressBound zfs_ZSTD_decompressBound #define ZSTD_decompressContinue zfs_ZSTD_decompressContinue #define ZSTD_decompressDCtx zfs_ZSTD_decompressDCtx #define ZSTD_decompressStream_simpleArgs zfs_ZSTD_decompressStream_simpleArgs #define ZSTD_decompressStream zfs_ZSTD_decompressStream #define ZSTD_decompress_usingDDict zfs_ZSTD_decompress_usingDDict #define ZSTD_decompress_usingDict zfs_ZSTD_decompress_usingDict #define ZSTD_decompress zfs_ZSTD_decompress #define ZSTD_dParam_getBounds zfs_ZSTD_dParam_getBounds #define ZSTD_DStreamInSize zfs_ZSTD_DStreamInSize #define ZSTD_DStreamOutSize zfs_ZSTD_DStreamOutSize #define ZSTD_estimateDCtxSize zfs_ZSTD_estimateDCtxSize #define ZSTD_estimateDStreamSize_fromFrame zfs_ZSTD_estimateDStreamSize_fromFrame #define ZSTD_estimateDStreamSize zfs_ZSTD_estimateDStreamSize #define ZSTD_findDecompressedSize zfs_ZSTD_findDecompressedSize #define ZSTD_findFrameCompressedSize zfs_ZSTD_findFrameCompressedSize #define ZSTD_frameHeaderSize zfs_ZSTD_frameHeaderSize #define ZSTD_freeDCtx zfs_ZSTD_freeDCtx #define ZSTD_freeDStream zfs_ZSTD_freeDStream #define ZSTD_getDecompressedSize zfs_ZSTD_getDecompressedSize #define ZSTD_getDictID_fromDict zfs_ZSTD_getDictID_fromDict #define ZSTD_getDictID_fromFrame zfs_ZSTD_getDictID_fromFrame #define ZSTD_getFrameContentSize zfs_ZSTD_getFrameContentSize #define ZSTD_getFrameHeader_advanced zfs_ZSTD_getFrameHeader_advanced #define ZSTD_getFrameHeader zfs_ZSTD_getFrameHeader #define ZSTD_initDStream_usingDDict zfs_ZSTD_initDStream_usingDDict #define ZSTD_initDStream_usingDict zfs_ZSTD_initDStream_usingDict #define ZSTD_initDStream zfs_ZSTD_initDStream #define ZSTD_initStaticDCtx zfs_ZSTD_initStaticDCtx #define ZSTD_initStaticDStream zfs_ZSTD_initStaticDStream #define ZSTD_insertBlock zfs_ZSTD_insertBlock #define ZSTD_isFrame zfs_ZSTD_isFrame #define ZSTD_loadDEntropy zfs_ZSTD_loadDEntropy #define ZSTD_nextInputType zfs_ZSTD_nextInputType #define ZSTD_nextSrcSizeToDecompress zfs_ZSTD_nextSrcSizeToDecompress #define ZSTD_resetDStream zfs_ZSTD_resetDStream #define ZSTD_sizeof_DCtx zfs_ZSTD_sizeof_DCtx #define ZSTD_sizeof_DStream zfs_ZSTD_sizeof_DStream /* lib/decompress/zstd_decompress_block.o: */ #define ZSTD_buildFSETable zfs_ZSTD_buildFSETable #define ZSTD_checkContinuity zfs_ZSTD_checkContinuity #define ZSTD_decodeLiteralsBlock zfs_ZSTD_decodeLiteralsBlock #define ZSTD_decodeSeqHeaders zfs_ZSTD_decodeSeqHeaders #define ZSTD_decompressBlock_internal zfs_ZSTD_decompressBlock_internal #define ZSTD_decompressBlock zfs_ZSTD_decompressBlock #define ZSTD_getcBlockSize zfs_ZSTD_getcBlockSize diff --git a/module/zstd/lib/common/fse_decompress.c b/module/zstd/lib/common/fse_decompress.c index bcc2223ccc65..6b3205c63cc8 100644 --- a/module/zstd/lib/common/fse_decompress.c +++ b/module/zstd/lib/common/fse_decompress.c @@ -1,286 +1,275 @@ /* ****************************************************************** * FSE : Finite State Entropy decoder * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. * * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy * - Public forum : https://groups.google.com/forum/#!forum/lz4c * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. ****************************************************************** */ /* ************************************************************** * Includes ****************************************************************/ #include /* malloc, free, qsort */ #include /* memcpy, memset */ #include "bitstream.h" #include "compiler.h" #define FSE_STATIC_LINKING_ONLY #include "fse.h" #include "error_private.h" /* ************************************************************** * Error Management ****************************************************************/ #define FSE_isError ERR_isError #define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */ /* ************************************************************** * Templates ****************************************************************/ /* designed to be included for type-specific functions (template emulation in C) Objective is to write these functions only once, for improved maintenance */ /* safety checks */ #ifndef FSE_FUNCTION_EXTENSION # error "FSE_FUNCTION_EXTENSION must be defined" #endif #ifndef FSE_FUNCTION_TYPE # error "FSE_FUNCTION_TYPE must be defined" #endif /* Function names */ #define FSE_CAT(X,Y) X##Y #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) /* Function templates */ -FSE_DTable* FSE_createDTable (unsigned tableLog) -{ - if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; - return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); -} - -void FSE_freeDTable (FSE_DTable* dt) -{ - free(dt); -} - size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) { void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr); U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1]; U32 const maxSV1 = maxSymbolValue + 1; U32 const tableSize = 1 << tableLog; U32 highThreshold = tableSize-1; /* Sanity Checks */ if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge); if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Init, lay down lowprob symbols */ { FSE_DTableHeader DTableH; DTableH.tableLog = (U16)tableLog; DTableH.fastMode = 1; { S16 const largeLimit= (S16)(1 << (tableLog-1)); U32 s; for (s=0; s= largeLimit) DTableH.fastMode=0; symbolNext[s] = normalizedCounter[s]; } } } memcpy(dt, &DTableH, sizeof(DTableH)); } /* Spread symbols */ { U32 const tableMask = tableSize-1; U32 const step = FSE_TABLESTEP(tableSize); U32 s, position = 0; for (s=0; s highThreshold) position = (position + step) & tableMask; /* lowprob area */ } } if (position!=0) return ERROR(GENERIC); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ } /* Build Decoding table */ { U32 u; for (u=0; utableLog = 0; DTableH->fastMode = 0; cell->newState = 0; cell->symbol = symbolValue; cell->nbBits = 0; return 0; } size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) { void* ptr = dt; FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; void* dPtr = dt + 1; FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; const unsigned tableSize = 1 << nbBits; const unsigned tableMask = tableSize - 1; const unsigned maxSV1 = tableMask+1; unsigned s; /* Sanity checks */ if (nbBits < 1) return ERROR(GENERIC); /* min size */ /* Build Decoding Table */ DTableH->tableLog = (U16)nbBits; DTableH->fastMode = 1; for (s=0; s sizeof(bitD.bitContainer)*8) /* This test must be static */ BIT_reloadDStream(&bitD); op[1] = FSE_GETSYMBOL(&state2); if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */ { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } } op[2] = FSE_GETSYMBOL(&state1); if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */ BIT_reloadDStream(&bitD); op[3] = FSE_GETSYMBOL(&state2); } /* tail */ /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */ while (1) { if (op>(omax-2)) return ERROR(dstSize_tooSmall); *op++ = FSE_GETSYMBOL(&state1); if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) { *op++ = FSE_GETSYMBOL(&state2); break; } if (op>(omax-2)) return ERROR(dstSize_tooSmall); *op++ = FSE_GETSYMBOL(&state2); if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) { *op++ = FSE_GETSYMBOL(&state1); break; } } return op-ostart; } size_t FSE_decompress_usingDTable(void* dst, size_t originalSize, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt) { const void* ptr = dt; const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; const U32 fastMode = DTableH->fastMode; /* select fast mode (static) */ if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); } size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog) { const BYTE* const istart = (const BYTE*)cSrc; const BYTE* ip = istart; short counting[FSE_MAX_SYMBOL_VALUE+1]; unsigned tableLog; unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; /* normal FSE decoding mode */ size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize); if (FSE_isError(NCountLength)) return NCountLength; /* if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong); */ /* too small input size; supposed to be already checked in NCountLength, only remaining case : NCountLength==cSrcSize */ if (tableLog > maxLog) return ERROR(tableLog_tooLarge); ip += NCountLength; cSrcSize -= NCountLength; CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) ); return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */ } typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize) { DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */ return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG); } #endif /* FSE_COMMONDEFS_ONLY */