diff --git a/lib/libzpool/include/sys/zfs_context.h b/lib/libzpool/include/sys/zfs_context.h index 34c351bd0dbc..338c871e0df5 100644 --- a/lib/libzpool/include/sys/zfs_context.h +++ b/lib/libzpool/include/sys/zfs_context.h @@ -1,645 +1,650 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H #ifdef __cplusplus extern "C" { #endif #define _SYS_MUTEX_H #define _SYS_RWLOCK_H #define _SYS_CONDVAR_H #define _SYS_SYSTM_H #define _SYS_DEBUG_H #define _SYS_T_LOCK_H #define _SYS_VNODE_H #define _SYS_VFS_H #define _SYS_SUNDDI_H #define _SYS_CALLB_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Stack */ #define noinline __attribute__((noinline)) /* * Debugging */ /* * Note that we are not using the debugging levels. 
*/ #define CE_CONT 0 /* continuation */ #define CE_NOTE 1 /* notice */ #define CE_WARN 2 /* warning */ #define CE_PANIC 3 /* panic */ #define CE_IGNORE 4 /* print nothing */ extern int aok; /* * ZFS debugging */ #ifdef ZFS_DEBUG extern void dprintf_setup(int *argc, char **argv); #endif /* ZFS_DEBUG */ extern void cmn_err(int, const char *, ...); extern void vcmn_err(int, const char *, __va_list); extern void panic(const char *, ...); extern void vpanic(const char *, __va_list); #define fm_panic panic extern int aok; /* This definition is copied from assert.h. */ #if defined(__STDC__) #if __STDC_VERSION__ - 0 >= 199901L #define zverify(EX) (void)((EX) || (aok) || \ (__assert_c99(#EX, __FILE__, __LINE__, __func__), 0)) #else #define zverify(EX) (void)((EX) || (aok) || \ (__assert(#EX, __FILE__, __LINE__), 0)) #endif /* __STDC_VERSION__ - 0 >= 199901L */ #else #define zverify(EX) (void)((EX) || (aok) || \ (_assert("EX", __FILE__, __LINE__), 0)) #endif /* __STDC__ */ #define VERIFY zverify #define ASSERT zverify #undef assert #define assert zverify extern void __assert(const char *, const char *, int); #ifdef lint #define VERIFY3_IMPL(x, y, z, t) if (x == z) ((void)0) #else /* BEGIN CSTYLED */ #define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \ const TYPE __left = (TYPE)(LEFT); \ const TYPE __right = (TYPE)(RIGHT); \ if (!(__left OP __right) && (!aok)) { \ char *__buf = alloca(256); \ (void) snprintf(__buf, 256, "%s %s %s (0x%llx %s 0x%llx)", \ #LEFT, #OP, #RIGHT, \ (u_longlong_t)__left, #OP, (u_longlong_t)__right); \ __assert(__buf, __FILE__, __LINE__); \ } \ _NOTE(CONSTCOND) } while (0) /* END CSTYLED */ #endif /* lint */ #define VERIFY3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t) #define VERIFY3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) #define VERIFY3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) #ifdef NDEBUG #define ASSERT3S(x, y, z) ((void)0) #define ASSERT3U(x, y, z) ((void)0) #define ASSERT3P(x, y, z) ((void)0) #else #define ASSERT3S(x, y, z) VERIFY3S(x, y, z) 
#define ASSERT3U(x, y, z) VERIFY3U(x, y, z) #define ASSERT3P(x, y, z) VERIFY3P(x, y, z) #endif /* * DTrace SDT probes have different signatures in userland than they do in * kernel. If they're being used in kernel code, re-define them out of * existence for their counterparts in libzpool. */ #ifdef DTRACE_PROBE #undef DTRACE_PROBE #define DTRACE_PROBE(a) ((void)0) #endif /* DTRACE_PROBE */ #ifdef DTRACE_PROBE1 #undef DTRACE_PROBE1 #define DTRACE_PROBE1(a, b, c) ((void)0) #endif /* DTRACE_PROBE1 */ #ifdef DTRACE_PROBE2 #undef DTRACE_PROBE2 #define DTRACE_PROBE2(a, b, c, d, e) ((void)0) #endif /* DTRACE_PROBE2 */ #ifdef DTRACE_PROBE3 #undef DTRACE_PROBE3 #define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) #endif /* DTRACE_PROBE3 */ #ifdef DTRACE_PROBE4 #undef DTRACE_PROBE4 #define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) #endif /* DTRACE_PROBE4 */ /* * Threads */ #define TS_MAGIC 0x72f158ab4261e538ull #define TS_RUN 0x00000002 #ifdef __linux__ #define STACK_SIZE 8192 /* Linux x86 and amd64 */ #else #define STACK_SIZE 24576 /* Solaris */ #endif #ifdef NPTL_GUARD_WITHIN_STACK #define EXTRA_GUARD_BYTES PAGESIZE #else #define EXTRA_GUARD_BYTES 0 #endif /* in libzpool, p0 exists only to have its address taken */ typedef struct proc { uintptr_t this_is_never_used_dont_dereference_it; } proc_t; extern struct proc p0; typedef void (*thread_func_t)(void *); typedef void (*thread_func_arg_t)(void *); typedef pthread_t kt_did_t; typedef struct kthread { kt_did_t t_tid; thread_func_t t_func; void * t_arg; } kthread_t; #define tsd_get(key) pthread_getspecific(key) #define tsd_set(key, val) pthread_setspecific(key, val) #define curthread zk_thread_current() #define thread_exit zk_thread_exit #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(stk, stksize, (thread_func_t)func, arg, \ len, NULL, state, pri) #define thread_join(t) zk_thread_join(t) #define newproc(f,a,cid,pri,ctp,pid) (ENOSYS) extern kthread_t *zk_thread_current(void); 
extern void zk_thread_exit(void); extern kthread_t *zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, size_t len, proc_t *pp, int state, pri_t pri); extern void zk_thread_join(kt_did_t tid); #define PS_NONE -1 #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) /* * Mutexes */ #define MTX_MAGIC 0x9522f51362a6e326ull #define MTX_INIT ((void *)NULL) #define MTX_DEST ((void *)-1UL) typedef struct kmutex { void *m_owner; uint64_t m_magic; pthread_mutex_t m_lock; } kmutex_t; #define MUTEX_DEFAULT 0 #define MUTEX_HELD(m) ((m)->m_owner == curthread) #define MUTEX_NOT_HELD(m) (!MUTEX_HELD(m)) extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); extern void mutex_destroy(kmutex_t *mp); extern void mutex_enter(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); extern void *mutex_owner(kmutex_t *mp); extern int mutex_held(kmutex_t *mp); /* * RW locks */ #define RW_MAGIC 0x4d31fb123648e78aull #define RW_INIT ((void *)NULL) #define RW_DEST ((void *)-1UL) typedef struct krwlock { void *rw_owner; void *rw_wr_owner; uint64_t rw_magic; pthread_rwlock_t rw_lock; uint_t rw_readers; } krwlock_t; typedef int krw_t; #define RW_READER 0 #define RW_WRITER 1 #define RW_DEFAULT RW_READER #define RW_READ_HELD(x) ((x)->rw_readers > 0) #define RW_WRITE_HELD(x) ((x)->rw_wr_owner == curthread) #define RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x)) extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); extern void rw_destroy(krwlock_t *rwlp); extern void rw_enter(krwlock_t *rwlp, krw_t rw); extern int rw_tryenter(krwlock_t *rwlp, krw_t rw); extern int rw_tryupgrade(krwlock_t *rwlp); extern void rw_exit(krwlock_t *rwlp); #define rw_downgrade(rwlp) do { } while (0) extern uid_t crgetuid(cred_t *cr); extern gid_t crgetgid(cred_t *cr); extern int crgetngroups(cred_t *cr); extern gid_t *crgetgroups(cred_t *cr); /* * Condition variables */ #define CV_MAGIC 0xd31ea9a83b1b30c4ull 
typedef struct kcondvar { uint64_t cv_magic; pthread_cond_t cv; } kcondvar_t; #define CV_DEFAULT 0 extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); extern void cv_destroy(kcondvar_t *cv); extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); extern void cv_signal(kcondvar_t *cv); extern void cv_broadcast(kcondvar_t *cv); /* * kstat creation, installation and deletion */ extern kstat_t *kstat_create(char *, int, char *, char *, uchar_t, ulong_t, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); /* * Kernel memory */ #define KM_SLEEP UMEM_NOFAIL #define KM_PUSHPAGE KM_SLEEP #define KM_NOSLEEP UMEM_DEFAULT +#define KM_NODEBUG 0x0 #define KMC_NODEBUG UMC_NODEBUG #define kmem_alloc(_s, _f) umem_alloc(_s, _f) #define kmem_zalloc(_s, _f) umem_zalloc(_s, _f) #define kmem_free(_b, _s) umem_free(_b, _s) +#define vmem_alloc(_s, _f) kmem_alloc(_s, _f) +#define vmem_zalloc(_s, _f) kmem_zalloc(_s, _f) +#define vmem_free(_b, _s) kmem_free(_b, _s) #define kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \ umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) #define kmem_cache_destroy(_c) umem_cache_destroy(_c) #define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f) #define kmem_cache_free(_c, _b) umem_cache_free(_c, _b) #define kmem_debugging() 0 #define kmem_cache_reap_now(_c) /* nothing */ #define kmem_cache_set_move(_c, _cb) /* nothing */ #define POINTER_INVALIDATE(_pp) /* nothing */ #define POINTER_IS_VALID(_p) 0 typedef umem_cache_t kmem_cache_t; typedef enum kmem_cbrc { KMEM_CBRC_YES, KMEM_CBRC_NO, KMEM_CBRC_LATER, KMEM_CBRC_DONT_NEED, KMEM_CBRC_DONT_KNOW } kmem_cbrc_t; /* * Task queues */ typedef struct taskq taskq_t; typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); #define TASKQ_PREPOPULATE 0x0001 #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ 
#define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale # threads by # cpus */ #define TASKQ_DC_BATCH 0x0010 /* Mark threads as batch */ #define TQ_SLEEP KM_SLEEP /* Can block for memory */ #define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_FRONT 0x08 /* Queue in front */ extern taskq_t *system_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); #define taskq_create_proc(a, b, c, d, e, p, f) \ (taskq_create(a, b, c, d, e, f)) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ (taskq_create(a, b, maxclsyspri, d, e, f)) extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); extern int taskq_member(taskq_t *, kthread_t *); extern void system_taskq_init(void); extern void system_taskq_fini(void); #define XVA_MAPSIZE 3 #define XVA_MAGIC 0x78766174 /* * vnodes */ typedef struct vnode { uint64_t v_size; int v_fd; char *v_path; } vnode_t; #define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ typedef struct xoptattr { timestruc_t xoa_createtime; /* Create time of file */ uint8_t xoa_archive; uint8_t xoa_system; uint8_t xoa_readonly; uint8_t xoa_hidden; uint8_t xoa_nounlink; uint8_t xoa_immutable; uint8_t xoa_appendonly; uint8_t xoa_nodump; uint8_t xoa_settable; uint8_t xoa_opaque; uint8_t xoa_av_quarantined; uint8_t xoa_av_modified; uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; uint8_t xoa_reparse; uint8_t xoa_offline; uint8_t xoa_sparse; } xoptattr_t; typedef struct vattr { uint_t va_mask; /* bit-mask of attributes */ u_offset_t va_size; /* file size in bytes */ } vattr_t; typedef struct xvattr { vattr_t xva_vattr; /* Embedded vattr structure */ uint32_t xva_magic; /* Magic Number */ uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ 
uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ xoptattr_t xva_xoptattrs; /* Optional attributes */ } xvattr_t; typedef struct vsecattr { uint_t vsa_mask; /* See below */ int vsa_aclcnt; /* ACL entry count */ void *vsa_aclentp; /* pointer to ACL entries */ int vsa_dfaclcnt; /* default ACL entry count */ void *vsa_dfaclentp; /* pointer to default ACL entries */ size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ } vsecattr_t; #define AT_TYPE 0x00001 #define AT_MODE 0x00002 #define AT_UID 0x00004 #define AT_GID 0x00008 #define AT_FSID 0x00010 #define AT_NODEID 0x00020 #define AT_NLINK 0x00040 #define AT_SIZE 0x00080 #define AT_ATIME 0x00100 #define AT_MTIME 0x00200 #define AT_CTIME 0x00400 #define AT_RDEV 0x00800 #define AT_BLKSIZE 0x01000 #define AT_NBLOCKS 0x02000 #define AT_SEQ 0x08000 #define AT_XVATTR 0x10000 #define CRCREAT 0 extern int fop_getattr(vnode_t *vp, vattr_t *vap); #define VOP_CLOSE(vp, f, c, o, cr, ct) 0 #define VOP_PUTPAGE(vp, of, sz, fl, cr, ct) 0 #define VOP_GETATTR(vp, vap, fl, cr, ct) fop_getattr((vp), (vap)); #define VOP_FSYNC(vp, f, cr, ct) fsync((vp)->v_fd) #define VN_RELE(vp) vn_close(vp) extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp, int x2, int x3); extern int vn_openat(char *path, int x1, int oflags, int mode, vnode_t **vpp, int x2, int x3, vnode_t *vp, int fd); extern int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp); extern void vn_close(vnode_t *vp); #define vn_remove(path, x1, x2) remove(path) #define vn_rename(from, to, seg) rename((from), (to)) #define vn_is_readonly(vp) B_FALSE extern vnode_t *rootdir; #include /* for FREAD, FWRITE, etc */ /* * Random stuff */ #define ddi_get_lbolt() (gethrtime() >> 23) #define ddi_get_lbolt64() (gethrtime() >> 23) #define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */ extern void delay(clock_t ticks); #define gethrestime_sec() time(NULL) #define 
gethrestime(t) \ do {\ (t)->tv_sec = gethrestime_sec();\ (t)->tv_nsec = 0;\ } while (0); #define max_ncpus 64 #define minclsyspri 60 #define maxclsyspri 99 #define CPU_SEQID (pthread_self() & (max_ncpus - 1)) #define kcred NULL #define CRED() NULL #define ptob(x) ((x) * PAGESIZE) extern uint64_t physmem; extern int highbit(ulong_t i); extern int random_get_bytes(uint8_t *ptr, size_t len); extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); extern void kernel_init(int); extern void kernel_fini(void); struct spa; extern void nicenum(uint64_t num, char *buf); extern void show_pool_stats(struct spa *); typedef struct callb_cpr { kmutex_t *cc_lockp; } callb_cpr_t; #define CALLB_CPR_INIT(cp, lockp, func, name) { \ (cp)->cc_lockp = lockp; \ } #define CALLB_CPR_SAFE_BEGIN(cp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ } #define CALLB_CPR_SAFE_END(cp, lockp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ } #define CALLB_CPR_EXIT(cp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ mutex_exit((cp)->cc_lockp); \ } #define zone_dataset_visible(x, y) (1) #define INGLOBALZONE(z) (1) +extern char *kmem_vasprintf(const char *fmt, va_list adx); extern char *kmem_asprintf(const char *fmt, ...); #define strfree(str) kmem_free((str), strlen(str)+1) /* * Hostname information */ extern char hw_serial[]; /* for userland-emulated hostid access */ extern int ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result); extern int ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result); /* ZFS Boot Related stuff. 
*/ struct _buf { intptr_t _fd; }; struct bootstat { uint64_t st_size; }; typedef struct ace_object { uid_t a_who; uint32_t a_access_mask; uint16_t a_flags; uint16_t a_type; uint8_t a_obj_type[16]; uint8_t a_inherit_obj_type[16]; } ace_object_t; #define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 #define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 #define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 #define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 extern struct _buf *kobj_open_file(char *name); extern int kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off); extern void kobj_close_file(struct _buf *file); extern int kobj_get_filesize(struct _buf *file, uint64_t *size); extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern zoneid_t getzoneid(void); /* SID stuff */ typedef struct ksiddomain { uint_t kd_ref; uint_t kd_len; char *kd_name; } ksiddomain_t; ksiddomain_t *ksid_lookupdomain(const char *); void ksiddomain_rele(ksiddomain_t *); #define DDI_SLEEP KM_SLEEP #define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \ sysevent_post_event(_c, _d, _b, "libzpool", _e, _f) #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 0559347e96b9..494e544ea7f8 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -1,1168 +1,1170 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
*
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * NOTE(review): the operands of the #include directives below are missing
 * in this copy of the file; confirm them against the upstream source.
 */
#include #include #include #include #include #include #include #include #include #include #include #include #include #include

/*
 * Emulation of kernel services in userland.
 */

/* Consulted by the assertion macros in sys/zfs_context.h. */
int aok;
uint64_t physmem;
/* Sentinel root vnode; vn_openat() asserts that its startvp equals this. */
vnode_t *rootdir = (vnode_t *)0xabcd1234;
char hw_serial[HW_HOSTID_LEN];

struct utsname utsname = {
	"userland", "libzpool", "1", "1", "na"
};

/* this only exists to have its address taken */
struct proc p0;

/*
 * =========================================================================
 * threads
 * =========================================================================
 */

/*
 * kthread_nr counts the emulated threads that are currently alive.  It is
 * modified under kthread_lock by zk_thread_helper(), zk_thread_exit() and
 * thread_fini(); kthread_cond is broadcast by zk_thread_exit() so that
 * thread_fini() can wait for the count to reach zero.  kthread_key is the
 * thread-specific-data key under which each thread's kthread_t is stored.
 */
pthread_cond_t kthread_cond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER;
pthread_key_t kthread_key;
int kthread_nr = 0;

/*
 * Create kthread_key and register a kthread_t (with a NULL t_func) for the
 * calling thread, then set kthread_nr to 1.
 */
static void
thread_init(void)
{
	kthread_t *kt;

	VERIFY3S(pthread_key_create(&kthread_key, NULL), ==, 0);

	/* Create entry for primary kthread */
	kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL);
	kt->t_tid = pthread_self();
	kt->t_func = NULL;

	VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0);

	/* Only the main thread should be running at the moment */
	ASSERT3S(kthread_nr, ==, 0);
	kthread_nr = 1;
}

/*
 * Counterpart of thread_init(): free the calling thread's kthread_t (which
 * is asserted to be the entry with a NULL t_func), drop it from kthread_nr,
 * block until every other thread has left through zk_thread_exit(), and
 * finally delete kthread_key.
 */
static void
thread_fini(void)
{
	kthread_t *kt = curthread;

	ASSERT(pthread_equal(kt->t_tid, pthread_self()));
	ASSERT3P(kt->t_func, ==, NULL);

	umem_free(kt, sizeof(kthread_t));

	/* Wait for all threads to exit via thread_exit() */
	VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);

	kthread_nr--; /* Main thread is exiting */

	while (kthread_nr > 0)
		VERIFY3S(pthread_cond_wait(&kthread_cond, &kthread_lock), ==,
		    0);

	ASSERT3S(kthread_nr, ==, 0);
	VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);

	VERIFY3S(pthread_key_delete(kthread_key), ==, 0);
}

/*
 * Return the kthread_t stored under kthread_key for the calling thread.
 * Asserts that an entry exists.
 */
kthread_t *
zk_thread_current(void)
{
	kthread_t *kt = pthread_getspecific(kthread_key);

	ASSERT3P(kt, !=, NULL);

	return kt;
}

/*
 * pthread start routine used by zk_thread_create().  arg is the new
 * thread's kthread_t: it is stored under kthread_key, kthread_nr is
 * incremented under kthread_lock, and then t_func is called with t_arg.
 * The process aborts if t_func ever returns.
 */
void *
zk_thread_helper(void *arg)
{
	kthread_t *kt = (kthread_t *) arg;

	VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0);

	VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
	kthread_nr++;
	VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);

	kt->t_tid = pthread_self();
	((thread_func_arg_t) kt->t_func)(kt->t_arg);

	/* Unreachable, thread must exit with thread_exit() */
	abort();

	return NULL;
}

/*
 * Start a new thread that runs func(arg) through zk_thread_helper().
 *
 *	stksize	requested stack size; the larger of stksize and STACK_SIZE
 *		is used, plus PTHREAD_STACK_MIN and EXTRA_GUARD_BYTES
 *	func	function the new thread runs
 *	arg	argument passed to func
 *	state	asserted to contain no bits other than TS_RUN
 *
 * stk, len, pp and pri are not referenced by this function.
 *
 * Returns the zero-allocated kthread_t describing the new thread; it is
 * freed by zk_thread_exit() when that thread exits.
 */
kthread_t *
zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg,
    size_t len, proc_t *pp, int state, pri_t pri)
{
	kthread_t *kt;
	pthread_attr_t attr;
	size_t stack;

	ASSERT3S(state & ~TS_RUN, ==, 0);

	kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL);
	kt->t_func = func;
	kt->t_arg = arg;

	/*
	 * The Solaris kernel stack size is 24k for x86/x86_64.
	 * The Linux kernel stack size is 8k for x86/x86_64.
	 *
	 * We reduce the default stack size in userspace, to ensure
	 * we observe stack overruns in user space as well as in
	 * kernel space.  PTHREAD_STACK_MIN is the minimum stack
	 * required for a NULL procedure in user space and is added
	 * in to the stack requirements.
	 *
	 * Some buggy NPTL threading implementations include the
	 * guard area within the stack size allocations.  In
	 * this case we allocate an extra page to account for the
	 * guard area since we only have two pages of usable stack
	 * on Linux.
	 */

	stack = PTHREAD_STACK_MIN + MAX(stksize, STACK_SIZE) +
	    EXTRA_GUARD_BYTES;

	VERIFY3S(pthread_attr_init(&attr), ==, 0);
	VERIFY3S(pthread_attr_setstacksize(&attr, stack), ==, 0);
	VERIFY3S(pthread_attr_setguardsize(&attr, PAGESIZE), ==, 0);

	VERIFY3S(pthread_create(&kt->t_tid, &attr, &zk_thread_helper, kt),
	    ==, 0);

	VERIFY3S(pthread_attr_destroy(&attr), ==, 0);

	return kt;
}

/*
 * Terminate the calling thread: free its kthread_t, decrement kthread_nr
 * under kthread_lock, broadcast kthread_cond, and exit with TS_MAGIC as the
 * thread's exit value (checked by zk_thread_join()).
 */
void
zk_thread_exit(void)
{
	kthread_t *kt = curthread;

	ASSERT(pthread_equal(kt->t_tid, pthread_self()));

	umem_free(kt, sizeof(kthread_t));

	pthread_mutex_lock(&kthread_lock);
	kthread_nr--;
	pthread_mutex_unlock(&kthread_lock);

	pthread_cond_broadcast(&kthread_cond);
	pthread_exit((void *)TS_MAGIC);
}

/*
 * Join the thread identified by tid and verify that its exit value is
 * TS_MAGIC, i.e. that it terminated through zk_thread_exit().
 */
void
zk_thread_join(kt_did_t tid)
{
	void *ret;

	pthread_join((pthread_t)tid, &ret);
	VERIFY3P(ret, ==, (void *)TS_MAGIC);
}

/*
 * =========================================================================
 * kstats
 * =========================================================================
 */

/* Stub: ignores every argument and always returns NULL. */
/*ARGSUSED*/
kstat_t *
kstat_create(char *module, int instance, char *name, char *class,
    uchar_t type, ulong_t ndata, uchar_t ks_flag)
{
	return (NULL);
}

/* Stub: does nothing. */
/*ARGSUSED*/
void
kstat_install(kstat_t *ksp)
{}

/* Stub: does nothing. */
/*ARGSUSED*/
void
kstat_delete(kstat_t *ksp)
{}

/*
 * =========================================================================
 * mutexes
 * =========================================================================
 */

/*
 * Initialise mp as an unowned mutex: m_owner is MTX_INIT, m_magic is
 * MTX_MAGIC and the underlying pthread mutex uses default attributes.
 * type is asserted to be MUTEX_DEFAULT and cookie to be NULL; name is not
 * referenced.
 */
void
mutex_init(kmutex_t *mp, char *name, int type, void *cookie)
{
	ASSERT3S(type, ==, MUTEX_DEFAULT);
	ASSERT3P(cookie, ==, NULL);
	mp->m_owner = MTX_INIT;
	mp->m_magic = MTX_MAGIC;
	VERIFY3S(pthread_mutex_init(&mp->m_lock, NULL), ==, 0);
}

/*
 * Destroy mp.  Asserts that it is initialised and unowned, then marks it
 * with MTX_DEST and clears the magic so later use trips the assertions.
 */
void
mutex_destroy(kmutex_t *mp)
{
	ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
	ASSERT3P(mp->m_owner, ==, MTX_INIT);
	VERIFY3S(pthread_mutex_destroy(&(mp)->m_lock), ==, 0);
	mp->m_owner = MTX_DEST;
	mp->m_magic = 0;
}

/*
 * Acquire mp.  Asserts beforehand that the mutex is initialised, has not
 * been destroyed, and is not already owned by the calling thread.
 */
void
mutex_enter(kmutex_t *mp)
{
	ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
	ASSERT3P(mp->m_owner, !=, MTX_DEST);
	ASSERT3P(mp->m_owner, !=, curthread);
VERIFY3S(pthread_mutex_lock(&mp->m_lock), ==, 0);
	/* Nobody may be recorded as owner once the pthread lock is ours. */
	ASSERT3P(mp->m_owner, ==, MTX_INIT);
	mp->m_owner = curthread;
}

/*
 * Try to acquire mp without blocking.  Returns 1 and records the calling
 * thread as owner when the lock was taken, 0 otherwise.
 */
int
mutex_tryenter(kmutex_t *mp)
{
	ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
	ASSERT3P(mp->m_owner, !=, MTX_DEST);
	if (0 == pthread_mutex_trylock(&mp->m_lock)) {
		ASSERT3P(mp->m_owner, ==, MTX_INIT);
		mp->m_owner = curthread;
		return (1);
	} else {
		return (0);
	}
}

/*
 * Release mp.  Asserts that the calling thread is the recorded owner; the
 * owner field is reset before the pthread mutex is unlocked.
 */
void
mutex_exit(kmutex_t *mp)
{
	ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
	ASSERT3P(mutex_owner(mp), ==, curthread);
	mp->m_owner = MTX_INIT;
	VERIFY3S(pthread_mutex_unlock(&mp->m_lock), ==, 0);
}

/* Return the recorded owner of mp (MTX_INIT when it is not held). */
void *
mutex_owner(kmutex_t *mp)
{
	ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
	return (mp->m_owner);
}

/* Return non-zero when the calling thread is the recorded owner of mp. */
int
mutex_held(kmutex_t *mp)
{
	return (mp->m_owner == curthread);
}

/*
 * =========================================================================
 * rwlocks
 * =========================================================================
 */

/*
 * Initialise rwlp with no owner, no writer and a reader count of zero.
 * type is asserted to be RW_DEFAULT and arg to be NULL; name is not
 * referenced.
 */
void
rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
{
	ASSERT3S(type, ==, RW_DEFAULT);
	ASSERT3P(arg, ==, NULL);
	VERIFY3S(pthread_rwlock_init(&rwlp->rw_lock, NULL), ==, 0);
	rwlp->rw_owner = RW_INIT;
	rwlp->rw_wr_owner = RW_INIT;
	rwlp->rw_readers = 0;
	rwlp->rw_magic = RW_MAGIC;
}

/* Destroy rwlp and clear its magic number. */
void
rw_destroy(krwlock_t *rwlp)
{
	ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
	VERIFY3S(pthread_rwlock_destroy(&rwlp->rw_lock), ==, 0);
	rwlp->rw_magic = 0;
}

/*
 * Acquire rwlp as reader (rw == RW_READER) or as writer (any other value).
 * Readers bump rw_readers atomically; a writer records itself in
 * rw_wr_owner.  In both cases rw_owner is set to the calling thread.
 * Asserts that the caller is not already recorded as owner or writer.
 */
void
rw_enter(krwlock_t *rwlp, krw_t rw)
{
	ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
	ASSERT3P(rwlp->rw_owner, !=, curthread);
	ASSERT3P(rwlp->rw_wr_owner, !=, curthread);

	if (rw == RW_READER) {
		VERIFY3S(pthread_rwlock_rdlock(&rwlp->rw_lock), ==, 0);
		ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT);

		atomic_inc_uint(&rwlp->rw_readers);
	} else {
		VERIFY3S(pthread_rwlock_wrlock(&rwlp->rw_lock), ==, 0);
		ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT);
		ASSERT3U(rwlp->rw_readers, ==, 0);

		rwlp->rw_wr_owner = curthread;
	}

	rwlp->rw_owner = curthread;
}

/*
 * Release rwlp.  A non-zero reader count means the caller is treated as a
 * reader and the count is decremented; otherwise the writer field is
 * cleared.  The bookkeeping is updated before the pthread lock is released.
 */
void
rw_exit(krwlock_t *rwlp)
{
	ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
	ASSERT(RW_LOCK_HELD(rwlp));

	if (RW_READ_HELD(rwlp))
		atomic_dec_uint(&rwlp->rw_readers);
	else
		rwlp->rw_wr_owner = RW_INIT;

	rwlp->rw_owner = RW_INIT;
	VERIFY3S(pthread_rwlock_unlock(&rwlp->rw_lock), ==, 0);
}

/*
 * Non-blocking variant of rw_enter().  Returns 1 when the lock was taken
 * and 0 when it was busy; any pthread error other than EBUSY fails the
 * VERIFY.
 */
int
rw_tryenter(krwlock_t *rwlp, krw_t rw)
{
	int rv;

	ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);

	if (rw == RW_READER)
		rv = pthread_rwlock_tryrdlock(&rwlp->rw_lock);
	else
		rv = pthread_rwlock_trywrlock(&rwlp->rw_lock);

	if (rv == 0) {
		ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT);

		if (rw == RW_READER)
			atomic_inc_uint(&rwlp->rw_readers);
		else {
			ASSERT3U(rwlp->rw_readers, ==, 0);
			rwlp->rw_wr_owner = curthread;
		}

		rwlp->rw_owner = curthread;
		return (1);
	}

	VERIFY3S(rv, ==, EBUSY);

	return (0);
}

/* Upgrades are not emulated: always returns 0 after the magic check. */
int
rw_tryupgrade(krwlock_t *rwlp)
{
	ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);

	return (0);
}

/*
 * =========================================================================
 * condition variables
 * =========================================================================
 */

/*
 * Initialise cv with default pthread attributes.  type is asserted to be
 * CV_DEFAULT; name and arg are not referenced.
 */
void
cv_init(kcondvar_t *cv, char *name, int type, void *arg)
{
	ASSERT3S(type, ==, CV_DEFAULT);
	cv->cv_magic = CV_MAGIC;
	VERIFY3S(pthread_cond_init(&cv->cv, NULL), ==, 0);
}

/* Destroy cv and clear its magic number. */
void
cv_destroy(kcondvar_t *cv)
{
	ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
	VERIFY3S(pthread_cond_destroy(&cv->cv), ==, 0);
	cv->cv_magic = 0;
}

/*
 * Wait on cv with mp held by the caller.  The recorded owner of mp is
 * cleared for the duration of the wait and restored afterwards.  A non-zero
 * return from pthread_cond_wait() is tolerated only when it is EINTR.
 */
void
cv_wait(kcondvar_t *cv, kmutex_t *mp)
{
	ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
	ASSERT3P(mutex_owner(mp), ==, curthread);
	mp->m_owner = MTX_INIT;
	int ret = pthread_cond_wait(&cv->cv, &mp->m_lock);
	if (ret != 0)
		VERIFY3S(ret, ==, EINTR);
	mp->m_owner = curthread;
}

/*
 * Wait on cv until it is signalled or until ddi_get_lbolt() reaches
 * abstime.  Returns -1 without waiting when abstime is not in the future.
 * The remaining ticks are converted to an absolute timespec using hz and
 * the current gettimeofday() value.
 */
clock_t
cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
{
	int error;
	struct timeval tv;
	timestruc_t ts;
	clock_t delta;

	ASSERT3U(cv->cv_magic, ==, CV_MAGIC);

top:
	delta = abstime - ddi_get_lbolt();
	if (delta <= 0)
		return (-1);

	VERIFY(gettimeofday(&tv, NULL) == 0);

	ts.tv_sec = tv.tv_sec + delta / hz;
	ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz);
	/* Carry a nanosecond overflow into the seconds field. */
	if (ts.tv_nsec >= NANOSEC) {
		ts.tv_sec++;
		ts.tv_nsec -= NANOSEC;
	}

	ASSERT3P(mutex_owner(mp), ==, curthread);
	mp->m_owner = MTX_INIT;
error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts); mp->m_owner = curthread; if (error == ETIMEDOUT) return (-1); if (error == EINTR) goto top; VERIFY3S(error, ==, 0); return (1); } void cv_signal(kcondvar_t *cv) { ASSERT3U(cv->cv_magic, ==, CV_MAGIC); VERIFY3S(pthread_cond_signal(&cv->cv), ==, 0); } void cv_broadcast(kcondvar_t *cv) { ASSERT3U(cv->cv_magic, ==, CV_MAGIC); VERIFY3S(pthread_cond_broadcast(&cv->cv), ==, 0); } /* * ========================================================================= * vnode operations * ========================================================================= */ /* * Note: for the xxxat() versions of these functions, we assume that the * starting vp is always rootdir (which is true for spa_directory.c, the only * ZFS consumer of these interfaces). We assert this is true, and then emulate * them by adding '/' in front of the path. */ /*ARGSUSED*/ int vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) { int fd; vnode_t *vp; int old_umask; char *realpath; struct stat64 st; int err; realpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); /* * If we're accessing a real disk from userland, we need to use * the character interface to avoid caching. This is particularly * important if we're trying to look at a real in-kernel storage * pool from userland, e.g. via zdb, because otherwise we won't * see the changes occurring under the segmap cache. * On the other hand, the stupid character device returns zero * for its size. So -- gag -- we open the block device to get * its size, and remember it for subsequent VOP_GETATTR(). 
*/ if (strncmp(path, "/dev/", 5) == 0) { char *dsk; fd = open64(path, O_RDONLY); if (fd == -1) { err = errno; free(realpath); return (err); } if (fstat64(fd, &st) == -1) { err = errno; close(fd); free(realpath); return (err); } close(fd); (void) sprintf(realpath, "%s", path); dsk = strstr(path, "/dsk/"); if (dsk != NULL) (void) sprintf(realpath + (dsk - path) + 1, "r%s", dsk + 1); } else { (void) sprintf(realpath, "%s", path); if (!(flags & FCREAT) && stat64(realpath, &st) == -1) { err = errno; free(realpath); return (err); } } if (flags & FCREAT) old_umask = umask(0); /* * The construct 'flags - FREAD' conveniently maps combinations of * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. */ fd = open64(realpath, flags - FREAD, mode); free(realpath); if (flags & FCREAT) (void) umask(old_umask); if (fd == -1) return (errno); if (fstat64(fd, &st) == -1) { err = errno; close(fd); return (err); } (void) fcntl(fd, F_SETFD, FD_CLOEXEC); *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); vp->v_fd = fd; vp->v_size = st.st_size; vp->v_path = spa_strdup(path); return (0); } /*ARGSUSED*/ int vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3, vnode_t *startvp, int fd) { char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); int ret; ASSERT(startvp == rootdir); (void) sprintf(realpath, "/%s", path); /* fd ignored for now, need if want to simulate nbmand support */ ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); umem_free(realpath, strlen(path) + 2); return (ret); } /*ARGSUSED*/ int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) { ssize_t rc, done = 0, split; if (uio == UIO_READ) { rc = pread64(vp->v_fd, addr, len, offset); } else { /* * To simulate partial disk writes, we split writes into two * system calls so that the process can be killed in between. */ split = (len > 0 ? 
rand() % len : 0); rc = pwrite64(vp->v_fd, addr, split, offset); if (rc != -1) { done = rc; rc = pwrite64(vp->v_fd, (char *)addr + split, len - split, offset + split); } } if (rc == -1) return (errno); done += rc; if (residp) *residp = len - done; else if (done != len) return (EIO); return (0); } void vn_close(vnode_t *vp) { close(vp->v_fd); spa_strfree(vp->v_path); umem_free(vp, sizeof (vnode_t)); } /* * At a minimum we need to update the size since vdev_reopen() * will no longer call vn_openat(). */ int fop_getattr(vnode_t *vp, vattr_t *vap) { struct stat64 st; if (fstat64(vp->v_fd, &st) == -1) { close(vp->v_fd); return (errno); } vap->va_size = st.st_size; return (0); } #ifdef ZFS_DEBUG /* * ========================================================================= * Figure out which debugging statements to print * ========================================================================= */ static char *dprintf_string; static int dprintf_print_all; int dprintf_find_string(const char *string) { char *tmp_str = dprintf_string; int len = strlen(string); /* * Find out if this is a string we want to print. * String format: file1.c,function_name1,file2.c,file3.c */ while (tmp_str != NULL) { if (strncmp(tmp_str, string, len) == 0 && (tmp_str[len] == ',' || tmp_str[len] == '\0')) return (1); tmp_str = strchr(tmp_str, ','); if (tmp_str != NULL) tmp_str++; /* Get rid of , */ } return (0); } void dprintf_setup(int *argc, char **argv) { int i, j; /* * Debugging can be specified two ways: by setting the * environment variable ZFS_DEBUG, or by including a * "debug=..." argument on the command line. The command * line setting overrides the environment variable. 
*/ for (i = 1; i < *argc; i++) { int len = strlen("debug="); /* First look for a command line argument */ if (strncmp("debug=", argv[i], len) == 0) { dprintf_string = argv[i] + len; /* Remove from args */ for (j = i; j < *argc; j++) argv[j] = argv[j+1]; argv[j] = NULL; (*argc)--; } } if (dprintf_string == NULL) { /* Look for ZFS_DEBUG environment variable */ dprintf_string = getenv("ZFS_DEBUG"); } /* * Are we just turning on all debugging? */ if (dprintf_find_string("on")) dprintf_print_all = 1; } /* * ========================================================================= * debug printfs * ========================================================================= */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; va_list adx; /* * Get rid of annoying "../common/" prefix to filename. */ newfile = strrchr(file, '/'); if (newfile != NULL) { newfile = newfile + 1; /* Get rid of leading / */ } else { newfile = file; } if (dprintf_print_all || dprintf_find_string(newfile) || dprintf_find_string(func)) { /* Print out just the function name if requested */ flockfile(stdout); if (dprintf_find_string("pid")) (void) printf("%d ", getpid()); if (dprintf_find_string("tid")) (void) printf("%u ", (uint_t) pthread_self()); if (dprintf_find_string("cpu")) (void) printf("%u ", getcpuid()); if (dprintf_find_string("time")) (void) printf("%llu ", gethrtime()); if (dprintf_find_string("long")) (void) printf("%s, line %d: ", newfile, line); (void) printf("%s: ", func); va_start(adx, fmt); (void) vprintf(fmt, adx); va_end(adx); funlockfile(stdout); } } #endif /* ZFS_DEBUG */ /* * ========================================================================= * cmn_err() and panic() * ========================================================================= */ static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; void vpanic(const char *fmt, va_list 
adx) { (void) fprintf(stderr, "error: "); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "\n"); abort(); /* think of it as a "user-level crash dump" */ } void panic(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vpanic(fmt, adx); va_end(adx); } void vcmn_err(int ce, const char *fmt, va_list adx) { if (ce == CE_PANIC) vpanic(fmt, adx); if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ (void) fprintf(stderr, "%s", ce_prefix[ce]); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "%s", ce_suffix[ce]); } } /*PRINTFLIKE2*/ void cmn_err(int ce, const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(ce, fmt, adx); va_end(adx); } /* * ========================================================================= * kobj interfaces * ========================================================================= */ struct _buf * kobj_open_file(char *name) { struct _buf *file; vnode_t *vp; /* set vp as the _fd field of the file */ if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, -1) != 0) return ((void *)-1UL); file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); file->_fd = (intptr_t)vp; return (file); } int kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) { ssize_t resid; vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, UIO_SYSSPACE, 0, 0, 0, &resid); return (size - resid); } void kobj_close_file(struct _buf *file) { vn_close((vnode_t *)file->_fd); umem_free(file, sizeof (struct _buf)); } int kobj_get_filesize(struct _buf *file, uint64_t *size) { struct stat64 st; vnode_t *vp = (vnode_t *)file->_fd; if (fstat64(vp->v_fd, &st) == -1) { vn_close(vp); return (errno); } *size = st.st_size; return (0); } /* * ========================================================================= * misc routines * ========================================================================= */ void delay(clock_t ticks) { poll(0, 0, ticks * (1000 / hz)); } /* * Find highest one bit set. 
* Returns bit number + 1 of highest bit that is set, otherwise returns 0. * High order bit is 31 (or 63 in _LP64 kernel). */ int highbit(ulong_t i) { register int h = 1; if (i == 0) return (0); #ifdef _LP64 if (i & 0xffffffff00000000ul) { h += 32; i >>= 32; } #endif if (i & 0xffff0000) { h += 16; i >>= 16; } if (i & 0xff00) { h += 8; i >>= 8; } if (i & 0xf0) { h += 4; i >>= 4; } if (i & 0xc) { h += 2; i >>= 2; } if (i & 0x2) { h += 1; } return (h); } static int random_fd = -1, urandom_fd = -1; static int random_get_bytes_common(uint8_t *ptr, size_t len, int fd) { size_t resid = len; ssize_t bytes; ASSERT(fd != -1); while (resid != 0) { bytes = read(fd, ptr, resid); ASSERT3S(bytes, >=, 0); ptr += bytes; resid -= bytes; } return (0); } int random_get_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, random_fd)); } int random_get_pseudo_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, urandom_fd)); } int ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result) { char *end; *result = strtoul(hw_serial, &end, base); if (*result == 0) return (errno); return (0); } int ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) { char *end; *result = strtoull(str, &end, base); if (*result == 0) return (errno); return (0); } /* * ========================================================================= * kernel emulation setup & teardown * ========================================================================= */ static int umem_out_of_memory(void) { char errmsg[] = "out of memory -- generating core dump\n"; (void) fprintf(stderr, "%s", errmsg); abort(); return (0); } void kernel_init(int mode) { umem_nofail_callback(umem_out_of_memory); physmem = sysconf(_SC_PHYS_PAGES); dprintf("physmem = %llu pages (%.2f GB)\n", physmem, (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", (mode & FWRITE) ? 
gethostid() : 0); VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1); VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1); thread_init(); system_taskq_init(); spa_init(mode); } void kernel_fini(void) { spa_fini(); system_taskq_fini(); thread_fini(); close(random_fd); close(urandom_fd); random_fd = -1; urandom_fd = -1; } uid_t crgetuid(cred_t *cr) { return (0); } gid_t crgetgid(cred_t *cr) { return (0); } int crgetngroups(cred_t *cr) { return (0); } gid_t * crgetgroups(cred_t *cr) { return (NULL); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (0); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { return (0); } ksiddomain_t * ksid_lookupdomain(const char *dom) { ksiddomain_t *kd; kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); kd->kd_name = spa_strdup(dom); return (kd); } void ksiddomain_rele(ksiddomain_t *ksid) { spa_strfree(ksid->kd_name); umem_free(ksid, sizeof (ksiddomain_t)); } -/* - * Do not change the length of the returned string; it must be freed - * with strfree(). - */ char * -kmem_asprintf(const char *fmt, ...) +kmem_vasprintf(const char *fmt, va_list adx) { - int size; - va_list adx; - char *buf; + char *buf = NULL; + va_list adx_copy; - va_start(adx, fmt); - size = vsnprintf(NULL, 0, fmt, adx) + 1; - va_end(adx); + va_copy(adx_copy, adx); + VERIFY(vasprintf(&buf, fmt, adx_copy) != -1); + va_end(adx_copy); - buf = kmem_alloc(size, KM_SLEEP); + return (buf); +} + +char * +kmem_asprintf(const char *fmt, ...) 
+{ + char *buf = NULL; + va_list adx; va_start(adx, fmt); - size = vsnprintf(buf, size, fmt, adx); + VERIFY(vasprintf(&buf, fmt, adx) != -1); va_end(adx); return (buf); } /* ARGSUSED */ int zfs_onexit_fd_hold(int fd, minor_t *minorp) { *minorp = 0; return (0); } /* ARGSUSED */ void zfs_onexit_fd_rele(int fd) { } /* ARGSUSED */ int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, uint64_t *action_handle) { return (0); } /* ARGSUSED */ int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) { return (0); } /* ARGSUSED */ int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) { return (0); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5ac73e1158ca..f1d51805b19f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1,4667 +1,4682 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. 
The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also makes the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, it's simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D.
Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * * Buffers do not have their own mutexes; rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each arc state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an arc list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * Arc buffers may have an associated eviction callback function. * This function will be invoked prior to removing the buffer (e.g. * in arc_do_user_evicts()). Note however that the data associated * with the buffer may be evicted prior to the callback. The callback * must be made with *no locks held* (to prevent deadlock). Additionally, * the users of callbacks must ensure that their private data is * protected from simultaneous callbacks from arc_buf_evict() * and arc_do_user_evicts(). * * Note that the majority of the performance stats are manipulated * with atomic operations.
* * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #endif #include #include #include static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; extern int zfs_write_limit_shift; extern uint64_t zfs_write_limit_max; extern kmutex_t zfs_write_limit_lock; #define ARC_REDUCE_DNLC_PERCENT 3 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; typedef enum arc_reclaim_strategy { ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ ARC_RECLAIM_CONS /* Conservative reclaim strategy */ } arc_reclaim_strategy_t; /* number of seconds before growing cache again */ static int arc_grow_retry = 60; /* shift of arc_c for calculating both min and max arc_p */ static int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ static int arc_shrink_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static int arc_min_prefetch_lifespan; static int arc_dead; /* * The arc has filled available memory and has now warmed up. */ static boolean_t arc_warm; /* * These tunables are for performance analysis. 
*/ uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recently used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to a buffer, it is * linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will acquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second * level ARC but no longer in any of the ARC_m* lists. The second * level ARC itself may also contain buffers that are in any of * the ARC_m* states - meaning that a buffer can exist in two * places. The reason for the ARC_l2c_only state is to keep the * buffer header in the hash table, so that reads that hit the * second level ARC benefit from these fast lookups. 
*/ typedef struct arc_state { list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ uint64_t arcs_size; /* total amount of data in this state */ kmutex_t arcs_mtx; } arc_state_t; /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; kstat_named_t arcstat_misses; kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_misses; kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_misses; kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_misses; kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_deleted; kstat_named_t arcstat_recycle_miss; kstat_named_t arcstat_mutex_miss; kstat_named_t arcstat_evict_skip; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_p; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; kstat_named_t arcstat_hdr_size; kstat_named_t arcstat_data_size; kstat_named_t arcstat_other_size; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t 
arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_hdr_miss; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; } arc_stats_t; static arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "other_size", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", 
KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)); #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } #define ARCSTAT_MAXSTAT(stat) \ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). 
*/ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. 
*/ #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; static uint64_t arc_meta_used; static uint64_t arc_meta_limit; static uint64_t arc_meta_max = 0; typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_done_func_t *acb_done; arc_buf_t *acb_buf; zio_t *acb_zio_dummy; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_done_func_t *awcb_ready; arc_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; uint64_t b_cksum0; kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; void *b_thawed; arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; uint32_t b_flags; uint32_t b_datacnt; arc_callback_t *b_acb; kcondvar_t b_cv; /* immutable */ arc_buf_contents_t b_type; uint64_t b_size; uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; list_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; /* self protecting */ refcount_t b_refcnt; l2arc_buf_hdr_t *b_l2hdr; list_node_t b_l2node; }; static arc_buf_t *arc_eviction_list; static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define 
GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) /* * Private ARC flags. These flags are private ARC only flags that will show up * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can * be passed in as arc_flags in things like arc_read. However, these flags * should never be passed and should only be set by ARC code. When adding new * public flags, make sure not to smash the private ones. */ #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ (hdr)->b_l2hdr != NULL) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) /* * Other sizes */ #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define L2HDR_SIZE 
((int64_t)sizeof (l2arc_buf_hdr_t)) /* * Hash table routines */ -#define HT_LOCK_PAD 64 +#define HT_LOCK_ALIGN 64 +#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN))) struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL - unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; + unsigned char pad[HT_LOCK_PAD]; #endif }; #define BUF_LOCKS 256 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; struct ht_lock ht_locks[BUF_LOCKS]; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) /* * L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals */ typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ 
uint64_t l2ad_write; /* desired write size, bytes */ uint64_t l2ad_boost; /* warmup write boost, bytes */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ list_t *l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_t *l2rcb_buf; /* read buffer */ spa_t *l2rcb_spa; /* spa */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ } l2arc_read_callback_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ }; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; void (*l2df_func)(void *, size_t); list_node_t l2df_list_node; } l2arc_data_free_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static void l2arc_read_done(zio_t *zio); static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); static uint64_t 
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; crc ^= (spa>>8) ^ birth; return (crc); } #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ (buf)->b_dva.dva_word[1] == 0 && \ (buf)->b_birth == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; hdr->b_cksum0 = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *buf; mutex_enter(hash_lock); for (buf = buf_hash_table.ht_table[idx]; buf != NULL; buf = buf->b_hash_next) { if (BUF_EQUAL(spa, dva, birth, buf)) { *lockp = hash_lock; return (buf); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. 
*/
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t slot = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *slot_lock = BUF_HASH_LOCK(slot);
	arc_buf_hdr_t *cur;
	uint32_t depth = 0;

	ASSERT(!HDR_IN_HASH_TABLE(buf));

	/* The chain mutex is reported to the caller and left held on return. */
	*lockp = slot_lock;
	mutex_enter(slot_lock);

	/* Scan the chain for an existing match, counting entries as we go. */
	cur = buf_hash_table.ht_table[slot];
	while (cur != NULL) {
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, cur))
			return (cur);
		cur = cur->b_hash_next;
		depth++;
	}

	/* No match: push the new header onto the front of the chain. */
	buf->b_hash_next = buf_hash_table.ht_table[slot];
	buf_hash_table.ht_table[slot] = buf;
	buf->b_flags |= ARC_IN_HASH_TABLE;

	/* hash table statistics: collisions, chain count, longest chain */
	if (depth > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (depth == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, depth);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

/*
 * Unlink a header from its hash chain.  The chain mutex must already be
 * held and the header must currently be flagged as in the hash table.
 */
static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	uint64_t slot = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	arc_buf_hdr_t **linkp;

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(slot)));
	ASSERT(HDR_IN_HASH_TABLE(buf));

	/* Advance along the "next" links until one of them points at buf. */
	for (linkp = &buf_hash_table.ht_table[slot]; *linkp != buf;
	    linkp = &(*linkp)->b_hash_next) {
		ASSERT(*linkp != NULL);
	}

	*linkp = buf->b_hash_next;
	buf->b_hash_next = NULL;
	buf->b_flags &= ~ARC_IN_HASH_TABLE;

	/* hash table statistics */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	/* The slot went from a multi-entry chain to a single entry. */
	if (buf_hash_table.ht_table[slot] &&
	    buf_hash_table.ht_table[slot]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}

/*
 * Global data structures and functions for the buf kmem cache.
*/
/* kmem cache for arc_buf_hdr_t objects; created in buf_init() */
static kmem_cache_t *hdr_cache;
/* kmem cache for arc_buf_t objects; created in buf_init() */
static kmem_cache_t *buf_cache;

/*
 * Tear down what buf_init() set up: free the hash table array, destroy
 * the BUF_LOCKS hash mutexes, and destroy both kmem caches.
 *
 * NOTE(review): the '+' characters that begin some lines below are
 * unified-diff addition markers carried in this text, not C operators.
 */
static void
buf_fini(void)
{
	int i;

+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/* Large allocations which do not require contiguous pages
+	 * should be using vmem_free() in the linux kernel */
+	vmem_free(buf_hash_table.ht_table,
+	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
	/* Table size is ht_mask + 1 pointer-sized slots. */
	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 *
 * Zeroes the header, initializes its refcount, condition variable,
 * freeze lock and list links, and charges sizeof (arc_buf_hdr_t) to
 * ARC_SPACE_HDRS.  Always returns 0.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	list_link_init(&buf->b_arc_node);
	list_link_init(&buf->b_l2node);
	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Constructor callback for arc_buf_t objects: zeroes the buf, initializes
 * its evict lock and data rwlock, and charges sizeof (arc_buf_t) to
 * ARC_SPACE_HDRS.  Always returns 0.
 */
/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_t));
	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
*/
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	/* The header must no longer carry an identity when it is torn down. */
	ASSERT(BUF_EMPTY(buf));
	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
	mutex_destroy(&buf->b_freeze_lock);
	/* Undo the space accounting done in hdr_cons(). */
	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
}

/*
 * Destructor callback for arc_buf_t objects: destroys the locks set up by
 * buf_cons() and returns the accounted space.
 */
/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
	arc_buf_t *buf = vbuf;

	mutex_destroy(&buf->b_evict_lock);
	rw_destroy(&buf->b_data_lock);
	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

/*
 * Size and allocate the buf hash table, create the hdr and buf kmem
 * caches, fill in zfs_crc64_table, and initialize the hash mutexes.
 *
 * NOTE(review): the '+' characters that begin some lines below are
 * unified-diff addition markers carried in this text, not C operators.
 */
static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
	 */
	while (hsize * 65536 < physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	/*
	 * NOTE(review): the vmem path allocates with KM_SLEEP, so the NULL
	 * check and retry below presumably only matter for the KM_NOSLEEP
	 * path -- confirm against the allocator's contract.
	 */
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/* Large allocations which do not require contiguous pages
+	 * should be using vmem_alloc() in the linux kernel */
+	buf_hash_table.ht_table =
+	    vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
	/* On allocation failure, halve the table size and try again. */
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);

	/*
	 * Build the 256-entry CRC table: each entry starts as its own index
	 * and is put through 8 shift/conditional-xor steps with
	 * ZFS_CRC64_POLY.
	 */
	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

#define	ARC_MINTIME	(hz>>4) /* 62 ms */

/*
 * Debug check (only when ZFS_DEBUG_MODIFY is set in zfs_flags): recompute
 * the fletcher-2 checksum of the buffer data and panic if it differs from
 * the stored freeze checksum.  Does nothing when no freeze checksum is
 * recorded or the header has ARC_IO_ERROR set.
 */
static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL ||
	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

/*
 * Return non-zero when the current fletcher-2 checksum of the buffer data
 * equals the stored freeze checksum.  b_freeze_cksum is dereferenced
 * without a NULL check here.
 */
static int
arc_cksum_equal(arc_buf_t *buf)
{
	zio_cksum_t zc;
	int equal;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
	mutex_exit(&buf->b_hdr->b_freeze_lock);

	return (equal);
}

/*
 * Record a freeze checksum for the buffer if none is stored yet.  Skipped
 * unless "force" is set or ZFS_DEBUG_MODIFY is enabled.
 */
static void
arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	/* A checksum is already recorded; keep it. */
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

/*
 * Discard the stored freeze checksum so the buffer may be modified.
 * With ZFS_DEBUG_MODIFY set this first panics on a non-anonymous buffer or
 * one with I/O in progress, verifies the old checksum, and afterwards
 * replaces the b_thawed 1-byte allocation.
 */
void
arc_buf_thaw(arc_buf_t *buf)
{
	if (zfs_flags & ZFS_DEBUG_MODIFY) {
		if (buf->b_hdr->b_state != arc_anon)
			panic("modifying non-anon buffer!");
		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
			panic("modifying buffer while i/o in progress!");
		arc_cksum_verify(buf);
	}

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		buf->b_hdr->b_freeze_cksum = NULL;
	}

	if (zfs_flags & ZFS_DEBUG_MODIFY) {
		if (buf->b_hdr->b_thawed)
			kmem_free(buf->b_hdr->b_thawed, 1);
		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
	}

	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

/*
 * Record a freeze checksum for the buffer under its header's hash lock.
 * No-op unless ZFS_DEBUG_MODIFY is enabled.
 */
void
arc_buf_freeze(arc_buf_t *buf)
{
	kmutex_t *hash_lock;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);

	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
	    buf->b_hdr->b_state == arc_anon);
	arc_cksum_compute(buf, B_FALSE);
	mutex_exit(hash_lock);
}

/*
 * Add a reference (tag) to the header; hash_lock must be held.  When this
 * is the first reference on a non-anonymous header, the header is removed
 * from its state's list, the list's lsize is reduced accordingly, and any
 * ARC_PREFETCH flag is cleared.
 */
static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc_anon)) {
		uint64_t delta = ab->b_size * ab->b_datacnt;
		list_t *list = &ab->b_state->arcs_list[ab->b_type];
		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];

		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
		mutex_enter(&ab->b_state->arcs_mtx);
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(list, ab);
		/* Ghost headers hold no data bufs; account one b_size. */
		if (GHOST_STATE(ab->b_state)) {
			ASSERT3U(ab->b_datacnt, ==, 0);
			ASSERT3P(ab->b_buf, ==, NULL);
			delta = ab->b_size;
		}
		ASSERT(delta > 0);
		ASSERT3U(*size, >=, delta);
		atomic_add_64(size, -delta);
		mutex_exit(&ab->b_state->arcs_mtx);
		/* remove the prefetch flag if we get a reference */
		if (ab->b_flags & ARC_PREFETCH)
			ab->b_flags &= ~ARC_PREFETCH;
	}
}

/*
 * Drop a reference (tag) from the header and return the remaining count.
 * When the count reaches zero on a non-anonymous header, the header is
 * inserted at the head of its state's list and the list's lsize grows by
 * b_size * b_datacnt.
 */
static int
remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	int cnt;
	arc_state_t *state = ab->b_state;

	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
	ASSERT(!GHOST_STATE(state));

	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
	    (state != arc_anon)) {
		uint64_t *size = &state->arcs_lsize[ab->b_type];

		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
		mutex_enter(&state->arcs_mtx);
		ASSERT(!list_link_active(&ab->b_arc_node));
		list_insert_head(&state->arcs_list[ab->b_type], ab);
		ASSERT(ab->b_datacnt > 0);
		atomic_add_64(size, ab->b_size * ab->b_datacnt);
		mutex_exit(&state->arcs_mtx);
	}
	return (cnt);
}

/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
	arc_state_t *old_state = ab->b_state;
	int64_t refcnt = refcount_count(&ab->b_refcnt);
	uint64_t from_delta, to_delta;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(new_state != old_state);
	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);

	from_delta = to_delta = ab->b_datacnt * ab->b_size;

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon) {
			/* Only take the state mutex if we do not hold it. */
			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
			uint64_t *size = &old_state->arcs_lsize[ab->b_type];

			if (use_mutex)
				mutex_enter(&old_state->arcs_mtx);

			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(&old_state->arcs_list[ab->b_type], ab);

			/*
			 * If prefetching out of the ghost cache,
			 * we will have a non-zero datacnt.
			 */
			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
				/* ghost elements have a ghost size */
				ASSERT(ab->b_buf == NULL);
				from_delta = ab->b_size;
			}
			ASSERT3U(*size, >=, from_delta);
			atomic_add_64(size, -from_delta);

			if (use_mutex)
				mutex_exit(&old_state->arcs_mtx);
		}
		if (new_state != arc_anon) {
			/* Only take the state mutex if we do not hold it. */
			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
			uint64_t *size = &new_state->arcs_lsize[ab->b_type];

			if (use_mutex)
				mutex_enter(&new_state->arcs_mtx);

			list_insert_head(&new_state->arcs_list[ab->b_type], ab);

			/* ghost elements have a ghost size */
			if (GHOST_STATE(new_state)) {
				ASSERT(ab->b_datacnt == 0);
				ASSERT(ab->b_buf == NULL);
				to_delta = ab->b_size;
			}
			atomic_add_64(size, to_delta);

			if (use_mutex)
				mutex_exit(&new_state->arcs_mtx);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	/* Headers moving to the anonymous state leave the hash table. */
	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
		buf_hash_remove(ab);

	/* adjust state sizes */
	if (to_delta)
		atomic_add_64(&new_state->arcs_size, to_delta);
	if (from_delta) {
		ASSERT3U(old_state->arcs_size, >=, from_delta);
		atomic_add_64(&old_state->arcs_size, -from_delta);
	}
	ab->b_state = new_state;

	/* adjust l2arc hdr stats */
	if (new_state == arc_l2c_only)
		l2arc_hdr_stat_add();
	else if (old_state == arc_l2c_only)
		l2arc_hdr_stat_remove();
}

/*
 * Account "space" bytes as consumed: bump the per-type statistic selected
 * by "type", then add the amount to arc_meta_used and arc_size.
 */
void
arc_space_consume(uint64_t space, arc_space_type_t type)
{
	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);

	switch (type) {
	default:
		break;
	case ARC_SPACE_DATA:
		ARCSTAT_INCR(arcstat_data_size, space);
		break;
	case ARC_SPACE_OTHER:
		ARCSTAT_INCR(arcstat_other_size, space);
		break;
	case ARC_SPACE_HDRS:
		ARCSTAT_INCR(arcstat_hdr_size, space);
		break;
	case ARC_SPACE_L2HDRS:
		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
		break;
	}

	atomic_add_64(&arc_meta_used, space);
	atomic_add_64(&arc_size, space);
}

/*
 * Inverse of arc_space_consume(): lower the per-type statistic, then
 * subtract "space" from arc_meta_used and arc_size.  arc_meta_max is
 * raised to the current arc_meta_used before the subtraction.
 */
void
arc_space_return(uint64_t space, arc_space_type_t type)
{
	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);

	switch (type) {
	default:
		break;
	case ARC_SPACE_DATA:
		ARCSTAT_INCR(arcstat_data_size, -space);
		break;
	case ARC_SPACE_OTHER:
		ARCSTAT_INCR(arcstat_other_size, -space);
		break;
	case ARC_SPACE_HDRS:
		ARCSTAT_INCR(arcstat_hdr_size, -space);
		break;
	case ARC_SPACE_L2HDRS:
		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
		break;
	}

	ASSERT(arc_meta_used >= space);
	if (arc_meta_max < arc_meta_used)
		arc_meta_max = arc_meta_used;
	atomic_add_64(&arc_meta_used, -space);
	ASSERT(arc_size >= space);
	atomic_add_64(&arc_size, -space);
}

/*
 * Allocate a data buffer of "size" bytes via zio_data_buf_alloc() and add
 * the size to arc_size.  Signals the reclaim thread first when
 * arc_evict_needed(ARC_BUFC_DATA) reports true.
 */
void *
arc_data_buf_alloc(uint64_t size)
{
	if (arc_evict_needed(ARC_BUFC_DATA))
		cv_signal(&arc_reclaim_thr_cv);
	atomic_add_64(&arc_size, size);
	return (zio_data_buf_alloc(size));
}

/*
 * Free a data buffer via zio_data_buf_free() and subtract its size from
 * arc_size.
 */
void
arc_data_buf_free(void *buf, uint64_t size)
{
	zio_data_buf_free(buf, size);
	ASSERT(arc_size >= size);
	atomic_add_64(&arc_size, -size);
}

/*
 * Allocate a new anonymous buffer of "size" bytes and "type" contents for
 * the given spa.  A header and a buf are taken from their kmem caches,
 * linked together, given data via arc_get_data_buf(), and the header
 * receives one reference held under "tag".  Returns the new buf.
 */
arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_type = type;
	hdr->b_spa = spa_guid(spa);
	hdr->b_state = arc_anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = NULL;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	hdr->b_datacnt = 1;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	return (buf);
}

/* Reference tag used for buffers that are currently on loan. */
static char *arc_onloan_tag = "onloan";

/*
 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
 * flight data by arc_tempreserve_space() until they are "returned". Loaned
 * buffers must be returned to the arc before they can be used by the DMU or
 * freed.
 */
arc_buf_t *
arc_loan_buf(spa_t *spa, int size)
{
	arc_buf_t *buf;

	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);

	atomic_add_64(&arc_loaned_bytes, size);
	return (buf);
}

/*
 * Return a loaned arc buffer to the arc.
*/
void
arc_return_buf(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT(buf->b_data != NULL);
	/* Take the caller's reference before dropping the on-loan one. */
	(void) refcount_add(&hdr->b_refcnt, tag);
	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);

	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}

/* Detach an arc_buf from a dbuf (tag) */
void
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr;

	ASSERT(buf->b_data != NULL);
	hdr = buf->b_hdr;
	/* Swap the caller's reference for the on-loan reference. */
	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
	(void) refcount_remove(&hdr->b_refcnt, tag);
	buf->b_efunc = NULL;
	buf->b_private = NULL;

	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
}

/*
 * Create an additional buf on the same header as "from", holding a copy
 * of its data.  The new buf is linked at the front of the header's buf
 * list and b_datacnt is incremented.  The header must not be anonymous.
 */
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr = from->b_hdr;
	uint64_t size = hdr->b_size;

	ASSERT(hdr->b_state != arc_anon);

	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = hdr->b_buf;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	bcopy(from->b_data, buf->b_data, size);
	hdr->b_datacnt += 1;
	return (buf);
}

/*
 * Add a reference (tag) to a buffer that may have been evicted.  Returns
 * without taking a reference when b_data is NULL; otherwise references the
 * header, records the access and bumps the hit statistics.
 */
void
arc_buf_add_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;

	/*
	 * Check to see if this buffer is evicted.  Callers
	 * must verify b_data != NULL to know if the add_ref
	 * was successful.
	 */
	mutex_enter(&buf->b_evict_lock);
	if (buf->b_data == NULL) {
		mutex_exit(&buf->b_evict_lock);
		return;
	}
	/* Take the hash lock before letting go of the evict lock. */
	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	mutex_exit(&buf->b_evict_lock);

	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
	add_reference(hdr, hash_lock, tag);
	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
	arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);
	ARCSTAT_BUMP(arcstat_hits);
	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);
}

/*
 * Free the arc data buffer.
If it is an l2arc write in progress,
 * the buffer is placed on l2arc_free_on_write to be freed later.
 */
static void
arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
    void *data, size_t size)
{
	if (HDR_L2_WRITING(hdr)) {
		l2arc_data_free_t *df;
		/* Queue the data, size and free routine for a later free. */
		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
		df->l2df_data = data;
		df->l2df_size = size;
		df->l2df_func = free_func;
		mutex_enter(&l2arc_free_on_write_mtx);
		list_insert_head(l2arc_free_on_write, df);
		mutex_exit(&l2arc_free_on_write_mtx);
		ARCSTAT_BUMP(arcstat_l2_free_on_write);
	} else {
		free_func(data, size);
	}
}

/*
 * Release the data attached to a buf and, when "all" is set, unlink the
 * buf from its header and return it to buf_cache.
 *
 * recycle - when set, the data block itself is not freed here.
 * all     - when clear, only the data is released and the buf stays on
 *           the header's list.
 */
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data) {
		arc_state_t *state = buf->b_hdr->b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = buf->b_hdr->b_type;

		arc_cksum_verify(buf);

		if (!recycle) {
			if (type == ARC_BUFC_METADATA) {
				arc_buf_data_free(buf->b_hdr, zio_buf_free,
				    buf->b_data, size);
				arc_space_return(size, ARC_SPACE_DATA);
			} else {
				ASSERT(type == ARC_BUFC_DATA);
				arc_buf_data_free(buf->b_hdr,
				    zio_data_buf_free, buf->b_data, size);
				ARCSTAT_INCR(arcstat_data_size, -size);
				atomic_add_64(&arc_size, -size);
			}
		}
		/* Header is on a state list: shrink that list's lsize. */
		if (list_link_active(&buf->b_hdr->b_arc_node)) {
			uint64_t *cnt = &state->arcs_lsize[type];

			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
			ASSERT(state != arc_anon);

			ASSERT3U(*cnt, >=, size);
			atomic_add_64(cnt, -size);
		}
		ASSERT3U(state->arcs_size, >=, size);
		atomic_add_64(&state->arcs_size, -size);
		buf->b_data = NULL;
		ASSERT(buf->b_hdr->b_datacnt > 0);
		buf->b_hdr->b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!all)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;
	buf->b_next = NULL;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

/*
 * Destroy an unreferenced, anonymous header with no I/O in progress:
 * detach and free any l2arc header, clear the identity, dispose of every
 * attached buf, free the freeze checksum and thawed marker, and return the
 * header to hdr_cache.
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;

	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	ASSERT3P(hdr->b_state, ==, arc_anon);
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));

	if (l2hdr != NULL) {
		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
		/*
		 * To prevent arc_free() and l2arc_evict() from
		 * attempting to free the same buffer at the same time,
		 * a FREE_IN_PROGRESS flag is given to arc_free() to
		 * give it priority.  l2arc_evict() can't destroy this
		 * header while we are waiting on l2arc_buflist_mtx.
		 *
		 * The hdr may be removed from l2ad_buflist before we
		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
		 */
		if (!buflist_held) {
			mutex_enter(&l2arc_buflist_mtx);
			l2hdr = hdr->b_l2hdr;
		}

		if (l2hdr != NULL) {
			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
			if (hdr->b_state == arc_l2c_only)
				l2arc_hdr_stat_remove();
			hdr->b_l2hdr = NULL;
		}

		if (!buflist_held)
			mutex_exit(&l2arc_buflist_mtx);
	}

	if (!BUF_EMPTY(hdr)) {
		ASSERT(!HDR_IN_HASH_TABLE(hdr));
		buf_discard_identity(hdr);
	}
	while (hdr->b_buf) {
		arc_buf_t *buf = hdr->b_buf;

		if (buf->b_efunc) {
			/*
			 * The buf has an eviction callback: release its data
			 * but park the buf itself on arc_eviction_list
			 * instead of freeing it here.
			 */
			mutex_enter(&arc_eviction_mtx);
			mutex_enter(&buf->b_evict_lock);
			ASSERT(buf->b_hdr != NULL);
			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
			hdr->b_buf = buf->b_next;
			buf->b_hdr = &arc_eviction_hdr;
			buf->b_next = arc_eviction_list;
			arc_eviction_list = buf;
			mutex_exit(&buf->b_evict_lock);
			mutex_exit(&arc_eviction_mtx);
		} else {
			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
		}
	}
	if (hdr->b_freeze_cksum != NULL) {
		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		hdr->b_freeze_cksum = NULL;
	}
	if (hdr->b_thawed) {
		kmem_free(hdr->b_thawed, 1);
		hdr->b_thawed = NULL;
	}

	ASSERT(!list_link_active(&hdr->b_arc_node));
	ASSERT3P(hdr->b_hash_next, ==, NULL);
	ASSERT3P(hdr->b_acb, ==, NULL);
	kmem_cache_free(hdr_cache, hdr);
}

/*
 * Drop the reference "tag" on a buf that has no eviction callback and
 * release the buf.  Three cases are handled: a non-anonymous header
 * (under its hash lock), an anonymous header with I/O in progress, and
 * any other anonymous header.
 */
void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_state != arc_anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		hdr = buf->b_hdr;
		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_datacnt > 1) {
			arc_buf_destroy(buf, FALSE, TRUE);
		} else {
			/* Last buf on the header: keep it and mark it available. */
			ASSERT(buf == hdr->b_buf);
			ASSERT(buf->b_efunc == NULL);
			hdr->b_flags |= ARC_BUF_AVAILABLE;
		}
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write.  Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_eviction_mtx);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_refcnt));
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0)
			arc_buf_destroy(buf, FALSE, TRUE);
		else
			arc_hdr_destroy(hdr);
	}
}

/*
 * Drop the reference "tag" on a buf.  Returns non-zero when the buf had no
 * eviction callback (b_efunc == NULL) on entry, zero otherwise.  Anonymous
 * headers are handed to arc_buf_free().
 */
int
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	int no_callback = (buf->b_efunc == NULL);

	if (hdr->b_state == arc_anon) {
		ASSERT(hdr->b_datacnt == 1);
		arc_buf_free(buf, tag);
		return (no_callback);
	}

	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	ASSERT(hdr->b_state != arc_anon);
	ASSERT(buf->b_data != NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	if (hdr->b_datacnt > 1) {
		if (no_callback)
			arc_buf_destroy(buf, FALSE, TRUE);
	} else if (no_callback) {
		/* Sole buf on the header: keep it and mark it available. */
		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
		ASSERT(buf->b_efunc == NULL);
		hdr->b_flags |= ARC_BUF_AVAILABLE;
	}
	ASSERT(no_callback || hdr->b_datacnt > 1 ||
	    refcount_is_zero(&hdr->b_refcnt));
	mutex_exit(hash_lock);
	return (no_callback);
}

/* Return the b_size recorded in the buf's header. */
int
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Evict buffers from list until we've removed the specified number of
 * bytes.  Move the removed buffers to the appropriate evict state.
* If the recycle flag is set, then attempt to "recycle" a buffer: * - look for a buffer to evict that is `bytes' long. * - return the data block from this buffer rather than freeing it. * This flag is used by callers that are trying to make space for a * new buffer in a full arc cache. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so may not catch all candidates. * It may also return without evicting as much space as requested. */ static void * arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; arc_buf_hdr_t *ab, *ab_prev = NULL; list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(ab) || (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && ddi_get_lbolt() - ab->b_arc_access < arc_min_prefetch_lifespan)) { skipped++; continue; } /* "lookahead" for better eviction candidate */ if (recycle && ab->b_size != bytes && ab_prev && ab_prev->b_size == bytes) continue; hash_lock = HDR_LOCK(ab); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } if (buf->b_data) { bytes_evicted += ab->b_size; if (recycle && ab->b_type == type && ab->b_size == bytes && !HDR_L2_WRITING(ab)) { stolen = buf->b_data; recycle = FALSE; } } if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); 
arc_buf_destroy(buf, buf->b_data == stolen, FALSE); ab->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); mutex_exit(&buf->b_evict_lock); } else { mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } } if (ab->b_l2hdr) { ARCSTAT_INCR(arcstat_evict_l2_cached, ab->b_size); } else { if (l2arc_write_eligible(ab->b_spa, ab)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, ab->b_size); } else { ARCSTAT_INCR( arcstat_evict_l2_ineligible, ab->b_size); } } if (ab->b_datacnt == 0) { arc_change_state(evicted_state, ab, hash_lock); ASSERT(HDR_IN_HASH_TABLE(ab)); ab->b_flags |= ARC_IN_HASH_TABLE; ab->b_flags &= ~ARC_BUF_AVAILABLE; DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); } if (!have_lock) mutex_exit(hash_lock); if (bytes >= 0 && bytes_evicted >= bytes) break; } else { missed += 1; } } mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&state->arcs_mtx); if (bytes_evicted < bytes) dprintf("only evicted %lld bytes from %x\n", (longlong_t)bytes_evicted, state); if (skipped) ARCSTAT_INCR(arcstat_evict_skip, skipped); if (missed) ARCSTAT_INCR(arcstat_mutex_miss, missed); /* * We have just evicted some date into the ghost state, make * sure we also adjust the ghost state size if necessary. */ if (arc_no_grow && arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mru_ghost->arcs_lsize[type], mru_over); arc_evict_ghost(arc_mru_ghost, 0, todelete); } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c); arc_evict_ghost(arc_mfu_ghost, 0, todelete); } } return (stolen); } /* * Remove buffers from list until we've removed the specified number of * bytes. 
Destroy the buffers that are removed. */ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; arc_buf_hdr_t marker; list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; ASSERT(GHOST_STATE(state)); bzero(&marker, sizeof(marker)); top: mutex_enter(&state->arcs_mtx); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); if (spa && ab->b_spa != spa) continue; /* ignore markers */ if (ab->b_spa == 0) continue; hash_lock = HDR_LOCK(ab); /* caller may be trying to modify this buffer, skip it */ if (MUTEX_HELD(hash_lock)) continue; if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); ARCSTAT_BUMP(arcstat_deleted); bytes_deleted += ab->b_size; if (ab->b_l2hdr != NULL) { /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, ab, hash_lock); mutex_exit(hash_lock); } else { arc_change_state(arc_anon, ab, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(ab); } DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; } else if (bytes < 0) { /* * Insert a list marker and then wait for the * hash lock to become available. Once its * available, restart from where we left off. 
*/ list_insert_after(list, ab, &marker); mutex_exit(&state->arcs_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); mutex_enter(&state->arcs_mtx); ab_prev = list_prev(list, &marker); list_remove(list, &marker); } else bufs_skipped += 1; } mutex_exit(&state->arcs_mtx); if (list == &state->arcs_list[ARC_BUFC_DATA] && (bytes < 0 || bytes_deleted < bytes)) { list = &state->arcs_list[ARC_BUFC_METADATA]; goto top; } if (bufs_skipped) { ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); ASSERT(bytes >= 0); } if (bytes_deleted < bytes) dprintf("only deleted %lld bytes from %p\n", (longlong_t)bytes_deleted, state); } static void arc_adjust(void) { int64_t adjustment, delta; /* * Adjust MRU size */ adjustment = MIN((int64_t)(arc_size - arc_c), (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p)); if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); } /* * Adjust MFU size */ adjustment = arc_size - arc_c; if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { int64_t delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); } /* * Adjust ghost lists */ adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { delta = MIN(arc_mru_ghost->arcs_size, adjustment); arc_evict_ghost(arc_mru_ghost, 0, delta); } adjustment = arc_mru_ghost->arcs_size + 
arc_mfu_ghost->arcs_size - arc_c;

	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
		arc_evict_ghost(arc_mfu_ghost, 0, delta);
	}
}

/*
 * Process every buf queued on arc_eviction_list: clear its b_hdr
 * back-pointer under b_evict_lock, invoke its b_efunc callback (if one
 * is set) and free the buf back to buf_cache.
 */
static void
arc_do_user_evicts(void)
{
	mutex_enter(&arc_eviction_mtx);
	while (arc_eviction_list != NULL) {
		arc_buf_t *buf = arc_eviction_list;
		arc_eviction_list = buf->b_next;
		mutex_enter(&buf->b_evict_lock);
		buf->b_hdr = NULL;
		mutex_exit(&buf->b_evict_lock);
		/* the callback and the free run without arc_eviction_mtx */
		mutex_exit(&arc_eviction_mtx);

		if (buf->b_efunc != NULL)
			VERIFY(buf->b_efunc(buf) == 0);

		buf->b_efunc = NULL;
		buf->b_private = NULL;
		kmem_cache_free(buf_cache, buf);
		/* re-take the list lock before looking at the list again */
		mutex_enter(&arc_eviction_mtx);
	}
	mutex_exit(&arc_eviction_mtx);
}

/*
 * Flush all *evictable* data from the cache for the given spa.
 * NOTE: this will not touch "active" (i.e. referenced) data.
 *
 * A NULL spa is passed to the eviction routines as guid 0.  When a spa
 * is given, each list gets exactly one arc_evict() pass; with a NULL
 * spa the pass is repeated until the list is empty.
 */
void
arc_flush(spa_t *spa)
{
	uint64_t guid = 0;

	if (spa)
		guid = spa_guid(spa);

	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
		if (spa)
			break;
	}
	while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
		if (spa)
			break;
	}
	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
		if (spa)
			break;
	}
	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
		if (spa)
			break;
	}

	arc_evict_ghost(arc_mru_ghost, guid, -1);
	arc_evict_ghost(arc_mfu_ghost, guid, -1);

	/* run the queued user eviction callbacks under the reclaim lock */
	mutex_enter(&arc_reclaim_thr_lock);
	arc_do_user_evicts();
	mutex_exit(&arc_reclaim_thr_lock);
	ASSERT(spa || arc_eviction_list == NULL);
}

/*
 * Lower the target cache size arc_c (never below arc_c_min) and the
 * MRU target arc_p, then call arc_adjust() if arc_size is above the
 * resulting target.
 */
void
arc_shrink(void)
{
	if (arc_c > arc_c_min) {
		uint64_t to_free;

#ifdef _KERNEL
		to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
#else
		to_free = arc_c >> arc_shrink_shift;
#endif
		if (arc_c > arc_c_min + to_free)
			atomic_add_64(&arc_c, -to_free);
		else
			arc_c = arc_c_min;

		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
		/* do not leave the target above what is actually cached */
		if (arc_c > arc_size)
			arc_c = MAX(arc_size, arc_c_min);
		if (arc_p > arc_c)
			arc_p = (arc_c >> 1);
		ASSERT(arc_c >= arc_c_min);
		ASSERT((int64_t)arc_p >= 0);
	}

	if (arc_size > arc_c)
		arc_adjust();
}

/*
 * Decide whether memory should be reclaimed.  Returns 1 if so, 0
 * otherwise.  The kernel build looks at needfree, freemem, availrmem
 * and (on i386) the heap arena; the non-kernel build returns 1 whenever
 * spa_get_random(100) yields 0.
 */
static int
arc_reclaim_needed(void)
{
#ifdef _KERNEL
	uint64_t extra;

	if (needfree)
		return (1);

	/*
	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
	 */
	extra = desfree;

	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	if (freemem < lotsfree + needfree + extra)
		return (1);

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed. anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
		return (1);

#if defined(__i386)
	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
		return (1);
#endif

#else
	if (spa_get_random(100) == 0)
		return (1);
#endif
	return (0);
}

/*
 * Reap free buffers from the zio buffer caches and from the arc
 * buf_cache and hdr_cache.  With strat == ARC_RECLAIM_AGGR the cache
 * target is shrunk first via arc_shrink().
 */
static void
arc_kmem_reap_now(arc_reclaim_strategy_t strat)
{
	size_t			i;
	kmem_cache_t		*prev_cache = NULL;
	kmem_cache_t		*prev_data_cache = NULL;
	extern kmem_cache_t	*zio_buf_cache[];
	extern kmem_cache_t	*zio_data_buf_cache[];

#ifdef _KERNEL
	if (arc_meta_used >= arc_meta_limit) {
		/*
		 * We are exceeding our meta-data cache limit.
		 * Purge some DNLC entries to release holds on meta-data.
		 */
		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
	}
#if defined(__i386)
	/*
	 * Reclaim unused memory from all kmem caches.
	 */
	kmem_reap();
#endif
#endif

	/*
	 * An aggressive reclamation will shrink the cache size as well as
	 * reap free buffers from the arc kmem caches.
	 */
	if (strat == ARC_RECLAIM_AGGR)
		arc_shrink();

	/*
	 * A slot holding the same cache as the previous slot is skipped,
	 * so a cache is not reaped twice in a row.
	 */
	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
		if (zio_buf_cache[i] != prev_cache) {
			prev_cache = zio_buf_cache[i];
			kmem_cache_reap_now(zio_buf_cache[i]);
		}
		if (zio_data_buf_cache[i] != prev_data_cache) {
			prev_data_cache = zio_data_buf_cache[i];
			kmem_cache_reap_now(zio_data_buf_cache[i]);
		}
	}
	kmem_cache_reap_now(buf_cache);
	kmem_cache_reap_now(hdr_cache);
}

/*
 * Reclaim loop.  Runs with arc_reclaim_thr_lock held until
 * arc_thread_exit becomes non-zero; each pass reaps memory if
 * arc_reclaim_needed(), calls arc_adjust(), runs pending user
 * evictions, and then waits on arc_reclaim_thr_cv.  On exit it clears
 * arc_thread_exit, broadcasts on the cv and calls thread_exit().
 */
static void
arc_reclaim_thread(void)
{
	clock_t			growtime = 0;
	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
	callb_cpr_t		cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_thr_lock);
	while (arc_thread_exit == 0) {
		if (arc_reclaim_needed()) {

			if (arc_no_grow) {
				/* alternate strategies on successive reclaims */
				if (last_reclaim == ARC_RECLAIM_CONS) {
					last_reclaim = ARC_RECLAIM_AGGR;
				} else {
					last_reclaim = ARC_RECLAIM_CONS;
				}
			} else {
				arc_no_grow = TRUE;
				last_reclaim = ARC_RECLAIM_AGGR;
				membar_producer();
			}

			/* reset the growth delay for every reclaim */
			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);

			arc_kmem_reap_now(last_reclaim);
			arc_warm = B_TRUE;

		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
			arc_no_grow = FALSE;
		}

		arc_adjust();

		if (arc_eviction_list != NULL)
			arc_do_user_evicts();

		/* block until needed, or one second, whichever is shorter */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_reclaim_thr_cv,
		    &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
	}

	arc_thread_exit = 0;
	cv_broadcast(&arc_reclaim_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
	thread_exit();
}

/*
 * Adapt arc info given the number of bytes we are trying to add and
 * the state that we are coming from.  This function is only called
 * when we are adding new content to the cache.
 */
static void
arc_adapt(int bytes, arc_state_t *state)
{
	int mult;
	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);

	if (state == arc_l2c_only)
		return;

	ASSERT(bytes > 0);
	/*
	 * Adapt the target size of the MRU list:
	 *	- if we just hit in the MRU ghost list, then increase
	 *	  the target size of the MRU list.
	 *	- if we just hit in the MFU ghost list, then increase
	 *	  the target size of the MFU list by decreasing the
	 *	  target size of the MRU list.
	 */
	if (state == arc_mru_ghost) {
		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */

		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
	} else if (state == arc_mfu_ghost) {
		uint64_t delta;

		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
		mult = MIN(mult, 10);

		/* delta is capped at arc_p so the subtraction cannot wrap */
		delta = MIN(bytes * mult, arc_p);
		arc_p = MAX(arc_p_min, arc_p - delta);
	}
	ASSERT((int64_t)arc_p >= 0);

	/* under memory pressure: wake the reclaim thread and do not grow */
	if (arc_reclaim_needed()) {
		cv_signal(&arc_reclaim_thr_cv);
		return;
	}

	if (arc_no_grow)
		return;

	if (arc_c >= arc_c_max)
		return;

	/*
	 * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size
	 */
	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
		atomic_add_64(&arc_c, (int64_t)bytes);
		if (arc_c > arc_c_max)
			arc_c = arc_c_max;
		else if (state == arc_anon)
			atomic_add_64(&arc_p, (int64_t)bytes);
		if (arc_p > arc_c)
			arc_p = arc_c;
	}
	ASSERT((int64_t)arc_p >= 0);
}

/*
 * Check if the cache has reached its limits and eviction is required
 * prior to insert.
 */
static int
arc_evict_needed(arc_buf_contents_t type)
{
	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
		return (1);

#ifdef _KERNEL
	/*
	 * If zio data pages are being allocated out of a separate heap segment,
	 * then enforce that the size of available vmem for this area remains
	 * above about 1/32nd free.
	 */
	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
	    vmem_size(zio_arena, VMEM_FREE) <
	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
		return (1);
#endif

	if (arc_reclaim_needed())
		return (1);

	return (arc_size > arc_c);
}

/*
 * The buffer, supplied as the first argument, needs a data block.
 * So, if we are at cache max, determine which cache should be victimized.
 * We have the following cases:
 *
 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
 * In this situation if we're out of space, but the resident size of the MFU is
 * under the limit, victimize the MFU cache to satisfy this insertion request.
 *
 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
 * Here, we've used up all of the available space for the MRU, so we need to
 * evict from our own cache instead.  Evict from the set of resident MRU
 * entries.
 *
 * 3.
Insert for MFU (c - p) > sizeof(arc_mfu) ->
 * c minus p represents the MFU space in the cache, since p is the size of the
 * cache that is dedicated to the MRU.  In this situation there's still space on
 * the MFU side, so the MRU side needs to be victimized.
 *
 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
 * MFU's resident set is consuming more space than it has been allotted.  In
 * this situation, we must victimize our own cache, the MFU, for this insertion.
 */
static void
arc_get_data_buf(arc_buf_t *buf)
{
	arc_state_t		*state = buf->b_hdr->b_state;
	uint64_t		size = buf->b_hdr->b_size;
	arc_buf_contents_t	type = buf->b_hdr->b_type;

	arc_adapt(size, state);

	/*
	 * We have not yet reached cache maximum size,
	 * just allocate a new buffer.
	 */
	if (!arc_evict_needed(type)) {
		if (type == ARC_BUFC_METADATA) {
			buf->b_data = zio_buf_alloc(size);
			arc_space_consume(size, ARC_SPACE_DATA);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			buf->b_data = zio_data_buf_alloc(size);
			ARCSTAT_INCR(arcstat_data_size, size);
			atomic_add_64(&arc_size, size);
		}
		goto out;
	}

	/*
	 * If we are prefetching from the mfu ghost list, this buffer
	 * will end up on the mru list; so steal space from there.
	 */
	if (state == arc_mfu_ghost)
		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
	else if (state == arc_mru_ghost)
		state = arc_mru;

	/* pick the victim state according to cases 1-4 described above */
	if (state == arc_mru || state == arc_anon) {
		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
		state = (arc_mfu->arcs_lsize[type] >= size &&
		    arc_p > mru_used) ? arc_mfu : arc_mru;
	} else {
		/* MFU cases */
		uint64_t mfu_space = arc_c - arc_p;
		state =  (arc_mru->arcs_lsize[type] >= size &&
		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
	}
	/* if arc_evict() hands back no buffer, allocate a fresh one */
	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
		if (type == ARC_BUFC_METADATA) {
			buf->b_data = zio_buf_alloc(size);
			arc_space_consume(size, ARC_SPACE_DATA);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			buf->b_data = zio_data_buf_alloc(size);
			ARCSTAT_INCR(arcstat_data_size, size);
			atomic_add_64(&arc_size, size);
		}
		ARCSTAT_BUMP(arcstat_recycle_miss);
	}
	ASSERT(buf->b_data != NULL);
out:
	/*
	 * Update the state size.  Note that ghost states have a
	 * "ghost size" and so don't need to be updated.
	 */
	if (!GHOST_STATE(buf->b_hdr->b_state)) {
		arc_buf_hdr_t *hdr = buf->b_hdr;

		atomic_add_64(&hdr->b_state->arcs_size, size);
		if (list_link_active(&hdr->b_arc_node)) {
			ASSERT(refcount_is_zero(&hdr->b_refcnt));
			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
		}
		/*
		 * If we are growing the cache, and we are adding anonymous
		 * data, and we have outgrown arc_p, update arc_p
		 */
		if (arc_size < arc_c && hdr->b_state == arc_anon &&
		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
			arc_p = MIN(arc_c, arc_p + size);
	}
}

/*
 * This routine is called whenever a buffer is accessed.
 * NOTE: the hash lock is dropped in this function.
 */
static void
arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
{
	clock_t now;

	ASSERT(MUTEX_HELD(hash_lock));

	if (buf->b_state == arc_anon) {
		/*
		 * This buffer is not in the cache, and does not
		 * appear in our "ghost" list.  Add the new buffer
		 * to the MRU state.
		 */

		ASSERT(buf->b_arc_access == 0);
		buf->b_arc_access = ddi_get_lbolt();
		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
		arc_change_state(arc_mru, buf, hash_lock);

	} else if (buf->b_state == arc_mru) {
		now = ddi_get_lbolt();

		/*
		 * If this buffer is here because of a prefetch, then either:
		 * - clear the flag if this is a "referencing" read
		 *   (any subsequent access will bump this into the MFU state).
		 * or
		 * - move the buffer to the head of the list if this is
		 *   another prefetch (to make it less likely to be evicted).
		 */
		if ((buf->b_flags & ARC_PREFETCH) != 0) {
			if (refcount_count(&buf->b_refcnt) == 0) {
				ASSERT(list_link_active(&buf->b_arc_node));
			} else {
				buf->b_flags &= ~ARC_PREFETCH;
				ARCSTAT_BUMP(arcstat_mru_hits);
			}
			buf->b_arc_access = now;
			return;
		}

		/*
		 * This buffer has been "accessed" only once so far,
		 * but it is still in the cache. Move it to the MFU
		 * state.
		 */
		if (now > buf->b_arc_access + ARC_MINTIME) {
			/*
			 * More than 125ms have passed since we
			 * instantiated this buffer.  Move it to the
			 * most frequently used state.
			 */
			buf->b_arc_access = now;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
			arc_change_state(arc_mfu, buf, hash_lock);
		}
		ARCSTAT_BUMP(arcstat_mru_hits);
	} else if (buf->b_state == arc_mru_ghost) {
		arc_state_t	*new_state;
		/*
		 * This buffer has been "accessed" recently, but
		 * was evicted from the cache.  Move it to the
		 * MFU state.
		 */

		if (buf->b_flags & ARC_PREFETCH) {
			new_state = arc_mru;
			if (refcount_count(&buf->b_refcnt) > 0)
				buf->b_flags &= ~ARC_PREFETCH;
			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
		} else {
			new_state = arc_mfu;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
		}

		buf->b_arc_access = ddi_get_lbolt();
		arc_change_state(new_state, buf, hash_lock);

		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
	} else if (buf->b_state == arc_mfu) {
		/*
		 * This buffer has been accessed more than once and is
		 * still in the cache.  Keep it in the MFU state.
		 *
		 * NOTE: an add_reference() that occurred when we did
		 * the arc_read() will have kicked this off the list.
		 * If it was a prefetch, we will explicitly move it to
		 * the head of the list now.
		 */
		if ((buf->b_flags & ARC_PREFETCH) != 0) {
			ASSERT(refcount_count(&buf->b_refcnt) == 0);
			ASSERT(list_link_active(&buf->b_arc_node));
		}
		ARCSTAT_BUMP(arcstat_mfu_hits);
		buf->b_arc_access = ddi_get_lbolt();
	} else if (buf->b_state == arc_mfu_ghost) {
		arc_state_t	*new_state = arc_mfu;
		/*
		 * This buffer has been accessed more than once but has
		 * been evicted from the cache.  Move it back to the
		 * MFU state.
		 */

		if (buf->b_flags & ARC_PREFETCH) {
			/*
			 * This is a prefetch access...
			 * move this block back to the MRU state.
			 */
			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
			new_state = arc_mru;
		}

		buf->b_arc_access = ddi_get_lbolt();
		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
		arc_change_state(new_state, buf, hash_lock);

		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
	} else if (buf->b_state == arc_l2c_only) {
		/*
		 * This buffer is on the 2nd Level ARC.
		 */

		buf->b_arc_access = ddi_get_lbolt();
		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
		arc_change_state(arc_mfu, buf, hash_lock);
	} else {
		ASSERT(!"invalid arc state");
	}
}

/*
 * a generic arc_done_func_t which you can use
 *
 * Copies b_size bytes of buf's data into arg when there was no zio or
 * the zio reported no error, then drops the reference held with tag arg.
 */
/* ARGSUSED */
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	if (zio == NULL || zio->io_error == 0)
		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}

/*
 * a generic arc_done_func_t
 *
 * arg is treated as an arc_buf_t ** out-parameter: it receives buf on
 * success, or NULL (after the reference is dropped) when the zio failed.
 */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	arc_buf_t **bufp = arg;
	if (zio && zio->io_error) {
		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
		*bufp = NULL;
	} else {
		*bufp = buf;
		ASSERT(buf->b_data);
	}
}

/*
 * zio completion callback for reads issued on behalf of the cache (it is
 * the done function handed to zio_read() by arc_read_nolock()).  It
 * byteswaps the data if needed, updates the header state and flags,
 * hands a buf to every registered callback, and destroys the header
 * when nothing references it any more.
 */
static void
arc_read_done(zio_t *zio)
{
	arc_buf_hdr_t	*hdr, *found;
	arc_buf_t	*buf;
	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
	kmutex_t	*hash_lock;
	arc_callback_t	*callback_list, *acb;
	int		freeable = FALSE;

	buf = zio->io_private;
	hdr = buf->b_hdr;

	/*
	 * The hdr was inserted into hash-table and removed from lists
	 * prior to starting I/O.  We should find this header, since
	 * it's in the hash table, and it should be legit since it's
	 * not possible to evict it during the I/O.  The only possible
	 * reason for it not to be found is if we were freed during the
	 * read.
	 */
	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
	    &hash_lock);

	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
	    (found == hdr && HDR_L2_READING(hdr)));

	hdr->b_flags &= ~ARC_L2_EVICTED;
	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
		hdr->b_flags &= ~ARC_L2CACHE;

	/* byteswap if necessary */
	callback_list = hdr->b_acb;
	ASSERT(callback_list != NULL);
	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
		    byteswap_uint64_array :
		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
		func(buf->b_data, hdr->b_size);
	}

	arc_cksum_compute(buf, B_FALSE);

	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
		/*
		 * Only call arc_access on anonymous buffers.  This is because
		 * if we've issued an I/O for an evicted buffer, we've already
		 * called arc_access (to prevent any simultaneous readers from
		 * getting confused).
		 */
		arc_access(hdr, hash_lock);
	}

	/* create copies of the data buffer for the callers */
	abuf = buf;
	for (acb = callback_list; acb; acb = acb->acb_next) {
		if (acb->acb_done) {
			/* the first consumer gets buf itself, later ones a clone */
			if (abuf == NULL)
				abuf = arc_buf_clone(buf);
			acb->acb_buf = abuf;
			abuf = NULL;
		}
	}
	hdr->b_acb = NULL;
	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
	ASSERT(!HDR_BUF_AVAILABLE(hdr));
	/* abuf still equal to buf means no callback took the original buf */
	if (abuf == buf) {
		ASSERT(buf->b_efunc == NULL);
		ASSERT(hdr->b_datacnt == 1);
		hdr->b_flags |= ARC_BUF_AVAILABLE;
	}

	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);

	if (zio->io_error != 0) {
		hdr->b_flags |= ARC_IO_ERROR;
		if (hdr->b_state != arc_anon)
			arc_change_state(arc_anon, hdr, hash_lock);
		if (HDR_IN_HASH_TABLE(hdr))
			buf_hash_remove(hdr);
		freeable = refcount_is_zero(&hdr->b_refcnt);
	}

	/*
	 * Broadcast before we drop the hash_lock to avoid the possibility
	 * that the hdr (and hence the cv) might be freed before we get to
	 * the cv_broadcast().
	 */
	cv_broadcast(&hdr->b_cv);

	if (hash_lock) {
		mutex_exit(hash_lock);
	} else {
		/*
		 * This block was freed while we waited for the read to
		 * complete.  It has been removed from the hash table and
		 * moved to the anonymous state (so that it won't show up
		 * in the cache).
		 */
		ASSERT3P(hdr->b_state, ==, arc_anon);
		freeable = refcount_is_zero(&hdr->b_refcnt);
	}

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done)
			acb->acb_done(zio, acb->acb_buf, acb->acb_private);

		if (acb->acb_zio_dummy != NULL) {
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		callback_list = acb->acb_next;
		kmem_free(acb, sizeof (arc_callback_t));
	}

	if (freeable)
		arc_hdr_destroy(hdr);
}

/*
 * "Read" the block block at the specified DVA (in bp) via the
 * cache.  If the block is found in the cache, invoke the provided
 * callback immediately and return.  Note that the `zio' parameter
 * in the callback will be NULL in this case, since no IO was
 * required.  If the block is not in the cache pass the read request
 * on to the spa with a substitute callback function, so that the
 * requested block will be added to the cache.
 *
 * If a read request arrives for a block that has a read in-progress,
 * either wait for the in-progress read to complete (and return the
 * results); or, if this is a read with a "done" func, add a record
 * to the read to invoke the "done" func when the read completes,
 * and return; or just return.
 *
 * arc_read_done() will invoke all the requested "done" functions
 * for readers of this block.
 *
 * Normal callers should use arc_read and pass the arc buffer and offset
 * for the bp.  But if you know you don't need locking, you can use
 * arc_read_bp.
*/ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { int err; if (pbuf == NULL) { /* * XXX This happens from traverse callback funcs, for * the objset_phys_t block. */ return (arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb)); } ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); rw_enter(&pbuf->b_data_lock, RW_READER); err = arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb); rw_exit(&pbuf->b_data_lock); return (err); } int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr; arc_buf_t *buf = NULL; kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = spa_guid(spa); top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), &hash_lock); if (hdr && hdr->b_datacnt > 0) { *arc_flags |= ARC_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { if (*arc_flags & ARC_WAIT) { cv_wait(&hdr->b_cv, hash_lock); mutex_exit(hash_lock); goto top; } ASSERT(*arc_flags & ARC_NOWAIT); if (done) { arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; hdr->b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); return (0); } mutex_exit(hash_lock); return (0); } ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); if (done) { add_reference(hdr, hash_lock, private); /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. 
*/ buf = hdr->b_buf; ASSERT(buf); ASSERT(buf->b_data); if (HDR_BUF_AVAILABLE(hdr)) { ASSERT(buf->b_efunc == NULL); hdr->b_flags &= ~ARC_BUF_AVAILABLE; } else { buf = arc_buf_clone(buf); } } else if (*arc_flags & ARC_PREFETCH && refcount_count(&hdr->b_refcnt) == 0) { hdr->b_flags |= ARC_PREFETCH; } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_L2CACHE) hdr->b_flags |= ARC_L2CACHE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits); if (done) done(NULL, buf, private); } else { uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; daddr_t addr = -1; boolean_t devw = B_FALSE; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } /* if this is a prefetch, we don't have a reference */ if (*arc_flags & ARC_PREFETCH) { (void) remove_reference(hdr, hash_lock, private); hdr->b_flags |= ARC_PREFETCH; } if (*arc_flags & ARC_L2CACHE) hdr->b_flags |= ARC_L2CACHE; if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_INDIRECT; } else { /* this block is in the ghost cache */ ASSERT(GHOST_STATE(hdr->b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); ASSERT(hdr->b_buf == NULL); /* if this is a prefetch, we don't have a reference */ if (*arc_flags & ARC_PREFETCH) hdr->b_flags |= ARC_PREFETCH; else add_reference(hdr, hash_lock, private); if (*arc_flags & ARC_L2CACHE) hdr->b_flags |= 
ARC_L2CACHE; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; hdr->b_buf = buf; ASSERT(hdr->b_datacnt == 0); hdr->b_datacnt = 1; arc_get_data_buf(buf); arc_access(hdr, hash_lock); } ASSERT(!GHOST_STATE(hdr->b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; ASSERT(hdr->b_acb == NULL); hdr->b_acb = acb; hdr->b_flags |= ARC_IO_IN_PROGRESS; if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr->b_dev->l2ad_writing; addr = hdr->b_l2hdr->b_daddr; /* * Lock out device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (hdr->b_l2hdr != NULL && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_buf = buf; cb->l2rcb_spa = spa; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). 
*/ rzio = zio_read_phys(pio, vd, addr, size, buf->b_data, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); return (0); } ASSERT(*arc_flags & ARC_WAIT); if (zio_wait(rzio) == 0) return (0); /* l2arc read error; goto zio_read() */ } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); if (l2arc_ndev != 0) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); if (*arc_flags & ARC_WAIT) return (zio_wait(rzio)); ASSERT(*arc_flags & ARC_NOWAIT); zio_nowait(rzio); } return (0); } void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); ASSERT(buf->b_hdr->b_state != arc_anon); ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); ASSERT(buf->b_efunc == NULL); ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); buf->b_efunc = func; buf->b_private = private; } /* * This is used by the DMU to let the ARC know that a buffer is * being evicted, so the ARC should clean up. If this arc buf * is not yet in the evicted state, it will be put there. */ int arc_buf_evict(arc_buf_t *buf) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; arc_buf_t **bufp; mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). 
*/ ASSERT(buf->b_data == NULL); mutex_exit(&buf->b_evict_lock); return (0); } else if (buf->b_data == NULL) { arc_buf_t copy = *buf; /* structure assignment */ /* * We are on the eviction list; process this buffer now * but let arc_do_user_evicts() do the reaping. */ buf->b_efunc = NULL; mutex_exit(&buf->b_evict_lock); VERIFY(copy.b_efunc(©) == 0); return (1); } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); /* * Pull this buffer off of the hdr */ bufp = &hdr->b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; *bufp = buf->b_next; ASSERT(buf->b_data != NULL); arc_buf_destroy(buf, FALSE, FALSE); if (hdr->b_datacnt == 0) { arc_state_t *old_state = hdr->b_state; arc_state_t *evicted_state; ASSERT(hdr->b_buf == NULL); ASSERT(refcount_is_zero(&hdr->b_refcnt)); evicted_state = (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; mutex_enter(&old_state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdr->b_flags |= ARC_IN_HASH_TABLE; hdr->b_flags &= ~ARC_BUF_AVAILABLE; mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&old_state->arcs_mtx); } mutex_exit(hash_lock); mutex_exit(&buf->b_evict_lock); VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; buf->b_private = NULL; buf->b_hdr = NULL; buf->b_next = NULL; kmem_cache_free(buf_cache, buf); return (1); } /* * Release this buffer from the cache. This must be done * after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. 
*/
void
arc_release(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock = NULL;
	l2arc_buf_hdr_t *l2hdr;
	uint64_t buf_size = 0;

	/*
	 * It would be nice to assert that if it's DMU metadata (level >
	 * 0 || it's the dnode file), then it must be syncing context.
	 * But we don't know that information at this level.
	 */

	mutex_enter(&buf->b_evict_lock);
	hdr = buf->b_hdr;

	/* this buffer is not on any list */
	ASSERT(refcount_count(&hdr->b_refcnt) > 0);

	if (hdr->b_state == arc_anon) {
		/* this buffer is already released */
		ASSERT(buf->b_efunc == NULL);
	} else {
		hash_lock = HDR_LOCK(hdr);
		mutex_enter(hash_lock);
		/* re-read b_hdr now that the hash lock is held */
		hdr = buf->b_hdr;
		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	}

	/*
	 * Detach any L2ARC header now; it is removed from the device
	 * list and freed at the end, while l2arc_buflist_mtx is still held.
	 */
	l2hdr = hdr->b_l2hdr;
	if (l2hdr) {
		mutex_enter(&l2arc_buflist_mtx);
		hdr->b_l2hdr = NULL;
		buf_size = hdr->b_size;
	}

	/*
	 * Do we have more than one buf?
	 */
	if (hdr->b_datacnt > 1) {
		arc_buf_hdr_t *nhdr;
		arc_buf_t **bufp;
		/* snapshot the fields needed after the hash lock is dropped */
		uint64_t blksz = hdr->b_size;
		uint64_t spa = hdr->b_spa;
		arc_buf_contents_t type = hdr->b_type;
		uint32_t flags = hdr->b_flags;

		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
		/*
		 * Pull the data off of this hdr and attach it to
		 * a new anonymous hdr.
		 */
		(void) remove_reference(hdr, hash_lock, tag);
		bufp = &hdr->b_buf;
		while (*bufp != buf)
			bufp = &(*bufp)->b_next;
		*bufp = buf->b_next;
		buf->b_next = NULL;

		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
		if (refcount_is_zero(&hdr->b_refcnt)) {
			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
			ASSERT3U(*size, >=, hdr->b_size);
			atomic_add_64(size, -hdr->b_size);
		}
		hdr->b_datacnt -= 1;
		arc_cksum_verify(buf);

		mutex_exit(hash_lock);

		/* build the new anonymous header that will own buf */
		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
		nhdr->b_size = blksz;
		nhdr->b_spa = spa;
		nhdr->b_type = type;
		nhdr->b_buf = buf;
		nhdr->b_state = arc_anon;
		nhdr->b_arc_access = 0;
		nhdr->b_flags = flags & ARC_L2_WRITING;
		nhdr->b_l2hdr = NULL;
		nhdr->b_datacnt = 1;
		nhdr->b_freeze_cksum = NULL;
		(void) refcount_add(&nhdr->b_refcnt, tag);
		buf->b_hdr = nhdr;
		mutex_exit(&buf->b_evict_lock);
		atomic_add_64(&arc_anon->arcs_size, blksz);
	} else {
		/* sole buf: make the existing header anonymous in place */
		mutex_exit(&buf->b_evict_lock);
		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
		ASSERT(!list_link_active(&hdr->b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		if (hdr->b_state != arc_anon)
			arc_change_state(arc_anon, hdr, hash_lock);
		hdr->b_arc_access = 0;
		if (hash_lock)
			mutex_exit(hash_lock);

		buf_discard_identity(hdr);
		arc_buf_thaw(buf);
	}
	buf->b_efunc = NULL;
	buf->b_private = NULL;

	if (l2hdr) {
		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
		mutex_exit(&l2arc_buflist_mtx);
	}
}

/*
 * Release this buffer.  If it does not match the provided BP, fill it
 * with that block's contents.
*/
/* ARGSUSED */
int
arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
    zbookmark_t *zb)
{
	/* bp, spa and zb are unused here: this only releases the buffer */
	arc_release(buf, tag);
	return (0);
}

/*
 * Return non-zero if buf still has data and its header is in the
 * anonymous state.  Sampled under b_evict_lock.
 */
int
arc_released(arc_buf_t *buf)
{
	int released;

	mutex_enter(&buf->b_evict_lock);
	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
	mutex_exit(&buf->b_evict_lock);
	return (released);
}

/*
 * Return non-zero if buf has an eviction callback (b_efunc) set.
 * Sampled under b_evict_lock.
 */
int
arc_has_callback(arc_buf_t *buf)
{
	int callback;

	mutex_enter(&buf->b_evict_lock);
	callback = (buf->b_efunc != NULL);
	mutex_exit(&buf->b_evict_lock);
	return (callback);
}

#ifdef ZFS_DEBUG
/*
 * Return the reference count of buf's header (debug builds only).
 * Sampled under b_evict_lock.
 */
int
arc_referenced(arc_buf_t *buf)
{
	int referenced;

	mutex_enter(&buf->b_evict_lock);
	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
	mutex_exit(&buf->b_evict_lock);
	return (referenced);
}
#endif

/*
 * "ready" stage callback given to zio_write() by arc_write().  Invokes
 * the caller's awcb_ready callback, (re)computes the buffer checksum
 * and marks the header as having I/O in progress.
 */
static void
arc_write_ready(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
	callback->awcb_ready(zio, buf, callback->awcb_private);

	/*
	 * If the IO is already in progress, then this is a re-write
	 * attempt, so we need to thaw and re-compute the cksum.
	 * It is the responsibility of the callback to handle the
	 * accounting for any re-write attempt.
	 */
	if (HDR_IO_IN_PROGRESS(hdr)) {
		mutex_enter(&hdr->b_freeze_lock);
		if (hdr->b_freeze_cksum != NULL) {
			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
			hdr->b_freeze_cksum = NULL;
		}
		mutex_exit(&hdr->b_freeze_lock);
	}
	arc_cksum_compute(buf, B_FALSE);
	hdr->b_flags |= ARC_IO_IN_PROGRESS;
}

/*
 * "done" stage callback given to zio_write() by arc_write().  On
 * success it records the block identity in the header and inserts the
 * header into the hash table; it then clears ARC_IO_IN_PROGRESS, calls
 * the caller's awcb_done callback and frees the callback record.
 */
static void
arc_write_done(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT(hdr->b_acb == NULL);

	if (zio->io_error == 0) {
		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
	} else {
		ASSERT(BUF_EMPTY(hdr));
	}

	/*
	 * If the block to be written was all-zero, we may have
	 * compressed it away.  In this case no write was performed
	 * so there will be no dva/birth/checksum.  The buffer must
	 * therefore remain anonymous (and uncached).
	 */
	if (!BUF_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		ASSERT(zio->io_error == 0);

		arc_cksum_verify(buf);

		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
					panic("bad overwrite, hdr=%p exists=%p",
					    (void *)hdr, (void *)exists);
				ASSERT(refcount_is_zero(&exists->b_refcnt));
				/* evict the stale header, then insert ours */
				arc_change_state(arc_anon, exists, hash_lock);
				mutex_exit(hash_lock);
				arc_hdr_destroy(exists);
				exists = buf_hash_insert(hdr, &hash_lock);
				ASSERT3P(exists, ==, NULL);
			} else {
				/* Dedup */
				ASSERT(hdr->b_datacnt == 1);
				ASSERT(hdr->b_state == arc_anon);
				ASSERT(BP_GET_DEDUP(zio->io_bp));
				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
			}
		}
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		/* if it's not anon, we are doing a scrub */
		if (!exists && hdr->b_state == arc_anon)
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else {
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
	}

	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
	callback->awcb_done(zio, buf, callback->awcb_private);

	kmem_free(callback, sizeof (arc_write_callback_t));
}

/*
 * Create (but do not issue) a write zio for buf.  The ready and done
 * callbacks are stored in an arc_write_callback_t that arc_write_done()
 * frees; both must be non-NULL.  When l2arc is set the header is
 * flagged ARC_L2CACHE.  Returns the zio created by zio_write().
 */
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
    arc_done_func_t *ready, arc_done_func_t *done, void *private,
    int priority, int zio_flags, const zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	arc_write_callback_t *callback;
	zio_t *zio;

	ASSERT(ready != NULL);
	ASSERT(done != NULL);
	ASSERT(!HDR_IO_ERROR(hdr));
	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
	ASSERT(hdr->b_acb == NULL);
	if (l2arc)
		hdr->b_flags |= ARC_L2CACHE;
	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
	callback->awcb_ready = ready;
	callback->awcb_done = done;
	callback->awcb_private = private;
	callback->awcb_buf = buf;

	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);

	return (zio);
}

/*
 * Decide whether a write reservation must be throttled because memory
 * is tight.  Returns 0 to proceed, or ERESTART / EAGAIN to make the
 * caller back off.  Outside the kernel build this always returns 0.
 * page_load and last_txg are function-static and persist across calls.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{
#ifdef _KERNEL
	uint64_t available_memory = ptob(freemem);
	static uint64_t page_load = 0;
	static uint64_t last_txg = 0;

#if defined(__i386)
	available_memory =
	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
	if (available_memory >= zfs_write_limit_max)
		return (0);

	/* a newer txg starts with a fresh page_load count */
	if (txg > last_txg) {
		last_txg = txg;
		page_load = 0;
	}
	/*
	 * If we are in pageout, we know that memory is already tight,
	 * the arc is already going to be evicting, so we just want to
	 * continue to let page writes occur as quickly as possible.
	 */
	if (curproc == proc_pageout) {
		if (page_load > MAX(ptob(minfree), available_memory) / 4)
			return (ERESTART);
		/* Note: reserve is inflated, so we deflate */
		page_load += reserve / 8;
		return (0);
	} else if (page_load > 0 && arc_reclaim_needed()) {
		/* memory is low, delay before restarting */
		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
		return (EAGAIN);
	}
	page_load = 0;

	/* count evictable cache above arc_c_min as available */
	if (arc_size > arc_c_min) {
		uint64_t evictable_memory =
		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
	}

	if (inflight_data > available_memory / 4) {
		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
		return (ERESTART);
	}
#endif
	return (0);
}

/*
 * Give back `reserve' bytes previously added to arc_tempreserve.
 */
void
arc_tempreserve_clear(uint64_t reserve)
{
	atomic_add_64(&arc_tempreserve, -reserve);
	ASSERT((int64_t)arc_tempreserve >= 0);
}

/*
 * Try to reserve `reserve' bytes for a write in the given txg.
 * Returns 0 and adds the amount to arc_tempreserve on success;
 * otherwise returns ENOMEM, ERESTART, or the error from
 * arc_memory_throttle() without reserving anything.
 */
int
arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{
	int error;
	uint64_t anon_size;

#ifdef ZFS_DEBUG
	/*
	 * Once in a while, fail for no reason.  Everything should cope.
	 */
	if (spa_get_random(10000) == 0) {
		dprintf("forcing random failure\n");
		return (ERESTART);
	}
#endif
	/* a large request may raise the cache target, up to arc_c_max */
	if (reserve > arc_c/4 && !arc_no_grow)
		arc_c = MIN(arc_c_max, reserve * 4);
	if (reserve > arc_c)
		return (ENOMEM);

	/*
	 * Don't count loaned bufs as in flight dirty data to prevent long
	 * network delays from blocking transactions that are ready to be
	 * assigned to a txg.
	 */
	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);

	/*
	 * Writes will, almost always, require additional memory allocations
	 * in order to compress/encrypt/etc the data.  We therefore need to
	 * make sure that there is sufficient available memory for this.
	 */
	if ((error = arc_memory_throttle(reserve, anon_size, txg)))
		return (error);

	/*
	 * Throttle writes when the amount of dirty data in the cache
	 * gets too large.  We try to keep the cache less than half full
	 * of dirty blocks so that our sync times don't grow too large.
	 * Note: if two requests come in concurrently, we might let them
	 * both succeed, when one of them should fail.  Not a huge deal.
	 */

	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
	    anon_size > arc_c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
		    arc_tempreserve>>10,
		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
		    reserve>>10, arc_c>>10);
		return (ERESTART);
	}
	atomic_add_64(&arc_tempreserve, reserve);
	return (0);
}

void
arc_init(void)
{
	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);

	/* Convert seconds to clock ticks */
	arc_min_prefetch_lifespan = 1 * hz;

	/* Start out with 1/8 of all memory */
	arc_c = physmem * PAGESIZE / 8;

#ifdef _KERNEL
	/*
	 * On architectures where the physical memory can be larger
	 * than the addressable space (intel in 32-bit mode), we may
	 * need to limit the cache to 1/8 of VM size.
*/ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); #endif /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ arc_c_min = MAX(arc_c / 4, 64<<20); /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ if (arc_c * 8 >= 1<<30) arc_c_max = (arc_c * 8) - (1<<30); else arc_c_max = arc_c_min; arc_c_max = MAX(arc_c * 6, arc_c_max); /* * Allow the tunables to override our calculations if they are * reasonable (ie. over 64MB) */ if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) arc_c_max = zfs_arc_max; if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; arc_c = arc_c_max; arc_p = (arc_c >> 1); /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; /* Allow the tunable to override if it is reasonable */ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; if (zfs_arc_shrink_shift > 0) arc_shrink_shift = zfs_arc_shrink_shift; if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_anon = &ARC_anon; arc_mru = &ARC_mru; arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; arc_l2c_only = &ARC_l2c_only; arc_size = 0; mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, 
b_arc_node)); list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); buf_init(); arc_thread_exit = 0; arc_eviction_list = NULL; mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; kstat_install(arc_ksp); } (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, minclsyspri); arc_dead = FALSE; arc_warm = B_FALSE; if (zfs_write_limit_max == 0) zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; else zfs_write_limit_shift = 0; mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); } void arc_fini(void) { mutex_enter(&arc_reclaim_thr_lock); arc_thread_exit = 1; while (arc_thread_exit != 0) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); mutex_exit(&arc_reclaim_thr_lock); arc_flush(NULL); arc_dead = TRUE; if (arc_ksp != 
NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } mutex_destroy(&arc_eviction_mtx); mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); mutex_destroy(&arc_anon->arcs_mtx); mutex_destroy(&arc_mru->arcs_mtx); mutex_destroy(&arc_mru_ghost->arcs_mtx); mutex_destroy(&arc_mfu->arcs_mtx); mutex_destroy(&arc_mfu_ghost->arcs_mtx); mutex_destroy(&arc_l2c_only->arcs_mtx); mutex_destroy(&zfs_write_limit_lock); buf_fini(); ASSERT(arc_loaned_bytes == 0); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. 
*                  : L2ARC   :    |-_____-|
 *                 : devices :    | Disks |
 *                 +=========+    `-_____-'
 *
 * Read requests are satisfied from the following sources, in order:
 *
 *	1) ARC
 *	2) vdev cache of L2ARC devices
 *	3) L2ARC devices
 *	4) vdev cache of disks
 *	5) disks
 *
 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this, there are some significant differences between
 * the L2ARC and traditional cache design:
 *
 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
 * the ARC behave as usual, freeing buffers and placing headers on ghost
 * lists.  The ARC does not send buffers to the L2ARC during eviction as
 * this would add inflated write latencies for all ARC memory pressure.
 *
 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
 * It does this by periodically scanning buffers from the eviction-end of
 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
 * not already there.  It scans until a headroom of buffers is satisfied,
 * which itself is a buffer for ARC eviction.  The thread that does this is
 * l2arc_feed_thread(), illustrated below; example sizes are included to
 * provide a better sense of ratio than this diagram:
 *
 *	       head -->                        tail
 *	        +---------------------+----------+
 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
 *	        +---------------------+----------+   |   o L2ARC eligible
 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
 *	        +---------------------+----------+   |
 *	             15.9 Gbytes      ^ 32 Mbytes    |
 *	                           headroom          |
 *	                                      l2arc_feed_thread()
 *	                                             |
 *	                 l2arc write hand <--[oooo]--'
 *	                         |           8 Mbyte
 *	                         |          write max
 *	                         V
 *		  +==============================+
 *	L2ARC dev |####|#|###|###|    |####| ... |
 *	          +==============================+
 *	                     32 Gbytes
 *
 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
 * evicted, then the L2ARC has cached a buffer much sooner than it probably
 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
 * safe to say that this is an uncommon case, since buffers at the end of
 * the ARC lists have moved there due to inactivity.
 *
 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
 * then the L2ARC simply misses copying some buffers.  This serves as a
 * pressure valve to prevent heavy read workloads from both stalling the ARC
 * with waits and clogging the L2ARC with writes.  This also helps prevent
 * the potential for the L2ARC to churn if it attempts to cache content too
 * quickly, such as during backups of the entire pool.
 *
 * 5. After system boot and before the ARC has filled main memory, there are
 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from the tail of
 * these lists as pictured, the l2arc_feed_thread() will search from the list
 * heads for eligible buffers, greatly increasing its chance of finding them.
 *
 * The L2ARC device write speed is also boosted during this time so that
 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
 * there are no L2ARC reads, and no fear of degrading read performance
 * through increased writes.
 *
 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
 * the vdev queue can aggregate them into larger and fewer writes.  Each
 * device is written to in a rotor fashion, sweeping writes through
 * available space then repeating.
 *
 * 7. The L2ARC does not store dirty content.  It never needs to flush
 * write buffers back to disk based storage.
 *
 * 8. If an ARC buffer is written (and dirtied) which also exists in the
 * L2ARC, the now stale L2ARC buffer is immediately dropped.
* * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size; size = dev->l2ad_write; if (arc_warm == B_FALSE) size += dev->l2ad_boost; return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. 
*/ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } static void l2arc_hdr_stat_add(void) { ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); } static void l2arc_hdr_stat_remove(void) { ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; } while (vdev_is_dead(next->l2ad_vdev)); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev)) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. 
*/
static void
l2arc_do_free_on_write(void)
{
	list_t *buflist;
	l2arc_data_free_t *df, *df_prev;

	mutex_enter(&l2arc_free_on_write_mtx);
	buflist = l2arc_free_on_write;

	/*
	 * Walk from the tail, fetching the predecessor before the current
	 * entry is removed and freed.
	 */
	for (df = list_tail(buflist); df; df = df_prev) {
		df_prev = list_prev(buflist, df);
		ASSERT(df->l2df_data != NULL);
		ASSERT(df->l2df_func != NULL);
		/* Release the data with the free function recorded for it. */
		df->l2df_func(df->l2df_data, df->l2df_size);
		list_remove(buflist, df);
		kmem_free(df, sizeof (l2arc_data_free_t));
	}

	mutex_exit(&l2arc_free_on_write_mtx);
}

/*
 * A write to a cache device has completed.  Update all headers to allow
 * reads from these buffers to begin.
 */
static void
l2arc_write_done(zio_t *zio)
{
	l2arc_write_callback_t *cb;
	l2arc_dev_t *dev;
	list_t *buflist;
	arc_buf_hdr_t *head, *ab, *ab_prev;
	l2arc_buf_hdr_t *abl2;
	kmutex_t *hash_lock;

	cb = zio->io_private;
	ASSERT(cb != NULL);
	dev = cb->l2wcb_dev;
	ASSERT(dev != NULL);
	head = cb->l2wcb_head;
	ASSERT(head != NULL);
	buflist = dev->l2ad_buflist;
	ASSERT(buflist != NULL);
	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
	    l2arc_write_callback_t *, cb);

	if (zio->io_error != 0)
		ARCSTAT_BUMP(arcstat_l2_writes_error);

	mutex_enter(&l2arc_buflist_mtx);

	/*
	 * All writes completed, or an error was hit.
	 */
	/* Start at the entry preceding the dummy write-head marker. */
	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
		ab_prev = list_prev(buflist, ab);

		hash_lock = HDR_LOCK(ab);
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * This buffer misses out.  It may be in a stage
			 * of eviction.  Its ARC_L2_WRITING flag will be
			 * left set, denying reads to this buffer.
			 */
			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
			continue;
		}

		if (zio->io_error != 0) {
			/*
			 * Error - drop L2ARC entry.
			 */
			/*
			 * NOTE(review): arcstat_l2_size is decremented by
			 * b_size here, while l2arc_write_buffers() adds
			 * write_sz, which is accumulated from
			 * vdev_psize_to_asize() results -- confirm the two
			 * are meant to use the same units.
			 */
			list_remove(buflist, ab);
			abl2 = ab->b_l2hdr;
			ab->b_l2hdr = NULL;
			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
		}

		/*
		 * Allow ARC to begin reads to this L2ARC entry.
		 */
		ab->b_flags &= ~ARC_L2_WRITING;

		mutex_exit(hash_lock);
	}

	/* Retire the dummy head inserted by l2arc_write_buffers(). */
	atomic_inc_64(&l2arc_writes_done);
	list_remove(buflist, head);
	kmem_cache_free(hdr_cache, head);
	mutex_exit(&l2arc_buflist_mtx);

	l2arc_do_free_on_write();

	kmem_free(cb, sizeof (l2arc_write_callback_t));
}

/*
 * A read to a cache device completed.  Validate buffer contents before
 * handing over to the regular ARC routines.
 */
static void
l2arc_read_done(zio_t *zio)
{
	l2arc_read_callback_t *cb;
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	int equal;

	ASSERT(zio->io_vd != NULL);
	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

	/* Drop the SCL_L2ARC config lock that was tagged with this vdev. */
	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

	cb = zio->io_private;
	ASSERT(cb != NULL);
	buf = cb->l2rcb_buf;
	ASSERT(buf != NULL);

	/*
	 * b_hdr is read again once the lock is held, and the assertion
	 * checks that the lock taken is still the one for that header.
	 */
	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

	/*
	 * Check this survived the L2ARC journey.
	 */
	equal = arc_cksum_equal(buf);
	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
		mutex_exit(hash_lock);
		zio->io_private = buf;
		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
		arc_read_done(zio);
	} else {
		mutex_exit(hash_lock);
		/*
		 * Buffer didn't survive caching.  Increment stats and
		 * reissue to the original storage device.
		 */
		if (zio->io_error != 0) {
			ARCSTAT_BUMP(arcstat_l2_io_error);
		} else {
			/* No I/O error was reported: record one as EIO. */
			zio->io_error = EIO;
		}
		if (!equal)
			ARCSTAT_BUMP(arcstat_l2_cksum_bad);

		/*
		 * If there's no waiter, issue an async i/o to the primary
		 * storage now.  If there *is* a waiter, the caller must
		 * issue the i/o in a context where it's OK to block.
		 */
		if (zio->io_waiter == NULL) {
			zio_t *pio = zio_unique_parent(zio);

			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
			    buf->b_data, zio->io_size, arc_read_done, buf,
			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
		}
	}

	kmem_free(cb, sizeof (l2arc_read_callback_t));
}

/*
 * This is the list priority from which the L2ARC will search for pages to
 * cache.  This is used within loops (0..3) to cycle through lists in the
 * desired order.  This order can have a significant effect on cache
 * performance.
 *
 * Currently the metadata lists are hit first, MFU then MRU, followed by
 * the data lists.  This function returns a locked list, and also returns
 * the lock pointer.
 */
static list_t *
l2arc_list_locked(int list_num, kmutex_t **lock)
{
	list_t *list = NULL;

	ASSERT(list_num >= 0 && list_num <= 3);

	/*
	 * NOTE(review): there is no default case.  If list_num were outside
	 * 0..3 and the ASSERT above compiled to nothing, *lock would not be
	 * written here and 'list' would stay NULL -- confirm that callers
	 * can only pass 0..3.
	 */
	switch (list_num) {
	case 0:
		list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
		*lock = &arc_mfu->arcs_mtx;
		break;
	case 1:
		list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
		*lock = &arc_mru->arcs_mtx;
		break;
	case 2:
		list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
		*lock = &arc_mfu->arcs_mtx;
		break;
	case 3:
		list = &arc_mru->arcs_list[ARC_BUFC_DATA];
		*lock = &arc_mru->arcs_mtx;
		break;
	}

	ASSERT(!(MUTEX_HELD(*lock)));
	mutex_enter(*lock);
	return (list);
}

/*
 * Evict buffers from the device write hand to the distance specified in
 * bytes.  This distance may span populated buffers, it may span nothing.
 * This is clearing a region on the L2ARC device ready for writing.
 * If the 'all' boolean is set, every buffer is evicted.
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
	list_t *buflist;
	l2arc_buf_hdr_t *abl2;
	arc_buf_hdr_t *ab, *ab_prev;
	kmutex_t *hash_lock;
	uint64_t taddr;

	buflist = dev->l2ad_buflist;

	if (buflist == NULL)
		return;

	if (!all && dev->l2ad_first) {
		/*
		 * This is the first sweep through the device.  There is
		 * nothing to evict.
		 */
		return;
	}

	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
		/*
		 * When nearing the end of the device, evict to the end
		 * before the device write hand jumps to the start.
		 */
		taddr = dev->l2ad_end;
	} else {
		taddr = dev->l2ad_hand + distance;
	}
	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
	    uint64_t, taddr, boolean_t, all);

top:
	mutex_enter(&l2arc_buflist_mtx);
	for (ab = list_tail(buflist); ab; ab = ab_prev) {
		ab_prev = list_prev(buflist, ab);

		hash_lock = HDR_LOCK(ab);
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  Retry.
			 */
			/*
			 * The buflist lock is dropped first, then the hash
			 * lock is taken and released (blocking until its
			 * holder is done) before the scan restarts from the
			 * tail.
			 */
			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
			mutex_exit(&l2arc_buflist_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		if (HDR_L2_WRITE_HEAD(ab)) {
			/*
			 * We hit a write head node.  Leave it for
			 * l2arc_write_done().
			 */
			list_remove(buflist, ab);
			mutex_exit(hash_lock);
			continue;
		}

		if (!all && ab->b_l2hdr != NULL &&
		    (ab->b_l2hdr->b_daddr > taddr ||
		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
			/*
			 * We've evicted to the target address,
			 * or the end of the device.
			 */
			mutex_exit(hash_lock);
			break;
		}

		if (HDR_FREE_IN_PROGRESS(ab)) {
			/*
			 * Already on the path to destruction.
			 */
			mutex_exit(hash_lock);
			continue;
		}

		if (ab->b_state == arc_l2c_only) {
			ASSERT(!HDR_L2_READING(ab));
			/*
			 * This doesn't exist in the ARC.  Destroy.
			 * arc_hdr_destroy() will call list_remove()
			 * and decrement arcstat_l2_size.
			 */
			arc_change_state(arc_anon, ab, hash_lock);
			arc_hdr_destroy(ab);
		} else {
			/*
			 * Invalidate issued or about to be issued
			 * reads, since we may be about to write
			 * over this location.
			 */
			if (HDR_L2_READING(ab)) {
				ARCSTAT_BUMP(arcstat_l2_evict_reading);
				ab->b_flags |= ARC_L2_EVICTED;
			}

			/*
			 * Tell ARC this no longer exists in L2ARC.
			 */
			if (ab->b_l2hdr != NULL) {
				abl2 = ab->b_l2hdr;
				ab->b_l2hdr = NULL;
				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
			}
			list_remove(buflist, ab);

			/*
			 * This may have been leftover after a
			 * failed write.
			 */
			ab->b_flags &= ~ARC_L2_WRITING;
		}
		mutex_exit(hash_lock);
	}
	mutex_exit(&l2arc_buflist_mtx);

	/* Report the span from the previous evict point to taddr. */
	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
	dev->l2ad_evict = taddr;
}

/*
 * Find and write ARC buffers to the L2ARC device.
 *
 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
 */
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
	arc_buf_hdr_t *ab, *ab_prev, *head;
	l2arc_buf_hdr_t *hdrl2;
	list_t *list;
	uint64_t passed_sz, write_sz, buf_sz, headroom;
	void *buf_data;
	kmutex_t *hash_lock, *list_lock = NULL;
	boolean_t have_lock, full;
	l2arc_write_callback_t *cb;
	zio_t *pio, *wzio;
	uint64_t guid = spa_guid(spa);
	int try;

	ASSERT(dev->l2ad_vdev != NULL);

	pio = NULL;
	write_sz = 0;
	full = B_FALSE;
	/* Dummy header; it is put on the buflist only if something is written. */
	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
	head->b_flags |= ARC_L2_WRITE_HEAD;

	/*
	 * Copy buffers for L2ARC writing.
	 */
	mutex_enter(&l2arc_buflist_mtx);
	/* try selects the list in l2arc_list_locked() priority order. */
	for (try = 0; try <= 3; try++) {
		list = l2arc_list_locked(try, &list_lock);
		passed_sz = 0;

		/*
		 * L2ARC fast warmup.
		 *
		 * Until the ARC is warm and starts to evict, read from the
		 * head of the ARC lists rather than the tail.
		 */
		headroom = target_sz * l2arc_headroom;
		if (arc_warm == B_FALSE)
			ab = list_head(list);
		else
			ab = list_tail(list);

		for (; ab; ab = ab_prev) {
			if (arc_warm == B_FALSE)
				ab_prev = list_next(list, ab);
			else
				ab_prev = list_prev(list, ab);

			/*
			 * NOTE(review): when have_lock is true the hash lock
			 * was already held before this iteration, yet the
			 * paths below still call mutex_exit(hash_lock) --
			 * confirm this cannot release a lock that was taken
			 * elsewhere.
			 */
			hash_lock = HDR_LOCK(ab);
			have_lock = MUTEX_HELD(hash_lock);
			if (!have_lock && !mutex_tryenter(hash_lock)) {
				/*
				 * Skip this buffer rather than waiting.
				 */
				continue;
			}

			passed_sz += ab->b_size;
			if (passed_sz > headroom) {
				/*
				 * Searched too far.
				 */
				mutex_exit(hash_lock);
				break;
			}

			if (!l2arc_write_eligible(guid, ab)) {
				mutex_exit(hash_lock);
				continue;
			}

			if ((write_sz + ab->b_size) > target_sz) {
				full = B_TRUE;
				mutex_exit(hash_lock);
				break;
			}

			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
				 */
				list_insert_head(dev->l2ad_buflist, head);

				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				pio = zio_root(spa, l2arc_write_done, cb,
				    ZIO_FLAG_CANFAIL);
			}

			/*
			 * Create and add a new L2ARC header.
			 */
			hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
			hdrl2->b_dev = dev;
			hdrl2->b_daddr = dev->l2ad_hand;

			ab->b_flags |= ARC_L2_WRITING;
			ab->b_l2hdr = hdrl2;
			list_insert_head(dev->l2ad_buflist, ab);
			buf_data = ab->b_buf->b_data;
			buf_sz = ab->b_size;

			/*
			 * Compute and store the buffer cksum before
			 * writing.  On debug the cksum is verified first.
			 */
			arc_cksum_verify(ab->b_buf);
			arc_cksum_compute(ab->b_buf, B_TRUE);

			mutex_exit(hash_lock);

			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);
			(void) zio_nowait(wzio);

			/*
			 * Keep the clock hand suitably device-aligned.
			 */
			buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);

			write_sz += buf_sz;
			dev->l2ad_hand += buf_sz;
		}

		mutex_exit(list_lock);

		if (full == B_TRUE)
			break;
	}
	mutex_exit(&l2arc_buflist_mtx);

	/* Nothing was eligible: the dummy head was never inserted. */
	if (pio == NULL) {
		ASSERT3U(write_sz, ==, 0);
		kmem_cache_free(hdr_cache, head);
		return (0);
	}

	ASSERT3U(write_sz, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
	ARCSTAT_INCR(arcstat_l2_size, write_sz);
	vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
	 */
	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
		vdev_space_update(dev->l2ad_vdev,
		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_evict = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
	}

	/* l2ad_writing is set for the duration of the zio_wait(). */
	dev->l2ad_writing = B_TRUE;
	(void) zio_wait(pio);
	dev->l2ad_writing = B_FALSE;

	return (write_sz);
}

/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		/* Sleep until 'next' or until signalled (see l2arc_stop()). */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next);
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		/* Default wake-up for the 'continue' paths: one second on. */
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa.   This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal.  l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT(spa != NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size(dev);

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size);

		/*
		 * Calculate interval between writes.
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	/* Acknowledge the exit request so that l2arc_stop() can return. */
	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}

/*
 * Return B_TRUE if 'vd' is on the global list of L2ARC devices.  The list
 * is searched under l2arc_dev_mtx.
 */
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	/* dev is non-NULL only if the loop stopped on a match. */
	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC.  By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	adddev->l2ad_write = l2arc_write_max;
	adddev->l2ad_boost = l2arc_write_boost;
	/* The usable region begins after VDEV_LABEL_START_SIZE bytes. */
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_evict = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;
	list_link_init(&adddev->l2ad_node);
	ASSERT3U(adddev->l2ad_write, >, 0);

	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);

	/*
	 * Add device to global list
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}

/*
 * Remove a vdev from the L2ARC.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	/* The vdev is expected to be on the list. */
	ASSERT(remdev != NULL);

	/*
	 * Remove device from global list
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;		/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references.  L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(remdev->l2ad_buflist);
	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

/*
 * Initialize L2ARC global state: counters, the feed-thread lock and
 * condition variable, the device and buflist locks, and the global device
 * and free-on-write lists.  The feed thread itself is started separately
 * by l2arc_start().
 */
void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

/*
 * Release L2ARC global state set up by l2arc_init(): process any pending
 * free-on-write entries, then destroy the locks, condition variable and
 * lists.
 */
void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini();
	 * Because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_buflist_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

/*
 * Start the L2ARC feed thread.  Nothing is done when FWRITE is not set in
 * spa_mode_global.
 */
void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

/*
 * Ask the L2ARC feed thread to exit and wait until it has acknowledged by
 * resetting l2arc_thread_exit to 0.  Nothing is done when FWRITE is not
 * set in spa_mode_global (the same condition under which l2arc_start()
 * does not create the thread).
 */
void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index c1b27d4ef338..fb7d0ac86eca 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1,2806 +1,2818 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include struct dbuf_hold_impl_data { /* Function arguments */ dnode_t *dh_dn; uint8_t dh_level; uint64_t dh_blkid; int dh_fail_sparse; void *dh_tag; dmu_buf_impl_t **dh_dbp; /* Local variables */ dmu_buf_impl_t *dh_db; dmu_buf_impl_t *dh_parent; blkptr_t *dh_bp; int dh_err; dbuf_dirty_record_t *dh_dr; arc_buf_contents_t dh_type; int dh_depth; }; static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, void *tag, dmu_buf_impl_t **dbp, int depth); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_cache; /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { dmu_buf_impl_t *db = vdb; bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); refcount_create(&db->db_holds); list_link_init(&db->db_link); return (0); } /* ARGSUSED */ static void dbuf_dest(void *vdb, void *unused) { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; static uint64_t dbuf_hash_count; static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { uintptr_t osv = (uintptr_t)os; uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 
8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); return (crc); } /* * NOTE(review): DBUF_HASH expands with a trailing ';'. Every use visible * here is a complete assignment statement, so the extra ';' only adds an * empty statement, but the macro cannot be used inside an expression. */ #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) /* * Look up the dbuf identified by dn's objset and object number, level and * blkid in the hash table. A match is returned with its db_mtx held; * entries in the DB_EVICTING state are skipped. Returns NULL when no * usable entry is found. The hash chain mutex is never held on return. */ dmu_buf_impl_t * dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = dn->dn_objset; uint64_t obj; uint64_t hv; uint64_t idx; dmu_buf_impl_t *db; obj = dn->dn_object; hv = DBUF_HASH(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * In both cases the db_mtx of the element concerned (the existing * element that is returned, or db itself once inserted) is held on return. 
*/ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid, hv, idx; dmu_buf_impl_t *dbf; blkid = db->db_blkid; hv = DBUF_HASH(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_add_64(&dbuf_hash_count, 1); return (NULL); } /* * Remove an entry from the hash table. The db must have no holds, be * in the DB_EVICTING state, and the caller must not hold db_mtx; these * preconditions are checked with ASSERTs rather than reported as errors. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv, idx; dmu_buf_impl_t *dbf, **dbp; hv = DBUF_HASH(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); idx = hv & h->hash_table_mask; /* * We mustn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. 
*/ ASSERT(refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; /* * NOTE(review): the ASSERT below runs after dbf was already used to form * &dbf->db_hash_next; asserting before that would be the safer order. */ while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_add_64(&dbuf_hash_count, -1); } static arc_evict_func_t dbuf_do_evict; /* * Run the user eviction callback of a level-0 dbuf, if one is registered, * then forget the user pointer, user data pointer and callback. Before the * callback, *db_user_data_ptr_ptr (when set) is pointed at db.db_data. * The caller must hold db_mtx. */ static void dbuf_evict_user(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level != 0 || db->db_evict_func == NULL) return; if (db->db_user_data_ptr_ptr) *db->db_user_data_ptr_ptr = db->db.db_data; db->db_evict_func(&db->db, db->db_user_ptr); db->db_user_ptr = NULL; db->db_user_data_ptr_ptr = NULL; db->db_evict_func = NULL; } /* * Report whether a dbuf holds metadata: any dbuf above level 0 does; a * level-0 dbuf does when dmu_ot[] marks its dnode's type as ot_metadata. */ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { if (db->db_level > 0) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata; DB_DNODE_EXIT(db); return (is_metadata); } } /* * Clear and destroy a dbuf. The caller holds db_mtx, and the dbuf must * have no ARC buffer attached (db_buf == NULL) and no db_data_pending. */ void dbuf_evict(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_buf == NULL); ASSERT(db->db_data_pending == NULL); dbuf_clear(db); dbuf_destroy(db); } /* * Size and allocate the global dbuf hash table, create the dmu_buf_impl_t * kmem cache and initialize the DBUF_MUTEXES hash chain mutexes. */ void dbuf_init(void) { uint64_t hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; int i; /* * The hash table is big enough to fill all of physical memory * with an average 4K block size. The table will take up * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 
*/ while (hsize * 4096 < physmem * PAGESIZE) hsize <<= 1; retry: h->hash_table_mask = hsize - 1; +#if defined(_KERNEL) && defined(HAVE_SPL) + /* Large allocations which do not require contiguous pages + * should be using vmem_alloc() in the Linux kernel */ + h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); +#else h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); +#endif if (h->hash_table == NULL) { /* XXX - we should really return an error instead of assert. NOTE(review): presumably unreachable in the vmem_zalloc(KM_SLEEP) build, where the allocation is expected to sleep rather than return NULL - confirm. */ ASSERT(hsize > (1ULL << 10)); hsize >>= 1; goto retry; } dbuf_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); } /* * Undo dbuf_init(): destroy the hash chain mutexes, free the hash table * with the allocator family that dbuf_init() used, and destroy the cache. */ void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; int i; for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); +#if defined(_KERNEL) && defined(HAVE_SPL) + /* Large allocations which do not require contiguous pages + * should be using vmem_free() in the Linux kernel */ + vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); +#else kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); +#endif kmem_cache_destroy(dbuf_cache); } /* * Other stuff. 
*/ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !list_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, 0); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. 
*/ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ ASSERTV(int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT); ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. 
*/ if (db->db_dirtycnt == 0) { ASSERTV(uint64_t *buf = db->db.db_data); int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } } DB_DNODE_EXIT(db); } #endif static void dbuf_update_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level == 0 && db->db_user_data_ptr_ptr) { ASSERT(!refcount_is_zero(&db->db_holds)); *db->db_user_data_ptr_ptr = db->db.db_data; } } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); db->db_buf = buf; if (buf != NULL) { ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; if (!arc_released(buf)) arc_set_callback(buf, dbuf_do_evict, db); dbuf_update_data(db); } else { dbuf_evict_user(db); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; } } /* * Loan out an arc_buf for read. Return the loaned arc_buf. */ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa; mutex_exit(&db->db_mtx); DB_GET_SPA(&spa, db); abuf = arc_loan_buf(spa, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); dbuf_set_data(db, NULL); mutex_exit(&db->db_mtx); } return (abuf); } uint64_t dbuf_whichblock(dnode_t *dn, uint64_t offset) { if (dn->dn_datablkshift) { return (offset >> dn->dn_datablkshift); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } static void dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); if (db->db_level == 0 && db->db_freed_in_flight) { /* we were freed in flight; disregard any error 
*/ arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else if (zio == NULL || zio->io_error == 0) { dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); VERIFY(arc_buf_remove_ref(buf, db) == 1); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL); } static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { dnode_t *dn; spa_t *spa; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; arc_buf_t *pbuf; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. */ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } /* * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). 
*/ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); DB_DNODE_EXIT(db); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; mutex_exit(&db->db_mtx); return; } spa = dn->dn_objset->os_spa; DB_DNODE_EXIT(db); db->db_state = DB_READ; mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_L2CACHE; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ if (db->db_parent) pbuf = db->db_parent->db_buf; else pbuf = db->db_objset->os_phys_buf; (void) dsl_read(zio, spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); if (aflags & ARC_CACHED) *flags |= DB_RF_CACHED; } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; int havepzio = (zio != NULL); int prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. 
*/ ASSERT(!refcount_is_zero(&db->db_holds)); if (db->db_state == DB_NOFILL) return (EIO); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; if (zio == NULL) zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_read_impl(db, zio, &flags); /* dbuf_read_impl has dropped db_mtx for us */ if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, flags & DB_RF_CACHED); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); if (!havepzio) err = zio_wait(zio); } else { mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = EIO; } mutex_exit(&db->db_mtx); } ASSERT(err || havepzio || db->db_state == DB_CACHED); return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa; ASSERT(db->db_buf 
== NULL); ASSERT(db->db.db_data == NULL); DB_GET_SPA(&spa, db); dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_set_data(db, NULL); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } /* * This is our just-in-time copy function. It makes a copy of * buffers that have been modified in a previous transaction * group, before we modify them in the current active group. * * This function is used in two places: when we are dirtying a * buffer for the first time in a txg, and when we are freeing * a range in a dnode that includes this buffer. * * Note that when we are called from dbuf_free_range() we do * not put a hold on the buffer, we just traverse the active * dbuf list for the dnode. */ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); ASSERT(db->db_level == 0); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || (dr->dt.dl.dr_data != ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and it's referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there are no active holders) * just null out the current db_data pointer. 
*/ ASSERT(dr->dr_txg >= txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa; DB_GET_SPA(&spa, db); dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { dbuf_set_data(db, NULL); } } /* * Undo the override recorded in a level-0 dirty record: free the * overriding block pointer unless it is a hole, reset dr_override_state * to DR_NOT_OVERRIDDEN and arc_release() the record's data buffer. * Does nothing for bonus buffers or records that are not overridden. * The caller must hold db_mtx. */ void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ if (!BP_IS_HOLE(bp)) { spa_t *spa; DB_GET_SPA(&spa, db); zio_free(spa, txg, bp); } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do * another (redundant) arc_release(). Therefore, leave * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ arc_release(dr->dt.dl.dr_data, db); } /* * Evict (if it's unreferenced) or clear (if it's referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. Also, if we happen across any level-1 dbufs in the * range that have not already been marked dirty, mark them dirty so * they stay in memory. 
*/ void dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t first_l1 = start >> epbs; uint64_t last_l1 = end >> epbs; if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { end = dn->dn_maxblkid; last_l1 = end >> epbs; } dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level == 1 && db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { mutex_enter(&db->db_mtx); if (db->db_last_dirty && db->db_last_dirty->dr_txg < txg) { dbuf_add_ref(db, FTAG); mutex_exit(&db->db_mtx); dbuf_will_dirty(db, tx); dbuf_rele(db, FTAG); } else { mutex_exit(&db->db_mtx); } } if (db->db_level != 0) continue; dprintf_dbuf(db, "found buf %s\n", ""); if (db->db_blkid < start || db->db_blkid > end) continue; /* found a level 0 buffer in the range */ if (dbuf_undirty(db, tx)) continue; mutex_enter(&db->db_mtx); if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_clear(db); continue; } /* The dbuf is referenced */ if (db->db_last_dirty != NULL) { dbuf_dirty_record_t *dr = db->db_last_dirty; if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. 
* Either uncache it (if it's not referenced in * the open context) or reset its contents to * empty. */ dbuf_fix_old_data(db, txg); } } /* clear the contents if it's cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); bzero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } mutex_exit(&dn->dn_dbufs_mtx); } /* * Decide whether the block behind this dbuf may be freed. The birth txg * is taken from the last dirty record if there is one, else from * db_blkptr. Returns FALSE when no birth txg is known; otherwise returns * true when the objset has no dataset or dsl_dataset_block_freeable() * agrees. The caller must hold db_mtx. */ static int dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; uint64_t birth_txg = 0; /* * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_last_dirty) birth_txg = db->db_last_dirty->dr_txg; else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; /* * If we don't exist or are in a snapshot, we can't be freed. * Don't pass the bp to dsl_dataset_block_freeable() since we * are holding the db_mtx lock and might deadlock if we are * prefetching a dedup-ed block. */ if (birth_txg) return (ds == NULL || dsl_dataset_block_freeable(ds, NULL, birth_txg)); else return (FALSE); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* XXX does *this* func really need the lock? */ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * This call to dbuf_will_dirty() with the dn_struct_rwlock held * is OK, because there can be no other references to the db * when we are changing its size, so no concurrent DB_FILL can * be happening. 
*/ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dbuf_will_dirty(db, tx); /* create the data buffer for the new block */ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); VERIFY(arc_buf_remove_ref(obuf, db) == 1); db->db.db_size = size; if (db->db_level == 0) { ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); db->db_last_dirty->dt.dl.dr_data = buf; } mutex_exit(&db->db_mtx); dnode_willuse_space(dn, size-osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os; zbookmark_t zb; DB_GET_OBJSET(&os, db); ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; zb.zb_object = db->db.db_object; zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; (void) arc_release_bp(db->db_buf, db, db->db_blkptr, os->os_spa, &zb); } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. 
*/ ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); /* * Don't set dirtyctx to SYNC if we're just modifying this as we * initialize the objset. */ if (dn->dn_dirtyctx == DN_UNDIRTIED && !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); ASSERT(dn->dn_dirtyctx_firstset == NULL); dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); } mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. */ drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); return (dr); } /* * Only valid if not already dirty. 
*/ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { /* * Update the accounting. * Note: we delay "free accounting" until after we drop * the db_mtx. This keeps us from grabbing other locks * (and possibly deadlocking) in bp_get_dsize() while * also holding the db_mtx. */ dnode_willuse_space(dn, db->db.db_size, tx); do_free_accounting = dbuf_block_freeable(db); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. 
*/ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); list_link_init(&dr->dr_dirty_node); if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; dr->dr_next = *drp; *drp = dr; /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, db->db_blkid, 1, tx); mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } else if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 
bp_get_dsize(os->os_spa, bp) : db->db.db_size; /* * This is only a guess -- if the dbuf is dirty * in a previous txg, we don't know how much * space it will use on disk yet. We should * really have the struct_rwlock to access * db_blkptr, but since this is just a guess, * it's OK if we get an odd answer. */ ddt_prefetch(os->os_spa, bp); dnode_willuse_space(dn, -willfree, tx); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level+1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level+1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* possible race with dbuf_undirty() */ if (db->db_last_dirty == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } static int dbuf_undirty(dmu_buf_impl_t *db, 
dmu_tx_t *tx) { dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); /* * If this buffer is not dirty, we're done. */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) { mutex_exit(&db->db_mtx); return (0); } ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * If this buffer is currently held, we cannot undirty * it, since one of the current holders may be in the * middle of an update. Note that users of dbuf_undirty() * should not place a hold on the dbuf before the call. */ if (refcount_count(&db->db_holds) > db->db_dirtycnt) { mutex_exit(&db->db_mtx); /* Make sure we don't toss this buffer at sync phase */ mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, db->db_blkid, 1, tx); mutex_exit(&dn->dn_mtx); DB_DNODE_EXIT(db); return (0); } dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); /* XXX would be nice to fix up dn_towrite_space[] */ *drp = dr->dr_next; if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_level+1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } DB_DNODE_EXIT(db); if (db->db_level == 0) { if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); } } else { ASSERT(db->db_buf != NULL); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); 
ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { arc_buf_t *buf = db->db_buf; ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); dbuf_set_data(db, NULL); VERIFY(arc_buf_remove_ref(buf, db) == 1); dbuf_evict(db); return (1); } mutex_exit(&db->db_mtx); return (0); } #pragma weak dmu_buf_will_dirty = dbuf_will_dirty void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_state = DB_NOFILL; dmu_buf_will_fill(db_fake, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); dbuf_noread(db); (void) dbuf_dirty(db, tx); } #pragma weak dmu_buf_fill_done = dbuf_fill_done /* ARGSUSED */ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) { mutex_enter(&db->db_mtx); DBUF_VERIFY(db); if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
*/
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	/*
	 * Preconditions: the dbuf is held, is a level-0 non-bonus data
	 * buffer, and the supplied arc buf has exactly the dbuf's size.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	/* Wait until no read or fill is in progress on this dbuf. */
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	/*
	 * Copy path: the dbuf is cached and its hold count, less one,
	 * still exceeds db_dirtycnt.  Dirty the dbuf, copy the contents
	 * of buf into the existing data, and drop our reference on buf.
	 * NOTE(review): the "- 1" presumably discounts the caller's own
	 * hold -- confirm against the callers.
	 */
	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		xuio_stat_wbuf_copied();
		return;
	}

	/* No-copy path: buf itself becomes the dbuf's data. */
	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			/*
			 * Already dirty in this txg: point the dirty record
			 * at the new buffer and drop the old one.
			 */
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			/*
			 * No dirty record refers to the current buffer, so
			 * it can be released and dropped.
			 */
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);

	/*
	 * Install buf and mark the dbuf as being filled; db_mtx is dropped
	 * before dbuf_dirty() and dbuf_fill_done() are called.
	 */
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dbuf_fill_done(db, tx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.
For callers from the DMU we will usually see: * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see: * dbuf_do_evict()->dbuf_clear();dbuf_destroy() * Sometimes, though, we will get a mix of these two: * DMU: dbuf_clear()->arc_buf_evict() * ARC: dbuf_do_evict()->dbuf_destroy() */ void dbuf_clear(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; int dbuf_gone = FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); dbuf_evict_user(db); if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); if (db->db_blkid == DMU_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; db->db_state = DB_UNCACHED; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); (void) atomic_dec_32_nv(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ dnode_rele(dn, db); db->db_dnode_handle = NULL; } else { DB_DNODE_EXIT(db); } if (db->db_buf) dbuf_gone = arc_buf_evict(db->db_buf); if (!dbuf_gone) mutex_exit(&db->db_mtx); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. 
*/ if (parent && parent != dndb) dbuf_rele(parent, db); } __attribute__((always_inline)) static inline int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh) { int nlevels, epbs; *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = &dn->dn_phys->dn_spill; else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } if (dn->dn_phys->dn_nlevels == 0) nlevels = 1; else nlevels = dn->dn_phys->dn_nlevels; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (level >= nlevels || (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (ENOENT); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err; if (dh == NULL) { err = dbuf_hold_impl(dn, level+1, blkid >> epbs, fail_sparse, NULL, parentp); } else { __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, blkid >> epbs, fail_sparse, NULL, parentp, dh->dh_depth + 1); err = __dbuf_hold_impl(dh + 1); } if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { 
objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; db->db_user_ptr = NULL; db->db_user_data_ptr_ptr = NULL; db->db_evict_func = NULL; db->db_immediate_evict = 0; db->db_freed_in_flight = 0; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1<dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before its added to the * dn_dbufs list. 
*/ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } list_insert_head(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); (void) atomic_inc_32_nv(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); return (db); } static int dbuf_do_evict(void *private) { arc_buf_t *buf = private; dmu_buf_impl_t *db = buf->b_private; if (!MUTEX_HELD(&db->db_mtx)) mutex_enter(&db->db_mtx); ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_state != DB_EVICTING) { ASSERT(db->db_state == DB_CACHED); DBUF_VERIFY(db); db->db_buf = NULL; dbuf_evict(db); } else { mutex_exit(&db->db_mtx); dbuf_destroy(db); } return (0); } static void dbuf_destroy(dmu_buf_impl_t *db) { ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_blkid != DMU_BONUS_BLKID) { /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ if (db->db_dnode_handle != NULL) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); list_remove(&dn->dn_dbufs, db); (void) atomic_dec_32_nv(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); DB_DNODE_EXIT(db); /* * Decrementing the dbuf count means that the hold * corresponding to the removed dbuf is no longer * discounted in dnode_move(), so the dnode cannot be * moved until after we release the hold. 
*/ dnode_rele(dn, db); db->db_dnode_handle = NULL; } dbuf_hash_remove(db); } db->db_parent = NULL; db->db_buf = NULL; ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } void dbuf_prefetch(dnode_t *dn, uint64_t blkid) { dmu_buf_impl_t *db = NULL; blkptr_t *bp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (dnode_block_freed(dn, blkid)) return; /* dbuf_find() returns with db_mtx held */ if ((db = dbuf_find(dn, 0, blkid))) { /* * This dbuf is already in the cache. We assume that * it is already CACHED, or else about to be either * read or filled. */ mutex_exit(&db->db_mtx); return; } if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { if (bp && !BP_IS_HOLE(bp)) { int priority = dn->dn_type == DMU_OT_DDT_ZAP ? ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; arc_buf_t *pbuf; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, 0, blkid); if (db) pbuf = db->db_buf; else pbuf = dn->dn_objset->os_phys_buf; (void) dsl_read(NULL, dn->dn_objset->os_spa, bp, pbuf, NULL, NULL, priority, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } if (db) dbuf_rele(db, NULL); } } #define DBUF_HOLD_IMPL_MAX_DEPTH 20 /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. 
*/
static int
__dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
{
	/*
	 * dh carries what would otherwise be this function's arguments and
	 * locals; nested calls made through dbuf_findbp() use dh + 1.
	 * Returns 0 with *dh->dh_dbp set to the held dbuf, or an error
	 * with *dh->dh_dbp left NULL.
	 */
	ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH);

	dh->dh_parent = NULL;

	ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
	ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);

	*(dh->dh_dbp) = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	dh->dh_db = dbuf_find(dh->dh_dn, dh->dh_level, dh->dh_blkid);

	if (dh->dh_db == NULL) {
		/* Not found: locate its block pointer and create the dbuf. */
		dh->dh_bp = NULL;

		ASSERT3P(dh->dh_parent, ==, NULL);
		dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level,
		    dh->dh_blkid, dh->dh_fail_sparse, &dh->dh_parent,
		    &dh->dh_bp, dh);
		if (dh->dh_fail_sparse) {
			/* With fail_sparse set, a hole is reported as ENOENT. */
			if (dh->dh_err == 0 && dh->dh_bp &&
			    BP_IS_HOLE(dh->dh_bp))
				dh->dh_err = ENOENT;
			if (dh->dh_err) {
				if (dh->dh_parent)
					dbuf_rele(dh->dh_parent, NULL);
				return (dh->dh_err);
			}
		}
		/* Without fail_sparse, ENOENT still falls through to create. */
		if (dh->dh_err && dh->dh_err != ENOENT)
			return (dh->dh_err);
		dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
		    dh->dh_parent, dh->dh_bp);
	}

	if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) {
		arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db);
		if (dh->dh_db->db_buf->b_data == NULL) {
			/*
			 * The arc buf has no data: clear this dbuf, drop
			 * the parent hold and start the lookup over.
			 */
			dbuf_clear(dh->dh_db);
			if (dh->dh_parent) {
				dbuf_rele(dh->dh_parent, NULL);
				dh->dh_parent = NULL;
			}
			goto top;
		}
		ASSERT3P(dh->dh_db->db.db_data, ==,
		    dh->dh_db->db_buf->b_data);
	}

	ASSERT(dh->dh_db->db_buf == NULL ||
	    arc_referenced(dh->dh_db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (dh->dh_db->db_level == 0 &&
	    dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
	    dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
	    dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
		dh->dh_dr = dh->dh_db->db_data_pending;

		if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) {
			dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db);

			dbuf_set_data(dh->dh_db,
			    arc_buf_alloc(dh->dh_dn->dn_objset->os_spa,
			    dh->dh_db->db.db_size, dh->dh_db, dh->dh_type));
			bcopy(dh->dh_dr->dt.dl.dr_data->b_data,
			    dh->dh_db->db.db_data, dh->dh_db->db.db_size);
		}
	}

	/* Take the caller's hold before dropping db_mtx. */
	(void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
	dbuf_update_data(dh->dh_db);
	DBUF_VERIFY(dh->dh_db);
	mutex_exit(&dh->dh_db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (dh->dh_parent)
		dbuf_rele(dh->dh_parent, NULL);

	ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
	ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
	ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
	*(dh->dh_dbp) = dh->dh_db;

	return (0);
}

/*
 * The following code preserves the recursive function dbuf_hold_impl()
 * but moves the local variables AND function arguments to the heap to
 * minimize the stack frame size.  Enough space is allocated up front
 * on the heap for 20 levels of recursion.
*/
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	/* One frame per possible recursion level, all allocated up front. */
	const size_t frames_size =
	    sizeof (struct dbuf_hold_impl_data) * DBUF_HOLD_IMPL_MAX_DEPTH;
	struct dbuf_hold_impl_data *frames;
	int rc;

	frames = kmem_zalloc(frames_size, KM_SLEEP);

	/* Frame 0 describes the caller's request; deeper frames follow it. */
	__dbuf_hold_impl_init(frames, dn, level, blkid, fail_sparse, tag,
	    dbp, 0);
	rc = __dbuf_hold_impl(frames);

	kmem_free(frames, frames_size);

	return (rc);
}

/*
 * Record one __dbuf_hold_impl() request (arguments plus recursion depth)
 * in the frame dh.
 */
static void
__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
    dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp, int depth)
{
	dh->dh_depth = depth;
	dh->dh_dbp = dbp;
	dh->dh_tag = tag;
	dh->dh_fail_sparse = fail_sparse;
	dh->dh_blkid = blkid;
	dh->dh_level = level;
	dh->dh_dn = dn;
}

/*
 * Hold the level-0 dbuf for blkid; returns NULL if the hold fails.
 */
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *held;

	if (dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &held) != 0)
		return (NULL);

	return (held);
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ?
NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return (ENOTSUP); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; if (blksz > SPA_MAXBLOCKSIZE) blksz = SPA_MAXBLOCKSIZE; else blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); DB_DNODE_ENTER(db); dn = DB_DNODE(db); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { VERIFY(refcount_add(&db->db_holds, tag) > 1); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ #pragma weak dmu_buf_rele = dbuf_rele void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. 
*/ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) arc_buf_freeze(db->db_buf); if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { mutex_exit(&db->db_mtx); /* * If the dnode moves here, we cannot cross this barrier * until the move completes. */ DB_DNODE_ENTER(db); (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); DB_DNODE_EXIT(db); /* * The bonus buffer's dnode hold is no longer discounted * in dnode_move(). The dnode cannot move until after * the dnode_rele(). */ dnode_rele(DB_DNODE(db), db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_evict(db); } else if (arc_released(db->db_buf)) { arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. 
*/ dbuf_set_data(db, NULL); VERIFY(arc_buf_remove_ref(buf, db) == 1); dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); if (!DBUF_IS_CACHEABLE(db)) dbuf_clear(db); else mutex_exit(&db->db_mtx); } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (refcount_count(&db->db_holds)); } void * dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) { return (dmu_buf_update_user(db_fake, NULL, user_ptr, user_data_ptr_ptr, evict_func)); } void * dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_immediate_evict = TRUE; return (dmu_buf_update_user(db_fake, NULL, user_ptr, user_data_ptr_ptr, evict_func)); } void * dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_level == 0); ASSERT((user_ptr == NULL) == (evict_func == NULL)); mutex_enter(&db->db_mtx); if (db->db_user_ptr == old_user_ptr) { db->db_user_ptr = user_ptr; db->db_user_data_ptr_ptr = user_data_ptr_ptr; db->db_evict_func = evict_func; dbuf_update_data(db); } else { old_user_ptr = db->db_user_ptr; } mutex_exit(&db->db_mtx); return (old_user_ptr); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(!refcount_is_zero(&db->db_holds)); return (db->db_user_ptr); } boolean_t dmu_buf_freeable(dmu_buf_t *dbuf) { boolean_t res = B_FALSE; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; if (db->db_blkptr) res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, db->db_blkptr, db->db_blkptr->blk_birth); return (res); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* ASSERT(dmu_tx_is_syncing(tx) */ 
ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = &dn->dn_phys->dn_spill; BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mis-match). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); (void) dbuf_hold_impl(dn, db->db_level+1, db->db_blkid >> epbs, FALSE, db, &parent); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } /* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it * is critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. 
*/ noinline static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } /* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is * critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. 
*/ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_dirty_record_t **drp; ASSERT(*datap != NULL); ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; ASSERT(dr->dr_next == NULL); ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. 
*/ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); *datap = arc_buf_alloc(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); DB_DNODE_EXIT(db); } else { /* * Although zio_nowait() does not "wait for an IO", it does * initiate the IO. If this is an empty write it seems plausible * that the IO could actually be completed before the nowait * returns. We need to DB_DNODE_EXIT() first in case * zio_nowait() invalidates the dbuf. */ DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } void dbuf_sync_list(list_t *list, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while ((dr = list_head(list))) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. 
*/ ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } list_remove(list, dr); if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT(db->db_blkptr == bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (BP_IS_HOLE(bp)) { ASSERT(bp->blk_fill == 0); DB_DNODE_EXIT(db); return; } ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { dnode_phys_t *dnp = db->db.db_data; for (i = db->db.db_size >> DNODE_SHIFT; i > 0; i--, dnp++) { if (dnp->dn_type != DMU_OT_NONE) fill++; } } else { fill = 1; } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; fill += ibp->blk_fill; } } DB_DNODE_EXIT(db); bp->blk_fill = fill; mutex_exit(&db->db_mtx); } /* ARGSUSED */ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; blkptr_t *bp = zio->io_bp; 
blkptr_t *bp_orig = &zio->io_bp_orig; uint64_t txg = zio->io_txg; dbuf_dirty_record_t **drp, *dr; ASSERT3U(zio->io_error, ==, 0); ASSERT(db->db_blkptr == bp); if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { objset_t *os; dsl_dataset_t *ds; dmu_tx_t *tx; DB_GET_OBJSET(&os, db); ds = os->os_dsl_dataset; tx = os->os_synctx; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); drp = &db->db_last_dirty; while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); DB_DNODE_EXIT(db); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); else if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); } } else { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { ASSERTV(int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); ASSERT3U(dn->dn_phys->dn_maxblkid >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 
0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); } static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_t zb; zio_prop_t zp; zio_t *zio; int wp_flag = 0; DB_DNODE_ENTER(db); dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. 
*/ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } } } if (parent != dn->dn_dbuf) { ASSERT(parent && parent->db_data_pending); ASSERT(db->db_level == parent->db_level-1); ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); zio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(zio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); DB_DNODE_EXIT(db); if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { ASSERT(db->db_state != DB_NOFILL); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, data->b_data, arc_buf_size(data), &zp, dbuf_write_override_ready, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, dbuf_write_nofill_ready, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); dr->dr_zio = arc_write(zio, os->os_spa, txg, db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, dbuf_write_ready, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, 
ZIO_FLAG_MUSTSUCCEED, &zb); } } diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index ae9d2a5e139c..c7db3d7580bc 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -1,1186 +1,1189 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 
*/ int zfs_dedup_prefetch = 1; static const ddt_ops_t *ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; static const char *ddt_class_name[DDT_CLASSES] = { "ditto", "duplicate", "unique", }; static void ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; objset_t *os = ddt->ddt_os; uint64_t *objectp = &ddt->ddt_object[type][class]; boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; char name[DDT_NAMELEN]; ddt_object_name(ddt, type, class, name); ASSERT(*objectp == 0); VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); ASSERT(*objectp != 0); VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, objectp, tx) == 0); VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), &ddt->ddt_histogram[type][class], tx) == 0); } static void ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; objset_t *os = ddt->ddt_os; uint64_t *objectp = &ddt->ddt_object[type][class]; char name[DDT_NAMELEN]; ddt_object_name(ddt, type, class, name); ASSERT(*objectp != 0); ASSERT(ddt_object_count(ddt, type, class) == 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); *objectp = 0; } static int ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; dmu_object_info_t doi; char name[DDT_NAMELEN]; int error; ddt_object_name(ddt, type, class, name); error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); if (error) return (error); error = zap_lookup(ddt->ddt_os, 
ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), &ddt->ddt_histogram[type][class]); /* * Seed the cached statistics. */ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); ddo->ddo_count = ddt_object_count(ddt, type, class); ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; ASSERT(error == 0); return (error); } static void ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, dmu_tx_t *tx) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; dmu_object_info_t doi; char name[DDT_NAMELEN]; ddt_object_name(ddt, type, class, name); VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), &ddt->ddt_histogram[type][class], tx) == 0); /* * Cache DDT statistics; this is the only time they'll change. */ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); ddo->ddo_count = ddt_object_count(ddt, type, class); ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; } static int ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_entry_t *dde) { if (!ddt_object_exists(ddt, type, class)) return (ENOENT); return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], dde)); } static void ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_entry_t *dde) { if (!ddt_object_exists(ddt, type, class)) return; ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, ddt->ddt_object[type][class], dde); } int ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, ddt->ddt_object[type][class], dde, tx)); } static int ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_entry_t *dde, 
dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, ddt->ddt_object[type][class], dde, tx)); } int ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *walk, ddt_entry_t *dde) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, ddt->ddt_object[type][class], dde, walk)); } uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, ddt->ddt_object[type][class])); } int ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, dmu_object_info_t *doi) { if (!ddt_object_exists(ddt, type, class)) return (ENOENT); return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], doi)); } boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) { return (!!ddt->ddt_object[type][class]); } void ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, char *name) { (void) sprintf(name, DMU_POOL_DDT, zio_checksum_table[ddt->ddt_checksum].ci_name, ddt_ops[type]->ddt_op_name, ddt_class_name[class]); } void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) { int d; ASSERT(txg != 0); for (d = 0; d < SPA_DVAS_PER_BP; d++) bp->blk_dva[d] = ddp->ddp_dva[d]; BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); } void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) { BP_ZERO(bp); if (ddp != NULL) ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); bp->blk_cksum = ddk->ddk_cksum; bp->blk_fill = 1; BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); BP_SET_CHECKSUM(bp, checksum); BP_SET_TYPE(bp, DMU_OT_DEDUP); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); } void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) { ddk->ddk_cksum = bp->blk_cksum; 
ddk->ddk_prop = 0; DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); } void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) { int d; ASSERT(ddp->ddp_phys_birth == 0); for (d = 0; d < SPA_DVAS_PER_BP; d++) ddp->ddp_dva[d] = bp->blk_dva[d]; ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); } void ddt_phys_clear(ddt_phys_t *ddp) { bzero(ddp, sizeof (*ddp)); } void ddt_phys_addref(ddt_phys_t *ddp) { ddp->ddp_refcnt++; } void ddt_phys_decref(ddt_phys_t *ddp) { ASSERT((int64_t)ddp->ddp_refcnt > 0); ddp->ddp_refcnt--; } void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) { blkptr_t blk; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); ddt_phys_clear(ddp); zio_free(ddt->ddt_spa, txg, &blk); } ddt_phys_t * ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) { ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; int p; for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) return (ddp); } return (NULL); } uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde) { uint64_t refcnt = 0; int p; for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) refcnt += dde->dde_phys[p].ddp_refcnt; return (refcnt); } static void ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) { spa_t *spa = ddt->ddt_spa; ddt_phys_t *ddp = dde->dde_phys; ddt_key_t *ddk = &dde->dde_key; uint64_t lsize = DDK_GET_LSIZE(ddk); uint64_t psize = DDK_GET_PSIZE(ddk); int p, d; bzero(dds, sizeof (*dds)); for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { uint64_t dsize = 0; uint64_t refcnt = ddp->ddp_refcnt; if (ddp->ddp_phys_birth == 0) continue; for (d = 0; d < SPA_DVAS_PER_BP; d++) dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); dds->dds_blocks += 1; dds->dds_lsize += lsize; dds->dds_psize += psize; dds->dds_dsize += dsize; dds->dds_ref_blocks += refcnt; dds->dds_ref_lsize += lsize * refcnt; dds->dds_ref_psize += psize * 
refcnt; dds->dds_ref_dsize += dsize * refcnt; } } void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) { const uint64_t *s = (const uint64_t *)src; uint64_t *d = (uint64_t *)dst; uint64_t *d_end = (uint64_t *)(dst + 1); ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ while (d < d_end) *d++ += (*s++ ^ neg) - neg; } static void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) { ddt_stat_t dds; ddt_histogram_t *ddh; int bucket; ddt_stat_generate(ddt, dde, &dds); bucket = highbit(dds.dds_ref_blocks) - 1; ASSERT(bucket >= 0); ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); } void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) { int h; for (h = 0; h < 64; h++) ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); } void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) { int h; bzero(dds, sizeof (*dds)); for (h = 0; h < 64; h++) ddt_stat_add(dds, &ddh->ddh_stat[h], 0); } boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh) { const uint64_t *s = (const uint64_t *)ddh; const uint64_t *s_end = (const uint64_t *)(ddh + 1); while (s < s_end) if (*s++ != 0) return (B_FALSE); return (B_TRUE); } void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) { enum zio_checksum c; enum ddt_type type; enum ddt_class class; /* Sum the statistics we cached in ddt_object_sync(). */ for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; ddo_total->ddo_count += ddo->ddo_count; ddo_total->ddo_dspace += ddo->ddo_dspace; ddo_total->ddo_mspace += ddo->ddo_mspace; } } } /* ... and compute the averages. 
*/ if (ddo_total->ddo_count != 0) { ddo_total->ddo_dspace /= ddo_total->ddo_count; ddo_total->ddo_mspace /= ddo_total->ddo_count; } } void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) { enum zio_checksum c; enum ddt_type type; enum ddt_class class; for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { ddt_histogram_add(ddh, &ddt->ddt_histogram_cache[type][class]); } } } } void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) { ddt_histogram_t *ddh_total; + /* XXX: Move to a slab */ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh_total); ddt_histogram_stat(dds_total, ddh_total); kmem_free(ddh_total, sizeof (ddt_histogram_t)); } uint64_t ddt_get_dedup_dspace(spa_t *spa) { ddt_stat_t dds_total = { 0 }; ddt_get_dedup_stats(spa, &dds_total); return (dds_total.dds_ref_dsize - dds_total.dds_dsize); } uint64_t ddt_get_pool_dedup_ratio(spa_t *spa) { ddt_stat_t dds_total = { 0 }; ddt_get_dedup_stats(spa, &dds_total); if (dds_total.dds_dsize == 0) return (100); return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); } int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) { spa_t *spa = ddt->ddt_spa; uint64_t total_refcnt = 0; uint64_t ditto = spa->spa_dedup_ditto; int total_copies = 0; int desired_copies = 0; int p; for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *zio = dde->dde_lead_zio[p]; uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ if (zio != NULL) refcnt += zio->io_parent_count; /* pending refs */ if (ddp == ddp_willref) refcnt++; /* caller's ref */ if (refcnt != 0) { total_refcnt += refcnt; total_copies += p; } } if (ditto == 0 || ditto > UINT32_MAX) ditto = UINT32_MAX; if (total_refcnt >= 1) desired_copies++; if (total_refcnt >= ditto) desired_copies++; if (total_refcnt >= ditto * ditto) 
desired_copies++; return (MAX(desired_copies, total_copies) - total_copies); } int ddt_ditto_copies_present(ddt_entry_t *dde) { ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; dva_t *dva = ddp->ddp_dva; int copies = 0 - DVA_GET_GANG(dva); int d; for (d = 0; d < SPA_DVAS_PER_BP; d++, dva++) if (DVA_IS_VALID(dva)) copies++; ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); return (copies); } size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) { uchar_t *version = dst++; int cpfunc = ZIO_COMPRESS_ZLE; zio_compress_info_t *ci = &zio_compress_table[cpfunc]; size_t c_len; ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); if (c_len == s_len) { cpfunc = ZIO_COMPRESS_OFF; bcopy(src, dst, s_len); } *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; return (c_len + 1); } void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) { uchar_t version = *src++; int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; zio_compress_info_t *ci = &zio_compress_table[cpfunc]; if (ci->ci_decompress != NULL) (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); else bcopy(src, dst, d_len); if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) byteswap_uint64_array(dst, d_len); } ddt_t * ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) { return (spa->spa_ddt[c]); } ddt_t * ddt_select(spa_t *spa, const blkptr_t *bp) { return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); } void ddt_enter(ddt_t *ddt) { mutex_enter(&ddt->ddt_lock); } void ddt_exit(ddt_t *ddt) { mutex_exit(&ddt->ddt_lock); } static ddt_entry_t * ddt_alloc(const ddt_key_t *ddk) { ddt_entry_t *dde; + /* XXX: Move to a slab */ dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; return (dde); } static void ddt_free(ddt_entry_t *dde) { int p; ASSERT(!dde->dde_loading); for (p = 0; p < DDT_PHYS_TYPES; p++) 
ASSERT(dde->dde_lead_zio[p] == NULL); if (dde->dde_repair_data != NULL) zio_buf_free(dde->dde_repair_data, DDK_GET_PSIZE(&dde->dde_key)); cv_destroy(&dde->dde_cv); kmem_free(dde, sizeof (*dde)); } void ddt_remove(ddt_t *ddt, ddt_entry_t *dde) { ASSERT(MUTEX_HELD(&ddt->ddt_lock)); avl_remove(&ddt->ddt_tree, dde); ddt_free(dde); } ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { ddt_entry_t *dde, dde_search; enum ddt_type type; enum ddt_class class; avl_index_t where; int error; ASSERT(MUTEX_HELD(&ddt->ddt_lock)); ddt_key_fill(&dde_search.dde_key, bp); dde = avl_find(&ddt->ddt_tree, &dde_search, &where); if (dde == NULL) { if (!add) return (NULL); dde = ddt_alloc(&dde_search.dde_key); avl_insert(&ddt->ddt_tree, dde, where); } while (dde->dde_loading) cv_wait(&dde->dde_cv, &ddt->ddt_lock); if (dde->dde_loaded) return (dde); dde->dde_loading = B_TRUE; ddt_exit(ddt); error = ENOENT; for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { error = ddt_object_lookup(ddt, type, class, dde); if (error != ENOENT) break; } if (error != ENOENT) break; } ASSERT(error == 0 || error == ENOENT); ddt_enter(ddt); ASSERT(dde->dde_loaded == B_FALSE); ASSERT(dde->dde_loading == B_TRUE); dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ dde->dde_loaded = B_TRUE; dde->dde_loading = B_FALSE; if (error == 0) ddt_stat_update(ddt, dde, -1ULL); cv_broadcast(&dde->dde_cv); return (dde); } void ddt_prefetch(spa_t *spa, const blkptr_t *bp) { ddt_t *ddt; ddt_entry_t dde; enum ddt_type type; enum ddt_class class; if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) return; /* * We only remove the DDT once all tables are empty and only * prefetch dedup blocks when there are entries in the DDT. * Thus no locking is required as the DDT can't disappear on us. 
*/ ddt = ddt_select(spa, bp); ddt_key_fill(&dde.dde_key, bp); for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { ddt_object_prefetch(ddt, type, class, &dde); } } } int ddt_entry_compare(const void *x1, const void *x2) { const ddt_entry_t *dde1 = x1; const ddt_entry_t *dde2 = x2; const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; int i; for (i = 0; i < DDT_KEY_WORDS; i++) { if (u1[i] < u2[i]) return (-1); if (u1[i] > u2[i]) return (1); } return (0); } static ddt_t * ddt_table_alloc(spa_t *spa, enum zio_checksum c) { ddt_t *ddt; - ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); + /* XXX: Move to a slab */ + ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP | KM_NODEBUG); mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&ddt->ddt_tree, ddt_entry_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; return (ddt); } static void ddt_table_free(ddt_t *ddt) { ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); avl_destroy(&ddt->ddt_tree); avl_destroy(&ddt->ddt_repair_tree); mutex_destroy(&ddt->ddt_lock); kmem_free(ddt, sizeof (*ddt)); } void ddt_create(spa_t *spa) { enum zio_checksum c; spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) spa->spa_ddt[c] = ddt_table_alloc(spa, c); } int ddt_load(spa_t *spa) { enum zio_checksum c; enum ddt_type type; enum ddt_class class; int error; ddt_create(spa); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object); if (error) return (error == ENOENT ? 
0 : error); for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { error = ddt_object_load(ddt, type, class); if (error != 0 && error != ENOENT) return (error); } } /* * Seed the cached histograms. */ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, sizeof (ddt->ddt_histogram)); } return (0); } void ddt_unload(spa_t *spa) { enum zio_checksum c; for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { if (spa->spa_ddt[c]) { ddt_table_free(spa->spa_ddt[c]); spa->spa_ddt[c] = NULL; } } } boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) { ddt_t *ddt; ddt_entry_t dde; enum ddt_type type; enum ddt_class class; if (!BP_GET_DEDUP(bp)) return (B_FALSE); if (max_class == DDT_CLASS_UNIQUE) return (B_TRUE); ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; ddt_key_fill(&dde.dde_key, bp); for (type = 0; type < DDT_TYPES; type++) for (class = 0; class <= max_class; class++) if (ddt_object_lookup(ddt, type, class, &dde) == 0) return (B_TRUE); return (B_FALSE); } ddt_entry_t * ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) { ddt_key_t ddk; ddt_entry_t *dde; enum ddt_type type; enum ddt_class class; ddt_key_fill(&ddk, bp); dde = ddt_alloc(&ddk); for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { /* * We can only do repair if there are multiple copies * of the block. For anything in the UNIQUE class, * there's definitely only one copy, so don't even try. 
*/ if (class != DDT_CLASS_UNIQUE && ddt_object_lookup(ddt, type, class, dde) == 0) return (dde); } } bzero(dde->dde_phys, sizeof (dde->dde_phys)); return (dde); } void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) { avl_index_t where; ddt_enter(ddt); if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else ddt_free(dde); ddt_exit(ddt); } static void ddt_repair_entry_done(zio_t *zio) { ddt_entry_t *rdde = zio->io_private; ddt_free(rdde); } static void ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) { ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *rddp = rdde->dde_phys; ddt_key_t *ddk = &dde->dde_key; ddt_key_t *rddk = &rdde->dde_key; zio_t *zio; blkptr_t blk; int p; zio = zio_null(rio, rio->io_spa, NULL, ddt_repair_entry_done, rdde, rio->io_flags); for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth != rddp->ddp_phys_birth || bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); } zio_nowait(zio); } static void ddt_repair_table(ddt_t *ddt, zio_t *rio) { spa_t *spa = ddt->ddt_spa; ddt_entry_t *dde, *rdde_next, *rdde; avl_tree_t *t = &ddt->ddt_repair_tree; blkptr_t blk; if (spa_sync_pass(spa) > 1) return; ddt_enter(ddt); for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { rdde_next = AVL_NEXT(t, rdde); avl_remove(&ddt->ddt_repair_tree, rdde); ddt_exit(ddt); ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); dde = ddt_repair_start(ddt, &blk); ddt_repair_entry(ddt, dde, rdde, rio); ddt_repair_done(ddt, dde); ddt_enter(ddt); } ddt_exit(ddt); } static void ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) { 
dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; ddt_phys_t *ddp = dde->dde_phys; ddt_key_t *ddk = &dde->dde_key; enum ddt_type otype = dde->dde_type; enum ddt_type ntype = DDT_TYPE_CURRENT; enum ddt_class oclass = dde->dde_class; enum ddt_class nclass; uint64_t total_refcnt = 0; int p; ASSERT(dde->dde_loaded); ASSERT(!dde->dde_loading); for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ASSERT(dde->dde_lead_zio[p] == NULL); ASSERT((int64_t)ddp->ddp_refcnt >= 0); if (ddp->ddp_phys_birth == 0) { ASSERT(ddp->ddp_refcnt == 0); continue; } if (p == DDT_PHYS_DITTO) { if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) ddt_phys_free(ddt, ddk, ddp, txg); continue; } if (ddp->ddp_refcnt == 0) ddt_phys_free(ddt, ddk, ddp, txg); total_refcnt += ddp->ddp_refcnt; } if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) nclass = DDT_CLASS_DITTO; else if (total_refcnt > 1) nclass = DDT_CLASS_DUPLICATE; else nclass = DDT_CLASS_UNIQUE; if (otype != DDT_TYPES && (otype != ntype || oclass != nclass || total_refcnt == 0)) { VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); } if (total_refcnt != 0) { dde->dde_type = ntype; dde->dde_class = nclass; ddt_stat_update(ddt, dde, 0); if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); /* * If the class changes, the order that we scan this bp * changes. If it decreases, we could miss it, so * scan it right now. (This covers both class changing * while we are doing ddt_walk(), and when we are * traversing.) 
*/ if (nclass < oclass) { dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, dde, tx); } } } static void ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) { spa_t *spa = ddt->ddt_spa; ddt_entry_t *dde; void *cookie = NULL; enum ddt_type type; enum ddt_class class; if (avl_numnodes(&ddt->ddt_tree) == 0) return; ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object, tx) == 0); } while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { ddt_sync_entry(ddt, dde, tx, txg); ddt_free(dde); } for (type = 0; type < DDT_TYPES; type++) { uint64_t count = 0; for (class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); count += ddt_object_count(ddt, type, class); } } for (class = 0; class < DDT_CLASSES; class++) { if (count == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } } bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, sizeof (ddt->ddt_histogram)); } void ddt_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); enum zio_checksum c; ASSERT(spa_syncing_txg(spa) == txg); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) continue; ddt_sync_table(ddt, tx, txg); ddt_repair_table(ddt, rio); } (void) zio_wait(rio); dmu_tx_commit(tx); } int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) { do { do { do { ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; int error = ENOENT; if (ddt_object_exists(ddt, ddb->ddb_type, ddb->ddb_class)) { error = ddt_object_walk(ddt, ddb->ddb_type, ddb->ddb_class, &ddb->ddb_cursor, dde); } 
dde->dde_type = ddb->ddb_type; dde->dde_class = ddb->ddb_class; if (error == 0) return (0); if (error != ENOENT) return (error); ddb->ddb_cursor = 0; } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); ddb->ddb_checksum = 0; } while (++ddb->ddb_type < DDT_TYPES); ddb->ddb_type = 0; } while (++ddb->ddb_class < DDT_CLASSES); return (ENOENT); } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index f13cfd316f90..ae0b36fc6380 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1,1609 +1,1609 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static char *dmu_recv_tag = "dmu_recv_tag"; /* * The list of data whose inclusion in a send stream can be pending from * one call to backup_cb to another. Multiple calls to dump_free() and * dump_freeobjects() can be aggregated into a single DRR_FREE or * DRR_FREEOBJECTS replay record. 
*/ typedef enum { PENDING_NONE, PENDING_FREE, PENDING_FREEOBJECTS } pendop_t; struct backuparg { dmu_replay_record_t *drr; vnode_t *vp; offset_t *off; objset_t *os; zio_cksum_t zc; uint64_t toguid; int err; pendop_t pending_op; }; static int dump_bytes(struct backuparg *ba, void *buf, int len) { ssize_t resid; /* have to get resid to get detailed errno */ ASSERT3U(len % 8, ==, 0); fletcher_4_incremental_native(buf, len, &ba->zc); ba->err = vn_rdwr(UIO_WRITE, ba->vp, (caddr_t)buf, len, 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); *ba->off += len; return (ba->err); } static int dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, uint64_t length) { struct drr_free *drrf = &(ba->drr->drr_u.drr_free); /* * If there is a pending op, but it's not PENDING_FREE, push it out, * since free block aggregation can only be done for blocks of the * same type (i.e., DRR_FREE records can only be aggregated with * other DRR_FREE records. DRR_FREEOBJECTS records can only be * aggregated with other DRR_FREEOBJECTS records. */ if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) { if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); ba->pending_op = PENDING_NONE; } if (ba->pending_op == PENDING_FREE) { /* * There should never be a PENDING_FREE if length is -1 * (because dump_dnode is the only place where this * function is called with a -1, and only after flushing * any pending record). */ ASSERT(length != -1ULL); /* * Check to see whether this free block can be aggregated * with pending one. */ if (drrf->drr_object == object && drrf->drr_offset + drrf->drr_length == offset) { drrf->drr_length += length; return (0); } else { /* not a continuation. 
Push out pending record */ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); ba->pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; drrf->drr_length = length; drrf->drr_toguid = ba->toguid; if (length == -1ULL) { if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); } else { ba->pending_op = PENDING_FREE; } return (0); } static int dump_data(struct backuparg *ba, dmu_object_type_t type, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) { struct drr_write *drrw = &(ba->drr->drr_u.drr_write); /* * If there is any kind of pending aggregation (currently either * a grouping of free objects or free blocks), push it out to * the stream, since aggregation can't be done across operations * of different types. */ if (ba->pending_op != PENDING_NONE) { if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); ba->pending_op = PENDING_NONE; } /* write a DATA record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = ba->toguid; drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); drrw->drr_key.ddk_cksum = bp->blk_cksum; if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); if (dump_bytes(ba, data, blksz) != 0) return (EINTR); return (0); } static int dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data) { struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill); if (ba->pending_op != 
PENDING_NONE) { if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); ba->pending_op = PENDING_NONE; } /* write a SPILL record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; drrs->drr_toguid = ba->toguid; if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) return (EINTR); if (dump_bytes(ba, data, blksz)) return (EINTR); return (0); } static int dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects); /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for * blocks of the same type (i.e., DRR_FREE records can only be * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records * can only be aggregated with other DRR_FREEOBJECTS records. */ if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREEOBJECTS) { if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); ba->pending_op = PENDING_NONE; } if (ba->pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one */ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { drrfo->drr_numobjs += numobjs; return (0); } else { /* can't be aggregated. 
Push out pending record */ if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); ba->pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; drrfo->drr_toguid = ba->toguid; ba->pending_op = PENDING_FREEOBJECTS; return (0); } static int dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(ba->drr->drr_u.drr_object); if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(ba, object, 1)); if (ba->pending_op != PENDING_NONE) { if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); ba->pending_op = PENDING_NONE; } /* write an OBJECT record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; drro->drr_bonuslen = dnp->dn_bonuslen; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; drro->drr_toguid = ba->toguid; if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) return (EINTR); /* free anything past the end of the file */ if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) return (EINTR); if (ba->err) return (EINTR); return (0); } #define BP_SPAN(dnp, level) \ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) /* ARGSUSED */ static int backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct backuparg *ba = arg; dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); } else if (bp == NULL) { uint64_t span = BP_SPAN(dnp, zb->zb_level); err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { dnode_phys_t *blk; int i; int blksz = BP_GET_LSIZE(bp); uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; if (dsl_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (EIO); blk = abuf->b_data; for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; err = dump_dnode(ba, dnobj, blk+i); if (err) break; } (void) arc_buf_remove_ref(abuf, &abuf); } else if (type == DMU_OT_SA) { uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); if (arc_read_nolock(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (EIO); err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } else { /* it's a level-0 block of a regular object */ uint32_t aflags = ARC_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); if (dsl_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (EIO); err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, blksz, bp, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); return (err); }

/*
 * Write a send stream for snapshot 'tosnap' to vnode 'vp'.
 *
 * tosnap     - objset of the snapshot being sent; must be a snapshot
 * fromsnap   - optional earlier snapshot of the same filesystem; when
 *              non-NULL the stream only covers blocks born after it
 * fromorigin - send relative to the clone origin of tosnap; may not be
 *              combined with fromsnap, and is silently cleared when
 *              tosnap's dsl_dir is not a clone
 * vp         - vnode the records are appended to
 * off        - running byte offset, advanced by dump_bytes()
 *
 * The stream is a DRR_BEGIN record, the records produced by backup_cb()
 * while traversing the dataset, any still-pending aggregated FREE or
 * FREEOBJECTS record, and a DRR_END record carrying the running
 * checksum.
 *
 * Returns 0 on success.  Returns EINVAL when tosnap is not a snapshot,
 * when fromorigin and fromsnap are both given, or when the ZPL version
 * cannot be read; EXDEV when fromsnap is not an earlier snapshot in the
 * same dsl_dir; otherwise the error from holding the origin, from
 * traverse_dataset(), or from the underlying write (ba.err).
 */
int
dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    vnode_t *vp, offset_t *off)
{
	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
	dmu_replay_record_t *drr;
	struct backuparg ba;
	int err;
	uint64_t fromtxg = 0;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
	if (fromds && (ds->ds_dir != fromds->ds_dir ||
	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
		return (EXDEV);

	if (fromorigin) {
		dsl_pool_t *dp = ds->ds_dir->dd_pool;

		if (fromsnap)
			return (EINVAL);

		if (dsl_dir_is_clone(ds->ds_dir)) {
			/*
			 * From here on, fromorigin == B_TRUE means we own
			 * an FTAG hold on fromds that must be released on
			 * every exit path.
			 */
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
			rw_exit(&dp->dp_config_rwlock);
			if (err)
				return (err);
		} else {
			fromorigin = B_FALSE;
		}
	}


	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) {
			/*
			 * Do not leak the record buffer or the origin
			 * hold taken above when bailing out early.
			 */
			kmem_free(drr, sizeof (dmu_replay_record_t));
			if (fromorigin)
				dsl_dataset_rele(fromds, FTAG);
			return (EINVAL);
		}
		if (version == ZPL_VERSION_SA) {
			DMU_SET_FEATUREFLAGS(
			    drr->drr_u.drr_begin.drr_versioninfo,
			    DMU_BACKUP_FEATURE_SA_SPILL);
		}
	}
#endif

	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
	if (fromorigin)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromds)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	if (fromds)
		fromtxg = fromds->ds_phys->ds_creation_txg;
	/* fromds is not referenced past this point. */
	if (fromorigin)
		dsl_dataset_rele(fromds, FTAG);

	ba.drr = drr;
	ba.vp = vp;
	ba.os = tosnap;
	ba.off = off;
	ba.toguid = ds->ds_phys->ds_guid;
	ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
	ba.pending_op = PENDING_NONE;

	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (ba.err);
	}

	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    backup_cb, &ba);

	/* Flush an aggregated FREE/FREEOBJECTS record left pending. */
	if (ba.pending_op != PENDING_NONE)
		if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0)
			err = EINTR;

	if (err) {
		/* Prefer the detailed write error over the generic EINTR. */
		if (err == EINTR && ba.err)
			err = ba.err;
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (err);
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = ba.zc;
	drr->drr_u.drr_end.drr_toguid = ba.toguid;

	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (ba.err);
	}

	kmem_free(drr, sizeof (dmu_replay_record_t));

	return (0);
}

struct recvbeginsyncarg { const char *tofs; const char *tosnap; dsl_dataset_t *origin; uint64_t fromguid; dmu_objset_type_t type; void *tag; boolean_t force; uint64_t dsflags; char clonelastname[MAXNAMELEN]; dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ cred_t *cr; }; /* ARGSUSED */ static int recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct recvbeginsyncarg *rbsa = arg2; objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t val; int err; err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); if (err != ENOENT) return (err ?
err : EEXIST); if (rbsa->origin) { /* make sure it's a snap in the same pool */ if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) return (EXDEV); if (!dsl_dataset_is_snapshot(rbsa->origin)) return (EINVAL); if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) return (ENODEV); } return (0); } static void recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct recvbeginsyncarg *rbsa = arg2; uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; /* Create and open new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, rbsa->origin, flags, rbsa->cr, tx); VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, B_TRUE, dmu_recv_tag, &rbsa->ds)); if (rbsa->origin == NULL) { (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); } spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); } /* ARGSUSED */ static int recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct recvbeginsyncarg *rbsa = arg2; int err; uint64_t val; /* must not have any changes since most recent snapshot */ if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) return (ETXTBSY); /* new snapshot name must not exist */ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); if (err == 0) return (EEXIST); if (err != ENOENT) return (err); if (rbsa->fromguid) { /* if incremental, most recent snapshot must match fromguid */ if (ds->ds_prev == NULL) return (ENODEV); /* * most recent snapshot must match fromguid, or there are no * changes since the fromguid one */ if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; while (obj != 0) { dsl_dataset_t *snap; err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, obj, FTAG, &snap); if (err) return 
(ENODEV); if (snap->ds_phys->ds_creation_txg < birth) { dsl_dataset_rele(snap, FTAG); return (ENODEV); } if (snap->ds_phys->ds_guid == rbsa->fromguid) { dsl_dataset_rele(snap, FTAG); break; /* it's ok */ } obj = snap->ds_phys->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); } if (obj == 0) return (ENODEV); } } else { /* if full, most recent snapshot must be $ORIGIN */ if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) return (ENODEV); } /* temporary clone name must not exist */ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_dir->dd_phys->dd_child_dir_zapobj, rbsa->clonelastname, 8, 1, &val); if (err == 0) return (EEXIST); if (err != ENOENT) return (err); return (0); } /* ARGSUSED */ static void recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ohds = arg1; struct recvbeginsyncarg *rbsa = arg2; dsl_pool_t *dp = ohds->ds_dir->dd_pool; dsl_dataset_t *cds; uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; /* create and open the temporary clone */ dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, ohds->ds_prev, flags, rbsa->cr, tx); VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); /* * If we actually created a non-clone, we need to create the * objset in our new dataset. 
*/ if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { (void) dmu_objset_create_impl(dp->dp_spa, cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); } rbsa->ds = cds; spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, dp->dp_spa, tx, "dataset = %lld", dsobj); } static boolean_t dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) { int featureflags; featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); /* Verify pool version supports SA if SA_SPILL feature set */ return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); } /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) { int err = 0; boolean_t byteswap; struct recvbeginsyncarg rbsa = { 0 }; uint64_t versioninfo; int flags; dsl_dataset_t *ds; if (drrb->drr_magic == DMU_BACKUP_MAGIC) byteswap = FALSE; else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) byteswap = TRUE; else return (EINVAL); rbsa.tofs = tofs; rbsa.tosnap = tosnap; rbsa.origin = origin ? origin->os_dsl_dataset : NULL; rbsa.fromguid = drrb->drr_fromguid; rbsa.type = drrb->drr_type; rbsa.tag = FTAG; rbsa.dsflags = 0; rbsa.cr = CRED(); versioninfo = drrb->drr_versioninfo; flags = drrb->drr_flags; if (byteswap) { rbsa.type = BSWAP_32(rbsa.type); rbsa.fromguid = BSWAP_64(rbsa.fromguid); versioninfo = BSWAP_64(versioninfo); flags = BSWAP_32(flags); } if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || rbsa.type >= DMU_OST_NUMTYPES || ((flags & DRR_FLAG_CLONE) && origin == NULL)) return (EINVAL); if (flags & DRR_FLAG_CI_DATA) rbsa.dsflags = DS_FLAG_CI_DATASET; bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drrb = drrb; drc->drc_tosnap = tosnap; drc->drc_top_ds = top_ds; drc->drc_force = force; /* * Process the begin in syncing context. 
*/ /* open the dataset we are logically receiving into */ err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); if (err == 0) { if (dmu_recv_verify_features(ds, drrb)) { dsl_dataset_rele(ds, dmu_recv_tag); return (ENOTSUP); } /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ if (flags & DRR_FLAG_CLONE) { dsl_dataset_rele(ds, dmu_recv_tag); return (EINVAL); } /* must not have an incremental recv already in progress */ if (!mutex_tryenter(&ds->ds_recvlock)) { dsl_dataset_rele(ds, dmu_recv_tag); return (EBUSY); } /* tmp clone name is: tofs/%tosnap" */ (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), "%%%s", tosnap); rbsa.force = force; err = dsl_sync_task_do(ds->ds_dir->dd_pool, recv_existing_check, recv_existing_sync, ds, &rbsa, 5); if (err) { mutex_exit(&ds->ds_recvlock); dsl_dataset_rele(ds, dmu_recv_tag); return (err); } drc->drc_logical_ds = ds; drc->drc_real_ds = rbsa.ds; } else if (err == ENOENT) { /* target fs does not exist; must be a full backup or clone */ char *cp; /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. 
*/ if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) return (ENOENT); /* Open the parent of tofs */ cp = strrchr(tofs, '/'); *cp = '\0'; err = dsl_dataset_hold(tofs, FTAG, &ds); *cp = '/'; if (err) return (err); if (dmu_recv_verify_features(ds, drrb)) { dsl_dataset_rele(ds, FTAG); return (ENOTSUP); } err = dsl_sync_task_do(ds->ds_dir->dd_pool, recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); dsl_dataset_rele(ds, FTAG); if (err) return (err); drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; drc->drc_newfs = B_TRUE; } return (err); } struct restorearg { int err; int byteswap; vnode_t *vp; char *buf; uint64_t voff; int bufsize; /* amount of memory allocated for buf */ zio_cksum_t cksum; avl_tree_t *guid_to_ds_map; }; typedef struct guid_map_entry { uint64_t guid; dsl_dataset_t *gme_ds; avl_node_t avlnode; } guid_map_entry_t; static int guid_compare(const void *arg1, const void *arg2) { const guid_map_entry_t *gmep1 = arg1; const guid_map_entry_t *gmep2 = arg2; if (gmep1->guid < gmep2->guid) return (-1); else if (gmep1->guid > gmep2->guid) return (1); return (0); } /* * This function is a callback used by dmu_objset_find() (which * enumerates the object sets) to build an avl tree that maps guids * to datasets. The resulting table is used when processing DRR_WRITE_BYREF * send stream records. These records, which are used in dedup'ed * streams, do not contain data themselves, but refer to a copy * of the data block that has already been written because it was * earlier in the stream. That previous copy is identified by the * guid of the dataset with the referenced data. 
*/
int
find_ds_by_guid(const char *name, void *arg)
{
	avl_tree_t *guid_map = arg;
	dsl_dataset_t *ds, *snapds;
	guid_map_entry_t *gmep;
	dsl_pool_t *dp;
	int err;
	uint64_t lastobj, firstobj;

	/* A dataset that cannot be held is ignored, not reported. */
	if (dsl_dataset_hold(name, FTAG, &ds) != 0)
		return (0);

	dp = ds->ds_dir->dd_pool;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	/*
	 * Walk the snapshot chain backwards from the most recent
	 * snapshot, stopping at the dsl_dir's origin object.
	 */
	firstobj = ds->ds_dir->dd_phys->dd_origin_obj;
	lastobj = ds->ds_phys->ds_prev_snap_obj;

	while (lastobj != firstobj) {
		/*
		 * The hold is tagged with the tree pointer itself;
		 * free_guid_map_onexit() releases it with the same tag.
		 */
		err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds);
		if (err) {
			/*
			 * Stop walking this dataset's snapshot chain and
			 * return success; entries already added stay in
			 * the map.  It's not clear why this would ever
			 * happen, but the remainder of the snapshot stream
			 * can be processed.
			 */
			rw_exit(&dp->dp_config_rwlock);
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
		gmep->guid = snapds->ds_phys->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
		lastobj = snapds->ds_phys->ds_prev_snap_obj;
	}

	rw_exit(&dp->dp_config_rwlock);
	dsl_dataset_rele(ds, FTAG);

	return (0);
}

/*
 * Tear down a guid map built by find_ds_by_guid(): release the hold on
 * every mapped dataset (tag is the tree pointer), free each entry, then
 * destroy and free the tree itself.
 *
 * arg - the avl_tree_t to tear down
 */
static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_rele(gmep->gme_ds, ca);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}

/*
 * Read exactly 'len' bytes of the stream into ra->buf, advancing
 * ra->voff and folding the data into the running checksum ra->cksum
 * (byteswapped or native, per ra->byteswap).
 *
 * ra  - restore state; ra->err is overwritten by every read attempt
 * len - number of bytes wanted; asserted to be a multiple of 8
 *
 * Returns ra->buf on success, or NULL with ra->err set on failure.  A
 * read that makes no progress at all is reported as EINVAL.
 */
static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT3U(len % 8, ==, 0);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		/* Nothing was transferred: treat as a truncated stream. */
		if (resid == len - done)
			ra->err = EINVAL;
		/* Advance by the bytes actually read in this pass. */
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}

noinline static void
backup_byteswap(dmu_replay_record_t *drr) { #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; case DRR_OBJECT: DO64(drr_object.drr_object); /* DO64(drr_object.drr_allocation_txg); */ DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); DO64(drr_object.drr_toguid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_length); DO64(drr_write.drr_toguid); DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); DO64(drr_write.drr_key.ddk_prop); break; case DRR_WRITE_BYREF: DO64(drr_write_byref.drr_object); DO64(drr_write_byref.drr_offset); DO64(drr_write_byref.drr_length); DO64(drr_write_byref.drr_toguid); DO64(drr_write_byref.drr_refguid); DO64(drr_write_byref.drr_refobject); DO64(drr_write_byref.drr_refoffset); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); DO64(drr_write_byref.drr_key.ddk_prop); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); DO64(drr_free.drr_toguid); break; case DRR_SPILL: DO64(drr_spill.drr_object); DO64(drr_spill.drr_length); 
DO64(drr_spill.drr_toguid); break; case DRR_END: DO64(drr_end.drr_checksum.zc_word[0]); DO64(drr_end.drr_checksum.zc_word[1]); DO64(drr_end.drr_checksum.zc_word[2]); DO64(drr_end.drr_checksum.zc_word[3]); DO64(drr_end.drr_toguid); break; default: break; } #undef DO64 #undef DO32 } noinline static int restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) { int err; dmu_tx_t *tx; void *data = NULL; if (drro->drr_type == DMU_OT_NONE || drro->drr_type >= DMU_OT_NUMTYPES || drro->drr_bonustype >= DMU_OT_NUMTYPES || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > SPA_MAXBLOCKSIZE || drro->drr_bonuslen > DN_MAX_BONUSLEN) { return (EINVAL); } err = dmu_object_info(os, drro->drr_object, NULL); if (err != 0 && err != ENOENT) return (EINVAL); if (drro->drr_bonuslen) { data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); if (ra->err) return (ra->err); } if (err == ENOENT) { /* currently free, want to be allocated */ tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } err = dmu_object_claim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); dmu_tx_commit(tx); } else { /* currently allocated, want to be allocated */ err = dmu_object_reclaim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen); } if (err) { return (EINVAL); } tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, drro->drr_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, tx); dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { dmu_buf_t *db; VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); 
dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); return (0); }

/*
 * Handle a DRR_FREEOBJECTS record: free every allocated object in
 * [drr_firstobj, drr_firstobj + drr_numobjs).
 *
 * Returns EINVAL when the range wraps around, the first error from
 * dmu_free_object(), or 0.
 */
/* ARGSUSED */
noinline static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	/* Reject a range whose end overflows uint64_t. */
	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (EINVAL);

	/*
	 * The loop is advanced by dmu_object_next(), whose return value
	 * is deliberately ignored; the range test ends the loop.
	 */
	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		/* Objects that cannot be looked up are skipped. */
		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_object(os, obj);
		if (err)
			return (err);
	}
	return (0);
}

/*
 * Handle a DRR_WRITE record: read drr_length bytes of payload from the
 * stream and write them to the object at drr_offset.
 *
 * Returns EINVAL when offset + length wraps, the type is out of range,
 * or the object does not exist; ra->err when the payload cannot be
 * read; the dmu_tx_assign() error; or 0.
 *
 * NOTE(review): drr_length is passed to restore_read() (which takes an
 * int and fills ra->buf) without being compared against ra->bufsize
 * here -- confirm that an oversized length cannot reach this point.
 */
noinline static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    drrw->drr_type >= DMU_OT_NUMTYPES)
		return (EINVAL);

	/*
	 * The payload is consumed from the stream before the object is
	 * validated.
	 */
	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	/* Byteswap the payload in place using the object type's routine. */
	if (ra->byteswap)
		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
*/
/*
 * ra     - restore state; ra->guid_to_ds_map is consulted when the
 *          referenced data lives in a different dataset
 * os     - objset being written
 * drrwbr - the DRR_WRITE_BYREF record
 *
 * Returns EINVAL when offset + length wraps, when no guid map is
 * available, or when the referenced dataset cannot be found or opened;
 * the error from dmu_buf_hold() or dmu_tx_assign(); or 0.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (EINVAL);

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		/*
		 * The map is only set up for streams that advertise the
		 * dedup feature; a cross-dataset reference without one
		 * is a malformed stream, not a reason to dereference NULL.
		 */
		if (ra->guid_to_ds_map == NULL)
			return (EINVAL);
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (EINVAL);
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (EINVAL);
	} else {
		ref_os = os;
	}

	/* Hold the buffer containing the earlier copy of the data. */
	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
	if (err)
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		/* Drop the source buffer hold; it was leaked here before. */
		dmu_buf_rele(dbp, FTAG);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}

static int restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) { dmu_tx_t *tx; void *data; dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || drrs->drr_length > SPA_MAXBLOCKSIZE) return (EINVAL); data = restore_read(ra, drrs->drr_length); if (data == NULL) return (ra->err); if (dmu_object_info(os, drrs->drr_object, NULL) != 0) return (EINVAL); VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } tx = dmu_tx_create(os); dmu_tx_hold_spill(tx, db->db_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_abort(tx); return (err); } dmu_buf_will_dirty(db_spill, tx); if (db_spill->db_size <
drrs->drr_length) VERIFY(0 == dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); bcopy(data, db_spill->db_data, drrs->drr_length); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_commit(tx); return (0); } /* ARGSUSED */ noinline static int restore_free(struct restorearg *ra, objset_t *os, struct drr_free *drrf) { int err; if (drrf->drr_length != -1ULL && drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (EINVAL); if (dmu_object_info(os, drrf->drr_object, NULL) != 0) return (EINVAL); err = dmu_free_long_range(os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); return (err); } /* * NB: callers *must* call dmu_recv_end() if this succeeds. */ int dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep) { struct restorearg ra = { 0 }; dmu_replay_record_t *drr; objset_t *os; zio_cksum_t pcksum; int featureflags; if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) ra.byteswap = TRUE; { /* compute checksum of drr_begin record */ dmu_replay_record_t *drr; drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin = *drc->drc_drrb; if (ra.byteswap) { fletcher_4_incremental_byteswap(drr, sizeof (dmu_replay_record_t), &ra.cksum); } else { fletcher_4_incremental_native(drr, sizeof (dmu_replay_record_t), &ra.cksum); } kmem_free(drr, sizeof (dmu_replay_record_t)); } if (ra.byteswap) { struct drr_begin *drrb = drc->drc_drrb; drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } ra.vp = vp; ra.voff = *voffp; ra.bufsize = 1<<20; - ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); + ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP); /* these were verified in dmu_recv_begin */ 
ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == DMU_SUBSTREAM); ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); /* if this stream is dedup'ed, set up the avl tree for guid mapping */ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { minor_t minor; if (cleanup_fd == -1) { ra.err = EBADF; goto out; } ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); if (ra.err) { cleanup_fd = -1; goto out; } if (*action_handlep == 0) { ra.guid_to_ds_map = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); avl_create(ra.guid_to_ds_map, guid_compare, sizeof (guid_map_entry_t), offsetof(guid_map_entry_t, avlnode)); (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid, (void *)ra.guid_to_ds_map, DS_FIND_CHILDREN); ra.err = zfs_onexit_add_cb(minor, free_guid_map_onexit, ra.guid_to_ds_map, action_handlep); if (ra.err) goto out; } else { ra.err = zfs_onexit_cb_data(minor, *action_handlep, (void **)&ra.guid_to_ds_map); if (ra.err) goto out; } } /* * Read records and process them. */ pcksum = ra.cksum; while (ra.err == 0 && NULL != (drr = restore_read(&ra, sizeof (*drr)))) { if (issig(JUSTLOOKING) && issig(FORREAL)) { ra.err = EINTR; goto out; } if (ra.byteswap) backup_byteswap(drr); switch (drr->drr_type) { case DRR_OBJECT: { /* * We need to make a copy of the record header, * because restore_{object,write} may need to * restore_read(), which will invalidate drr. 
*/ struct drr_object drro = drr->drr_u.drr_object; ra.err = restore_object(&ra, os, &drro); break; } case DRR_FREEOBJECTS: { struct drr_freeobjects drrfo = drr->drr_u.drr_freeobjects; ra.err = restore_freeobjects(&ra, os, &drrfo); break; } case DRR_WRITE: { struct drr_write drrw = drr->drr_u.drr_write; ra.err = restore_write(&ra, os, &drrw); break; } case DRR_WRITE_BYREF: { struct drr_write_byref drrwbr = drr->drr_u.drr_write_byref; ra.err = restore_write_byref(&ra, os, &drrwbr); break; } case DRR_FREE: { struct drr_free drrf = drr->drr_u.drr_free; ra.err = restore_free(&ra, os, &drrf); break; } case DRR_END: { struct drr_end drre = drr->drr_u.drr_end; /* * We compare against the *previous* checksum * value, because the stored checksum is of * everything before the DRR_END record. */ if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) ra.err = ECKSUM; goto out; } case DRR_SPILL: { struct drr_spill drrs = drr->drr_u.drr_spill; ra.err = restore_spill(&ra, os, &drrs); break; } default: ra.err = EINVAL; goto out; } pcksum = ra.cksum; } ASSERT(ra.err != 0); out: if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); if (ra.err != 0) { /* * destroy what we created, so we don't leave it in the * inconsistent restoring state. 
*/ txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); if (drc->drc_real_ds != drc->drc_logical_ds) { mutex_exit(&drc->drc_logical_ds->ds_recvlock); dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); } } - kmem_free(ra.buf, ra.bufsize); + vmem_free(ra.buf, ra.bufsize); *voffp = ra.voff; return (ra.err); } struct recvendsyncarg { char *tosnap; uint64_t creation_time; uint64_t toguid; }; static int recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct recvendsyncarg *resa = arg2; return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); } static void recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct recvendsyncarg *resa = arg2; dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; ds->ds_prev->ds_phys->ds_guid = resa->toguid; ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; } static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { struct recvendsyncarg resa; dsl_dataset_t *ds = drc->drc_logical_ds; int err; /* * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() * expects it to have a ds_user_ptr (and zil), but clone_swap() * can close it. 
*/ txg_wait_synced(ds->ds_dir->dd_pool, 0); if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, drc->drc_force); if (err) goto out; } else { mutex_exit(&ds->ds_recvlock); dsl_dataset_rele(ds, dmu_recv_tag); (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); return (EBUSY); } resa.creation_time = drc->drc_drrb->drr_creation_time; resa.toguid = drc->drc_drrb->drr_toguid; resa.tosnap = drc->drc_tosnap; err = dsl_sync_task_do(ds->ds_dir->dd_pool, recv_end_check, recv_end_sync, ds, &resa, 3); if (err) { /* swap back */ (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); } out: mutex_exit(&ds->ds_recvlock); dsl_dataset_disown(ds, dmu_recv_tag); (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); return (err); } static int dmu_recv_new_end(dmu_recv_cookie_t *drc) { struct recvendsyncarg resa; dsl_dataset_t *ds = drc->drc_logical_ds; int err; /* * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() * expects it to have a ds_user_ptr (and zil), but clone_swap() * can close it. 
*/ txg_wait_synced(ds->ds_dir->dd_pool, 0); resa.creation_time = drc->drc_drrb->drr_creation_time; resa.toguid = drc->drc_drrb->drr_toguid; resa.tosnap = drc->drc_tosnap; err = dsl_sync_task_do(ds->ds_dir->dd_pool, recv_end_check, recv_end_sync, ds, &resa, 3); if (err) { /* clean up the fs we just recv'd into */ (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); } else { /* release the hold from dmu_recv_begin */ dsl_dataset_disown(ds, dmu_recv_tag); } return (err); } int dmu_recv_end(dmu_recv_cookie_t *drc) { if (drc->drc_logical_ds != drc->drc_real_ds) return (dmu_recv_existing_end(drc)); else return (dmu_recv_new_end(drc)); } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index e037f4133ff5..afdfa123221e 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1,5860 +1,5860 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" typedef enum zti_modes { zti_mode_fixed, /* value is # of threads (min 1) */ zti_mode_online_percent, /* value is % of online CPUs */ zti_mode_batch, /* cpu-intensive; value is ignored */ zti_mode_null, /* don't create a taskq */ zti_nmodes } zti_modes_t; #define ZTI_FIX(n) { zti_mode_fixed, (n) } #define ZTI_PCT(n) { zti_mode_online_percent, (n) } #define ZTI_BATCH { zti_mode_batch, 0 } #define ZTI_NULL { zti_mode_null, 0 } #define ZTI_ONE ZTI_FIX(1) typedef struct zio_taskq_info { enum zti_modes zti_mode; uint_t zti_value; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "issue", "issue_high", "intr", "intr_high" }; /* * Define the taskq threads for the following I/O types: * NULL, READ, WRITE, FREE, CLAIM, and IOCTL */ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; static dsl_syncfunc_t spa_sync_props; static boolean_t spa_has_active_shared_spare(spa_t *spa); static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ id_t zio_taskq_psrset_bind = PS_NONE; boolean_t zio_taskq_sysdc = 
B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); if (strval != NULL) VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); else VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { uint64_t size; uint64_t alloc; uint64_t cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (spa->spa_root_vdev != NULL) { alloc = metaslab_class_get_alloc(spa_normal_class(spa)); size = metaslab_class_get_space(spa_normal_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == FREAD), src); cap = (size == 0) ? 
0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, spa->spa_root_vdev->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; int err; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) { mutex_exit(&spa->spa_props_lock); return (0); } /* * Get properties from the MOS pool property object. 
*/ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_pool_t *dp; dsl_dataset_t *ds = NULL; dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); if ((err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds))) { rw_exit(&dp->dp_config_rwlock); break; } strval = kmem_alloc( MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); rw_exit(&dp->dp_config_rwlock); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } spa_prop_add_list(*nvp, prop, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); mutex_exit(&spa->spa_props_lock); out: if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. 
*/ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { zpool_prop_t prop; char *propname, *strval; uint64_t intval; objset_t *os; char *slash; propname = nvpair_name(elem); if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) return (EINVAL); switch (prop) { case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION)) error = EINVAL; break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = EINVAL; break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = ENOTSUP; break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = ENOTSUP; break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { uint64_t compress; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } if ((error = dmu_objset_hold(strval,FTAG,&os))) break; /* Must be ZPL and not gzip compressed. 
*/ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = ENOTSUP; } else if ((error = dsl_prop_get_integer(strval, zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL)) == 0 && !BOOTFS_COMPRESS_VALID(compress)) { error = ENOTSUP; } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < ZIO_FAILURE_MODE_WAIT || intval > ZIO_FAILURE_MODE_PANIC)) error = EINVAL; /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = EIO; } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = EINVAL; break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = EINVAL; break; case ZPOOL_PROP_DEDUPDITTO: if (spa_version(spa) < SPA_VERSION_DEDUP) error = ENOTSUP; else error = nvpair_value_uint64(elem, &intval); if (error == 0 && intval != 0 && intval < ZIO_DEDUPDITTO_MIN) error = EINVAL; break; default: break; } if (error) break; } if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = 
kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem; boolean_t need_sync = B_FALSE; zpool_prop_t prop; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { if ((prop = zpool_name_to_prop( nvpair_name(elem))) == ZPROP_INVAL) return (EINVAL); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; need_sync = B_TRUE; break; } if (need_sync) return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, spa, nvp, 3)); else return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { spa_error_entry_t *sa = (spa_error_entry_t *)a; spa_error_entry_t *sb = (spa_error_entry_t *)b; int ret; ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_t)); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. 
*/ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static taskq_t * spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, uint_t value) { uint_t flags = TASKQ_PREPOPULATE; boolean_t batch = B_FALSE; switch (mode) { case zti_mode_null: return (NULL); /* no taskq needed */ case zti_mode_fixed: ASSERT3U(value, >=, 1); value = MAX(value, 1); break; case zti_mode_batch: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = zio_taskq_batch_pct; break; case zti_mode_online_percent: flags |= TASKQ_THREADS_CPU_PCT; break; default: panic("unrecognized mode for %s taskq (%u:%u) in " "spa_activate()", name, mode, value); break; } if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; return (taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags)); } return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, spa->spa_proc, flags)); } static void spa_create_zio_taskqs(spa_t *spa) { int t, q; for (t = 0; t < ZIO_TYPES; t++) { for (q = 0; q < ZIO_TASKQ_TYPES; q++) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; char name[32]; (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); spa->spa_zio_taskq[t][q] = spa_taskq_create(spa, name, mode, value); } } } #ifdef _KERNEL static void spa_thread(void *arg) { callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != 
&p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, int mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; /* Only create a process if we're going to be around a while. 
*/ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } /* * Opposite of spa_activate(). 
*/
/*
 * Tears down the in-core state that spa_activate() builds: the dirty/txg
 * lists, the per-type zio taskqs, the two metaslab classes, the error-log
 * AVL trees and, if one was started, the covering spa_thread() process.
 * Leaves the pool in POOL_STATE_UNINITIALIZED.
 */
static void
spa_deactivate(spa_t *spa)
{
	int t, q;

	/*
	 * Preconditions: syncing is off, the DSL pool, root vdev and async
	 * zio root have already been released, and the pool was activated.
	 */
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	/*
	 * Destroy every zio taskq that exists.  Slots may be NULL because
	 * spa_taskq_create() returns NULL for zti_mode_null entries.
	 */
	for (t = 0; t < ZIO_TYPES; t++) {
		for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	/*
	 * Handshake with spa_thread(): request deactivation, wake it, then
	 * wait until it reports SPA_PROC_GONE (it also resets spa_proc to
	 * &p0 before broadcasting).
	 */
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
*/ static int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; int c; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (EINVAL); } for (c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { int i; ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Stop async tasks. */ spa_async_suspend(spa); /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { (void) zio_wait(spa->spa_async_zio_root); spa->spa_async_zio_root = NULL; } bpobj_close(&spa->spa_deferred_bpobj); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); /* * Close all vdevs. 
*/
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 *
 * The caller must hold every SCL_* config lock as writer (asserted below).
 * On return spa_spares.sav_vdevs/sav_count describe the freshly parsed
 * spares, and the ZPOOL_CONFIG_SPARES array inside sav_config has been
 * replaced by one produced with vdev_config_generate().
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	/* The old array is sized by the old count; free it before resetting. */
	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	/* Nothing configured: leave an empty list and keep sav_config as is. */
	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.   For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		/* An aux vdev is its own top-level vdev. */
		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		/* A spare that fails to open stays listed but is not added. */
		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	/* The generated nvlists and the temporary array are released here. */
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 *
 * The caller must hold every SCL_* config lock as writer (asserted below).
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs = NULL;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	/*
	 * Detach the current list from 'sav' so it can be matched against
	 * the new configuration below.
	 */
	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				/* Claimed: exclude it from the purge pass. */
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	/*
	 * When we jump here because sav_config is NULL, sav_count is still 0
	 * (reset above), so the uninitialized 'l2cache' is never touched.
	 */
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

/*
 * Read the packed nvlist stored in MOS object 'obj' and unpack it into
 * '*value'.  The packed size is taken from the first uint64_t of the
 * object's bonus buffer.  '*value' is set to NULL up front; the return value
 * is the error from dmu_read() or, if that succeeded, from nvlist_unpack().
 */
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	/*
	 * NOTE(review): the next two lines are unified-diff residue; '-' is
	 * the allocation being removed and '+' is its replacement, which adds
	 * KM_NODEBUG to the allocation flags.
	 */
-	packed = kmem_alloc(nvsize, KM_SLEEP);
+	packed = kmem_alloc(nvsize, KM_SLEEP | KM_NODEBUG);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	/* The packed buffer is only scratch space; free it on every path. */
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
*/ static void spa_check_removed(vdev_t *vd) { int c; for (c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { zfs_ereport_post(FM_EREPORT_RESOURCE_AUTOREPLACE, vd->vdev_spa, vd, NULL, 0, 0); spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_CHECK); } } /* * Validate the current config against the MOS config */ static boolean_t spa_config_valid(spa_t *spa, nvlist_t *config) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv; int c, i; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); /* * If we're doing a normal import, then build up any additional * diagnostic information about missing devices in this config. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; vdev_t *mtvd = mrvd->vdev_child[c]; if (tvd->vdev_ops == &vdev_missing_ops && mtvd->vdev_ops != &vdev_missing_ops && mtvd->vdev_islog) child[idx++] = vdev_config_generate(spa, mtvd, B_FALSE, 0); } if (idx) { VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, idx) == 0); VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); for (i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); } /* * Compare the root vdev tree with the information we have * from the MOS config (mrvd). Check each top-level vdev * with the corresponding MOS config top-level (mtvd). 
*/ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; vdev_t *mtvd = mrvd->vdev_child[c]; /* * Resolve any "missing" vdevs in the current configuration. * If we find that the MOS config has more accurate information * about the top-level vdev then use that vdev instead. */ if (tvd->vdev_ops == &vdev_missing_ops && mtvd->vdev_ops != &vdev_missing_ops) { if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) continue; /* * Device specific actions. */ if (mtvd->vdev_islog) { spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * XXX - once we have 'readonly' pool * support we should be able to handle * missing data devices by transitioning * the pool to readonly. */ continue; } /* * Swap the missing vdev with the data we were * able to obtain from the MOS config. */ vdev_remove_child(rvd, tvd); vdev_remove_child(mrvd, mtvd); vdev_add_child(rvd, mtvd); vdev_add_child(mrvd, tvd); spa_config_exit(spa, SCL_ALL, FTAG); vdev_load(mtvd); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_reopen(rvd); } else if (mtvd->vdev_islog) { /* * Load the slog device's state from the MOS config * since it's possible that the label does not * contain the most up-to-date information. */ vdev_load_log_state(tvd, mtvd); vdev_reopen(tvd); } } vdev_free(mrvd); spa_config_exit(spa, SCL_ALL, FTAG); /* * Ensure we were able to validate the config. 
*/ return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); } /* * Check for missing log devices */ static int spa_check_logs(spa_t *spa) { switch (spa->spa_log_state) { default: break; case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, DS_FIND_CHILDREN)) { spa_set_log_state(spa, SPA_LOG_MISSING); return (1); } break; } return (0); } static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; int c; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { metaslab_group_passivate(mg); slog_found = B_TRUE; } } return (slog_found); } static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) metaslab_group_activate(mg); } } int spa_offline_log(spa_t *spa) { int error = 0; if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, NULL, DS_FIND_CHILDREN)) == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). 
*/ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { int i; for (i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; if (error) { if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && type != DMU_OT_INTENT_LOG) atomic_add_64(&sle->sle_meta_count, 1); else atomic_add_64(&sle->sle_data_count, 1); } zio_data_buf_free(zio->io_data, zio->io_size); } /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { if (bp != NULL) { zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); void *data = zio_data_buf_alloc(size); zio_nowait(zio_read(rio, spa, bp, data, size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); } return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_rewind_policy_t policy; boolean_t verify_ok = B_FALSE; int error; zpool_get_rewind_policy(spa->spa_config, &policy); if (policy.zrp_request & ZPOOL_NEVER_REWIND) return (0); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); (void) zio_wait(rio); spa->spa_load_meta_errors = 
sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && sle.sle_data_count <= policy.zrp_maxdata) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); VERIFY(nvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (error) { if (error != ENXIO && error != EIO) error = EIO; return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) { return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val)); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (err); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. 
If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. 
*/ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig) { nvlist_t *config = spa->spa_config; char *ereport = FM_EREPORT_ZFS_POOL; int error; uint64_t pool_guid; nvlist_t *nvl; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) return (EINVAL); /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { error = EEXIST; } else { spa->spa_load_guid = pool_guid; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) { VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, KM_SLEEP) == 0); } gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); } spa->spa_minref = refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; return (error); } /* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. 
*/
/*
 * Parameters, as used below:
 *   pool_guid  expected pool guid; only asserted against spa_guid().
 *   config     nvlist holding ZPOOL_CONFIG_VDEV_TREE (EINVAL if absent).
 *   state      stored in spa_load_state; SPA_LOAD_TRYIMPORT skips the
 *              verification pass and the removed-device notifications.
 *   type       SPA_IMPORT_EXISTING parses with VDEV_ALLOC_LOAD, anything
 *              else with VDEV_ALLOC_SPLIT; SPA_IMPORT_ASSEMBLE additionally
 *              skips label validation, spare/l2cache loading and the MOS
 *              config validation.
 *   mosconfig  when false the pool is opened FREAD, the MOS config is
 *              loaded and installed, and spa_load() is re-run with B_TRUE.
 *   ereport    set to FM_EREPORT_ZFS_LOG_REPLAY when the log check fails.
 *
 * Returns 0 on success or an errno value.
 */
__attribute__((always_inline))
static inline int
spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t children, config_cache_txg = spa->spa_config_txg;
	int orig_mode = spa->spa_mode;
	int parse;
	uint64_t obj;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
		return (EINVAL);

	parse = (type == SPA_IMPORT_EXISTING ?
	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
		return (error);

	ASSERT(spa->spa_root_vdev == rvd);

	if (type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_guid(spa) == pool_guid);
	}

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		return (error);

	/*
	 * We need to validate the vdev labels against the configuration that
	 * we have in hand, which is dependent on the setting of mosconfig. If
	 * mosconfig is true then we're validating the vdev labels based on
	 * that config.  Otherwise, we're validating against the cached config
	 * (zpool.cache) that was read when we loaded the zfs module, and then
	 * later we will recursively call spa_load() and validate against
	 * the vdev config.
	 *
	 * If we're assembling a new pool that's been split off from an
	 * existing pool, the labels haven't yet been updated so we skip
	 * validation for now.
	 */
	if (type != SPA_IMPORT_ASSEMBLE) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		error = vdev_validate(rvd);
		spa_config_exit(spa, SCL_ALL, FTAG);

		if (error != 0)
			return (error);

		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
			return (ENXIO);
	}

	/*
	 * Find the best uberblock.
	 */
	vdev_uberblock_load(NULL, rvd, ub);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION)
		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.  We first check to see if the pool
	 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
	 * If it is, defer the vdev_guid_sum check till later so we
	 * can handle missing vdevs.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
	    rvd->vdev_guid_sum != ub->ub_guid_sum)
		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));

	/* Finish or undo a partly-completed split, then drop its record. */
	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_try_repair(spa, config);
		spa_config_exit(spa, SCL_ALL, FTAG);
		nvlist_free(spa->spa_config_splitting);
		spa->spa_config_splitting = NULL;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
	spa->spa_claim_max_txg = spa->spa_first_txg;
	spa->spa_prev_software_version = ub->ub_software_version;

	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Untrusted config: fetch the config stored in the MOS, check that
	 * the pool was not last used by a different host, then restart the
	 * whole load against the MOS config.
	 */
	if (!mosconfig) {
		uint64_t hostid;
		nvlist_t *policy = NULL, *nvconfig;

		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(nvconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

#ifdef	_KERNEL
			myhostid = zone_get_hostid(NULL);
#else	/* _KERNEL */
			/*
			 * We're emulating the system's hostid in userland, so
			 * we can't use zone_get_hostid().
			 */
			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
#endif	/* _KERNEL */
			if (hostid != 0 && myhostid != 0 &&
			    hostid != myhostid) {
				nvlist_free(nvconfig);
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa_name(spa), hostname,
				    (unsigned long)hostid);
				return (EBADF);
			}
		}
		/* Carry any rewind policy over into the MOS config. */
		if (nvlist_lookup_nvlist(spa->spa_config,
		    ZPOOL_REWIND_POLICY, &policy) == 0)
			VERIFY(nvlist_add_nvlist(nvconfig,
			    ZPOOL_REWIND_POLICY, policy) == 0);

		spa_config_set(spa, nvconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		/* Re-activate with the caller's mode, not the FREAD set above. */
		spa_activate(spa, orig_mode);

		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
	}

	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
	    &spa->spa_creation_version);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
	    &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * If we're assembling the pool from the split-off vdevs of
	 * an existing pool, we don't want to attach the spares & cache
	 * devices.
	 */

	/*
	 * Load any hot spares for this pool.
	 */
	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	} else if (error == 0) {
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	} else if (error == 0) {
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/* Start from the default; overridden below if the pool stores one. */
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
	if (error && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	if (error == 0) {
		uint64_t autoreplace;

		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
		    &spa->spa_dedup_ditto);

		spa->spa_autoreplace = (autoreplace != 0);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices.  We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
		spa_check_removed(spa->spa_root_vdev);
		/*
		 * For the import case, this is done in spa_import(), because
		 * at this point we're using the spare definitions from
		 * the MOS config, not necessarily from the userland config.
		 */
		if (state != SPA_LOAD_IMPORT) {
			spa_aux_check_removed(&spa->spa_spares);
			spa_aux_check_removed(&spa->spa_l2cache);
		}
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Load the DDTs (dedup tables).
	 */
	error = ddt_load(spa);
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	spa_update_dspace(spa);

	/*
	 * Validate the config, using the MOS config to fill in any
	 * information which might be missing.  If we fail to validate
	 * the config then declare the pool unfit for use. If we're
	 * assembling a pool from a split, the log is not transferred
	 * over.
	 */
	if (type != SPA_IMPORT_ASSEMBLE) {
		nvlist_t *nvconfig;

		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		if (!spa_config_valid(spa, nvconfig)) {
			nvlist_free(nvconfig);
			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
			    ENXIO));
		}
		nvlist_free(nvconfig);

		/*
		 * Now that we've validated the config, check the state of the
		 * root vdev.  If it can't be opened, it indicates one or
		 * more toplevel vdevs are faulted.
		 */
		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
			return (ENXIO);

		if (spa_check_logs(spa)) {
			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
		}
	}

	/*
	 * We've successfully opened the pool, verify that we're ready
	 * to start pushing transactions.
	 */
	if (state != SPA_LOAD_TRYIMPORT) {
		if ((error = spa_load_verify(spa)))
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
			    error));
	}

	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
	    spa->spa_load_max_txg == UINT64_MAX)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
		 * invoked from zil_claim_log_block()'s i/o done callback.
		 * Price of rollback is that we abandon the log.
		 */
		spa->spa_claiming = B_TRUE;

		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_claiming = B_FALSE;

		spa_set_log_state(spa, SPA_LOG_GOOD);
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.  We sync up to the highest
		 * claimed log block birth time so that claimed log blocks
		 * don't appear to be from the future.  spa_claim_max_txg
		 * will have been set for us by either zil_check_log_chain()
		 * (invoked from spa_check_logs()) or zil_claim() above.
		 */
		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 *
		 * If this is a verbatim import, trust the current
		 * in-core spa_config and update the disk labels.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT ||
		    state == SPA_LOAD_RECOVER ||
		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
		    vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		/*
		 * Delete any inconsistent datasets.
		 */
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
		 */
		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
	}

	return (0);
}

/*
 * Tear the pool down, lower spa_load_max_txg by one, re-activate the pool
 * with its previous mode and try the load again.  Returns spa_load()'s
 * result.
 */
static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	int mode = spa->spa_mode;

	spa_unload(spa);
	spa_deactivate(spa);

	spa->spa_load_max_txg--;

	spa_activate(spa, mode);
	spa_async_suspend(spa);

	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
}

/*
 * Load the pool, and if that fails and the rewind flags allow it, retry
 * with successively lower txg limits.  'max_request' is the txg limit for
 * the first attempt unless a RECOVER load already has a known spa_load_txg.
 */
static int
spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
    uint64_t max_request, int rewind_flags)
{
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rewind_txg;
	uint64_t min_txg;

	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
		spa->spa_load_max_txg = spa->spa_load_txg;
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		spa->spa_load_max_txg = max_request;
	}

	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
	    mosconfig);
	if (load_error == 0)
		return (0);

	/* Snapshot the config of the failed load before any retry. */
	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	if (rewind_flags & ZPOOL_NEVER_REWIND) {
		nvlist_free(config);
		return
(load_error); } /* Price of rolling back is discarding txgs, including log */ if (state == SPA_LOAD_RECOVER) spa_set_log_state(spa, SPA_LOG_CLEAR); spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state, mosconfig); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. 
*/ if (mutex_owner(&spa_namespace_lock) != curthread) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (ENOENT); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_rewind_policy_t policy; zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zrp_request & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, policy.zrp_request); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (ENOENT); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. 
*/ if (state == SPA_LOAD_RECOVER) { VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); } *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. 
*/ for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); if (nl2cache != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); } } } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. 
*/ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; VERIFY(nvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); if (spa_suspended(spa)) VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (EINVAL); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (ENOTSUP); /* * Set the pending device list so we correctly handle device in-use * checking. 
*/ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = EINVAL; goto out; } /* * The L2ARC currently only supports disk devices in * kernel context. For user-level testing, we allow it. */ #ifdef _KERNEL if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { error = ENOTBLK; goto out; } #endif vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatentating with the * current dev list. 
*/ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs) == 0); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) VERIFY(nvlist_dup(olddevs[i], &newdevs[i], KM_SLEEP) == 0); for (i = 0; i < ndevs; i++) VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], KM_SLEEP) == 0); VERIFY(nvlist_remove(sav->sav_config, config, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, newdevs, ndevs + oldndevs) == 0); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs) == 0); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); vdev_clear_stats(vd); (void) vdev_close(vd); } } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, const char *history_str, nvlist_t *zplprops) { spa_t *spa; char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; int c; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (EEXIST); } /* * Allocate a new spa_t structure. 
*/ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, NULL, altroot); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) version = SPA_VERSION; ASSERT(version <= SPA_VERSION); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = EINVAL; if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (c = 0; c < rvd->vdev_children; c++) { vdev_metaslab_set_size(rvd->vdev_child[c]); vdev_expand(rvd->vdev_child[c], txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. 
*/ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; /* * Create DDTs (dedup tables). */ ddt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. 
*/ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY) spa_history_create_obj(spa, tx); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(spa, props, tx); } dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* * We explicitly wait for the first transaction to complete so that our * bean counters are appropriately updated. */ txg_wait_synced(spa->spa_dsl_pool, txg); spa_config_sync(spa, B_FALSE, B_TRUE); if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); spa_history_log_version(spa, LOG_POOL_CREATE); spa->spa_minref = refcount_count(&spa->spa_refcount); mutex_exit(&spa_namespace_lock); return (0); } #ifdef _KERNEL /* * Get the root pool information from the root disk, then import the root pool * during the system boot up time. 
*/ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); static nvlist_t * spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) { nvlist_t *config; nvlist_t *nvtop, *nvroot; uint64_t pgid; if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) return (NULL); /* * Add this top-level vdev to the child array. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); /* * Put this pool's top-level vdevs into a root vdev. */ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &nvtop, 1) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); return (config); } /* * Walk the vdev tree and see if we can find a device with "better" * configuration. A configuration is "better" if the label on that * device has a more recent txg. */ static void spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) { int c; for (c = 0; c < vd->vdev_children; c++) spa_alt_rootvdev(vd->vdev_child[c], avd, txg); if (vd->vdev_ops->vdev_op_leaf) { nvlist_t *label; uint64_t label_txg; if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, &label) != 0) return; VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg) == 0); /* * Do we have a better boot device? */ if (label_txg > *txg) { *txg = label_txg; *avd = vd; } nvlist_free(label); } } /* * Import a root pool. * * For x86. 
devpath_list will consist of devid and/or physpath name of * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). * The GRUB "findroot" command will return the vdev we should boot. * * For Sparc, devpath_list consists the physpath name of the booting device * no matter the rootpool is a single device pool or a mirrored pool. * e.g. * "/pci@1f,0/ide@d/disk@0,0:a" */ int spa_import_rootpool(char *devpath, char *devid) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; nvlist_t *config, *nvtop; uint64_t guid, txg; char *pname; int error; /* * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(devpath, devid, &guid); #if defined(_OBP) && defined(_KERNEL) if (config == NULL) { if (strstr(devpath, "/iscsi/ssd") != NULL) { /* iscsi boot */ get_iscsi_bootpath_phy(devpath); config = spa_generate_rootconf(devpath, devid, &guid); } } #endif if (config == NULL) { cmn_err(CE_NOTE, "Can not read the pool label from '%s'", devpath); return (EIO); } VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pname)) != NULL) { /* * Remove the existing root pool from the namespace so that we * can replace it with the correct config we just read in. */ spa_remove(spa); } spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", pname); return (error); } /* * Get the boot vdev. 
*/ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", (u_longlong_t)guid); error = ENOENT; goto out; } /* * Determine if there is a better boot device. */ avd = bvd; spa_alt_rootvdev(rvd, &avd, &txg); if (avd != bvd) { cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " "try booting from '%s'", avd->vdev_path); error = EINVAL; goto out; } /* * If the boot device is part of a spare vdev then ensure that * we're booting off the active spare. */ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && !bvd->vdev_isspare) { cmn_err(CE_NOTE, "The boot device is currently spared. Please " "try booting from '%s'", bvd->vdev_parent-> vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); error = EINVAL; goto out; } error = 0; spa_history_log_version(spa, LOG_POOL_IMPORT); out: spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); nvlist_free(config); return (error); } #endif /* * Import a non-root pool into the system. */ int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_rewind_policy_t policy; uint64_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (EEXIST); } /* * Create and initialize the spa structure. 
*/ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = FREAD; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_config_sync(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); spa_history_log_version(spa, LOG_POOL_IMPORT); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_rewind_policy(config, &policy); if (policy.zrp_request & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; /* * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig * because the user-supplied config is actually the one to trust when * doing an import. */ if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, policy.zrp_request); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). 
*/ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_L2CACHE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 
spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); mutex_exit(&spa_namespace_lock); spa_history_log_version(spa, LOG_POOL_IMPORT); return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; char *poolname; spa_t *spa; uint64_t state; int error; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, FREAD); /* * Pass off the heavy lifting to spa_load(). * Pass TRUE for mosconfig because the user-supplied config * is actually the one to trust when doing an import. */ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. 
*/ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname) == 0); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & FWRITE)) return (EROFS); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (ENOENT); } /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. 
*/ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); /* * The pool will be in core if it's openable, * in which case we can modify its state. */ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { /* * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. */ txg_wait_synced(spa->spa_dsl_pool, 0); /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (EBUSY); } /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (EXDEV); } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. 
*/ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } spa_event_notify(spa, NULL, FM_EREPORT_ZFS_POOL_DESTROY); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); return (0); } /* * Destroy a storage pool. */ int spa_destroy(char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * Add a device to a storage pool. 
*/
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t txg, id;
	int error;
	int c;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
	if (error != 0) {
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	/* A missing spares or l2cache array simply means "none given". */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0) {
		nspares = 0;
	}

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0) {
		nl2cache = 0;
	}

	/* There has to be something to add. */
	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) {
		return (spa_vdev_exit(spa, vd, txg, EINVAL));
	}

	if (vd->vdev_children != 0) {
		error = vdev_create(vd, txg, B_FALSE);
		if (error != 0) {
			return (spa_vdev_exit(spa, vd, txg, error));
		}
	}

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD);
	if (error != 0) {
		return (spa_vdev_exit(spa, vd, txg, error));
	}

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {

		/*
		 * Set the vdev id to the first hole, if one exists.
		 */
		for (id = 0; id < rvd->vdev_children; id++) {
			if (rvd->vdev_child[id]->vdev_ishole) {
				vdev_free(rvd->vdev_child[id]);
				break;
			}
		}
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = id;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
*/ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, dtl_max_txg; ASSERTV(vdev_t *rvd = spa->spa_root_vdev;) vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * Spares can't replace logs */ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). 
*/ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) sprintf(oldvd->vdev_path, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* mark the device being resilvered */ newvd->vdev_resilvering = B_TRUE; /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. 
It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, FM_EREPORT_ZFS_DEVICE_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Restart the resilver */ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); if (spa->spa_bootfs) spa_event_notify(spa, newvd, FM_EREPORT_ZFS_BOOTFS_VDEV_ATTACH); return (0); } /* * Detach a device from a mirror or replacing vdev. * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; ASSERTV(vdev_t *rvd = spa->spa_root_vdev;) vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; int c, t; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). 
If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. 
*/ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a spare, then it implies * that the spare should become a real disk, and be removed from the * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0 && pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 
*/ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); cvd->vdev_resilvering = B_FALSE; } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. 
*/ vdpath = spa_strdup(vd->vdev_path); for (t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, FM_EREPORT_ZFS_DEVICE_REMOVE); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } /* * Split a set of devices from their mirrors, and create a new pool from them. 
 */
/*
 * 'newname' is the name of the pool to create, 'config' carries the vdev tree
 * naming the disks to split off, 'props' optionally carries pool properties
 * for the new pool, and 'exp' requests that the new pool be exported once the
 * split completes.  Returns 0 on success or an errno value.
 */
int
spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
    nvlist_t *props, boolean_t exp)
{
	int error = 0;
	uint64_t txg, *glist;
	spa_t *newspa;
	uint_t c, children, lastlog;
	nvlist_t **child, *nvl, *tmp;
	dmu_tx_t *tx;
	char *altroot = NULL;
	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
	boolean_t activate_slog;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	/* clear the log and flush everything up to now */
	activate_slog = spa_passivate_log(spa);
	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
	error = spa_offline_log(spa);
	txg = spa_vdev_config_enter(spa);

	if (activate_slog)
		spa_activate_log(spa);

	if (error != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	/* check new spa name before going any further */
	if (spa_lookup(newname) != NULL)
		return (spa_vdev_exit(spa, NULL, txg, EEXIST));

	/*
	 * scan through all the children to ensure they're all mirrors
	 */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* first, check to ensure we've got the right child count */
	rvd = spa->spa_root_vdev;
	lastlog = 0;
	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		/* don't count the holes & logs as children */
		if (vd->vdev_islog || vd->vdev_ishole) {
			if (lastlog == 0)
				lastlog = c;
			continue;
		}

		lastlog = 0;
	}
	/*
	 * A non-zero 'lastlog' is the index at which a trailing run of
	 * log/hole vdevs begins; only the children before it are expected
	 * in the supplied config.
	 */
	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* next, ensure no spare or cache devices are part of the split */
	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);

	/* then, loop over each vdev and validate it */
	for (c = 0; c < children; c++) {
		uint64_t is_hole = 0;

		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
		    &is_hole);

		/*
		 * A hole in the supplied config is only acceptable where the
		 * pool itself has a hole or a log at the same index; such
		 * slots leave vml[c] NULL.
		 */
		if (is_hole != 0) {
			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
				continue;
			} else {
				error = EINVAL;
				break;
			}
		}

		/* which disk is going to be split? */
		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
		    &glist[c]) != 0) {
			error = EINVAL;
			break;
		}

		/* look it up in the spa */
		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
		if (vml[c] == NULL) {
			error = ENODEV;
			break;
		}

		/* make sure there's nothing stopping the split */
		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
		    vml[c]->vdev_islog ||
		    vml[c]->vdev_ishole ||
		    vml[c]->vdev_isspare ||
		    vml[c]->vdev_isl2cache ||
		    !vdev_writeable(vml[c]) ||
		    vml[c]->vdev_children != 0 ||
		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
			error = EINVAL;
			break;
		}

		if (vdev_dtl_required(vml[c])) {
			error = EBUSY;
			break;
		}

		/* we need certain info from the top level */
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
		    vml[c]->vdev_top->vdev_ms_array) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
		    vml[c]->vdev_top->vdev_ms_shift) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
		    vml[c]->vdev_top->vdev_asize) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
		    vml[c]->vdev_top->vdev_ashift) == 0);
	}

	if (error != 0) {
		kmem_free(vml, children * sizeof (vdev_t *));
		kmem_free(glist, children * sizeof (uint64_t));
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	/* stop writers from using the disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_TRUE;
	}
	vdev_reopen(spa->spa_root_vdev);

	/*
	 * Temporarily record the splitting vdevs in the spa config.  This
	 * will disappear once the config is regenerated.
	 */
	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    glist, children) == 0);
	kmem_free(glist, children * sizeof (uint64_t));

	mutex_enter(&spa->spa_props_lock);
	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
	    nvl) == 0);
	mutex_exit(&spa->spa_props_lock);
	spa->spa_config_splitting = nvl;
	vdev_config_dirty(spa->spa_root_vdev);

	/* configure and create the new pool */
	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
	    spa_version(spa)) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    spa->spa_config_txg) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    spa_generate_guid(NULL)) == 0);
	/* altroot stays NULL if the property is not supplied. */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);

	/* add the new pool to the namespace */
	newspa = spa_add(newname, config, altroot);
	newspa->spa_config_txg = spa->spa_config_txg;
	spa_set_log_state(newspa, SPA_LOG_CLEAR);

	/* release the spa config lock, retaining the namespace lock */
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 1);

	spa_activate(newspa, spa_mode_global);
	spa_async_suspend(newspa);

	/* create the new pool from the disks of the original pool */
	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
	if (error)
		goto out;

	/* if that worked, generate a real config for the new pool */
	if (newspa->spa_root_vdev != NULL) {
		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
		    B_TRUE));
	}

	/* set the props */
	if (props != NULL) {
		spa_configfile_set(newspa, props, B_FALSE);
		error = spa_prop_set(newspa, props);
		if (error)
			goto out;
	}

	/* flush everything */
	txg = spa_vdev_config_enter(newspa);
	vdev_config_dirty(newspa->spa_root_vdev);
	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 2);

	spa_async_resume(newspa);

	/* finally, update the original pool's config */
	txg = spa_vdev_config_enter(spa);
	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0)
		dmu_tx_abort(tx);
	/*
	 * The vdevs are split and freed even if the tx could not be
	 * assigned; only the per-vdev history records are skipped then.
	 */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL) {
			vdev_split(vml[c]);
			if (error == 0)
				spa_history_log_internal(LOG_POOL_VDEV_DETACH,
				    spa, tx, "vdev=%s",
				    vml[c]->vdev_path);
			vdev_free(vml[c]);
		}
	}
	vdev_config_dirty(spa->spa_root_vdev);
	spa->spa_config_splitting = NULL;
	nvlist_free(nvl);
	if (error == 0)
		dmu_tx_commit(tx);
	(void) spa_vdev_exit(spa, NULL, txg, 0);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 3);

	/* split is complete; log a history record */
	spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
	    "split new pool %s from pool %s", newname, spa_name(spa));

	kmem_free(vml, children * sizeof (vdev_t *));

	/* if we're not going to mount the filesystems in userland, export */
	if (exp)
		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
		    B_FALSE, B_FALSE);

	return (error);

out:
	/* Tear down the half-built new pool and undo changes to the old one. */
	spa_unload(newspa);
	spa_deactivate(newspa);
	spa_remove(newspa);

	txg = spa_vdev_config_enter(spa);

	/* re-online all offlined disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_FALSE;
	}
	vdev_reopen(spa->spa_root_vdev);

	nvlist_free(spa->spa_config_splitting);
	spa->spa_config_splitting = NULL;
	(void) spa_vdev_exit(spa, NULL, txg, error);

	kmem_free(vml, children * sizeof (vdev_t *));
	return (error);
}

/*
 * Search the 'count' nvlists in 'nvpp' for one whose ZPOOL_CONFIG_GUID equals
 * 'target_guid'.  Returns the matching nvlist, or NULL if none matches.
 * Every element is required (VERIFY) to carry a ZPOOL_CONFIG_GUID entry.
 */
static nvlist_t *
spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
{
	int i;

	for (i = 0; i < count; i++) {
		uint64_t guid;

		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		if (guid == target_guid)
			return (nvpp[i]);
	}

	return (NULL);
}

/*
 * Replace the nvlist array stored under 'name' in 'config' with a copy of
 * 'dev' (which has 'count' entries) that omits 'dev_to_remove'.
 *
 * NOTE(review): the replacement array is sized for count - 1 entries, so
 * 'dev_to_remove' is assumed to be one of the elements of 'dev' -- confirm
 * against callers.
 */
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
	nvlist_t *dev_to_remove)
{
	nvlist_t **newdev = NULL;
	int i, j;

	if (count > 1)
		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);

	/* Duplicate every entry except the one being removed. */
	for (i = 0, j = 0; i < count; i++) {
		if (dev[i] == dev_to_remove)
			continue;
		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
	}

	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);

	/* The temporary duplicates and the array holding them are freed. */
	for (i = 0; i < count - 1; i++)
		nvlist_free(newdev[i]);

	if (count > 1)
		kmem_free(newdev, (count - 1) * sizeof (void *));
}

/*
 * Evacuate the device.
 *
 * Only log vdevs are supported; any other vdev yields ENOTSUP.  Returns 0 on
 * success or the error from spa_offline_log().
 */
static int
spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
{
	uint64_t txg;
	int error = 0;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	ASSERT(vd == vd->vdev_top);

	/*
	 * Evacuate the device.  We don't hold the config lock as writer
	 * since we need to do I/O but we do keep the
	 * spa_namespace_lock held.  Once this completes the device
	 * should no longer have any blocks allocated on it.
	 */
	if (vd->vdev_islog) {
		if (vd->vdev_stat.vs_alloc != 0)
			error = spa_offline_log(spa);
	} else {
		error = ENOTSUP;
	}

	if (error)
		return (error);

	/*
	 * The evacuation succeeded.  Remove any remaining MOS metadata
	 * associated with this vdev, and wait for these changes to sync.
	 */
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
	txg = spa_vdev_config_enter(spa);
	vd->vdev_removing = B_TRUE;
	vdev_dirty(vd, 0, NULL, txg);
	vdev_config_dirty(vd);
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	return (0);
}

/*
 * Complete the removal by cleaning up the namespace.
 *
 * Does nothing if the vdev still has space allocated.  Otherwise the vdev is
 * freed; if it was the last child of the root vdev the child array is
 * compacted, and if not a hole vdev is put in its place under the same id.
 */
static void
spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t id = vd->vdev_id;
	boolean_t last_vdev = (id == (rvd->vdev_children - 1));

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(vd == vd->vdev_top);

	/*
	 * Only remove any devices which are empty.
	 */
	if (vd->vdev_stat.vs_alloc != 0)
		return;

	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/* Take vd off the dirty lists before it is freed. */
	if (list_link_active(&vd->vdev_state_dirty_node))
		vdev_state_clean(vd);
	if (list_link_active(&vd->vdev_config_dirty_node))
		vdev_config_clean(vd);

	vdev_free(vd);

	if (last_vdev) {
		vdev_compact_children(rvd);
	} else {
		vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
		vdev_add_child(rvd, vd);
	}
	vdev_config_dirty(rvd);

	/*
	 * Reassess the health of our root vdev.
	 */
	vdev_reopen(rvd);
}

/*
 * Remove a device from the pool -
 *
 * Removing a device from the vdev namespace requires several steps
 * and can take a significant amount of time.  As a result we use
 * the spa_vdev_config_[enter/exit] functions which allow us to
 * grab and release the spa_config_lock while still holding the namespace
 * lock.  During each step the configuration is synced out.
 */

/*
 * Remove a device from the pool.  Currently, this supports removing only hot
 * spares, slogs, and level 2 ARC devices.
 */
/*
 * 'guid' identifies the device.  'unspare' allows a hot spare to be removed
 * even though it is currently in use in this pool.  Returns 0 on success,
 * EBUSY for an in-use spare when 'unspare' is not set, ENOTSUP for a normal
 * vdev, ENOENT if no device has that guid, or the evacuation error for a
 * log device.
 *
 * If the caller already holds spa_namespace_lock, spa_vdev_enter() and
 * spa_vdev_exit() are skipped and the error is returned directly.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	metaslab_group_t *mg;
	nvlist_t **spares, **l2cache, *nv;
	uint64_t txg = 0;
	uint_t nspares, nl2cache;
	int error = 0;
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);

	ASSERT(spa_writeable(spa));

	if (!locked)
		txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_spares.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
		/*
		 * Only remove the hot spare if it's not currently in use
		 * in this pool.
		 */
		if (vd == NULL || unspare) {
			spa_vdev_remove_aux(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
			spa_load_spares(spa);
			spa->spa_spares.sav_sync = B_TRUE;
		} else {
			error = EBUSY;
		}
	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
		/*
		 * Cache devices can always be removed.
		 */
		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	} else if (vd != NULL && vd->vdev_islog) {
		ASSERT(!locked);
		ASSERT(vd == vd->vdev_top);

		/*
		 * XXX - Once we have bp-rewrite this should
		 * become the common case.
		 */

		mg = vd->vdev_mg;

		/*
		 * Stop allocating from this vdev.
		 */
		metaslab_group_passivate(mg);

		/*
		 * Wait for the youngest allocations and frees to sync,
		 * and then wait for the deferral of those frees to finish.
		 */
		spa_vdev_config_exit(spa, NULL,
		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

		/*
		 * Attempt to evacuate the vdev.
		 */
		error = spa_vdev_remove_evacuate(spa, vd);

		txg = spa_vdev_config_enter(spa);

		/*
		 * If we couldn't evacuate the vdev, unwind.
		 */
		if (error) {
			metaslab_group_activate(mg);
			return (spa_vdev_exit(spa, NULL, txg, error));
		}

		/*
		 * Clean up the vdev namespace.
		 */
		spa_vdev_remove_from_namespace(spa, vd);

	} else if (vd != NULL) {
		/*
		 * Normal vdevs cannot be removed (yet).
		 */
		error = ENOTSUP;
	} else {
		/*
		 * There is no vdev of any kind with the specified guid.
		 */
		error = ENOENT;
	}

	if (!locked)
		return (spa_vdev_exit(spa, NULL, txg, error));

	return (error);
}

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 *
 * Children are searched before 'vd' itself.  Returns the vdev that should be
 * detached, or NULL if there is none in this subtree.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.  We always consider the first
	 * vdev in the list to be the oldest vdev, and the last one to be
	 * the newest (see spa_vdev_attach() for how that works).  In
	 * the case where the newest vdev is faulted, we will not automatically
	 * remove it after a resilver completes.  This is OK as it will require
	 * user intervention to determine which disk the admin wishes to keep.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops) {
		ASSERT(vd->vdev_children > 1);

		newvd = vd->vdev_child[vd->vdev_children - 1];
		oldvd = vd->vdev_child[0];

		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops) {
		vdev_t *first = vd->vdev_child[0];
		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];

		/*
		 * Whichever end carries the 'unspare' flag is the vdev to
		 * keep; the opposite end is the detach candidate.
		 */
		if (last->vdev_unspare) {
			oldvd = first;
			newvd = last;
		} else if (first->vdev_unspare) {
			oldvd = last;
			newvd = first;
		} else {
			oldvd = NULL;
		}

		/* newvd is only read when oldvd was set above. */
		if (oldvd != NULL &&
		    vdev_dtl_empty(newvd, DTL_MISSING) &&
		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);

		/*
		 * If there are more than two spares attached to a disk,
		 * and those spares are not required, then we want to
		 * attempt to free them up now so that they can be used
		 * by other pools.  Once we're back down to a single
		 * disk+spare, we stop removing them.
		 */
		if (vd->vdev_children > 2) {
			newvd = vd->vdev_child[1];

			if (newvd->vdev_isspare && last->vdev_isspare &&
			    vdev_dtl_empty(last, DTL_MISSING) &&
			    vdev_dtl_empty(last, DTL_OUTAGE) &&
			    !vdev_dtl_required(newvd))
				return (newvd);
		}
	}

	return (NULL);
}

/*
 * Repeatedly find a vdev that spa_vdev_resilver_done_hunt() reports as ready
 * and detach it, until none remain or a detach fails.
 */
static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd, *pvd, *ppvd;
	uint64_t guid, sguid, pguid, ppguid;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		/*
		 * Capture the guids while the config lock is held; the
		 * lock is dropped before spa_vdev_detach() is called.
		 */
		pvd = vd->vdev_parent;
		ppvd = pvd->vdev_parent;
		guid = vd->vdev_guid;
		pguid = pvd->vdev_guid;
		ppguid = ppvd->vdev_guid;
		sguid = 0;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
		    ppvd->vdev_children == 2) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			sguid = ppvd->vdev_child[1]->vdev_guid;
		}
		spa_config_exit(spa, SCL_ALL, FTAG);
		/* The config lock is not held on these early returns. */
		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
			return;
		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Update the stored path or FRU for this vdev.
 */
/*
 * 'guid' identifies a leaf vdev; 'value' is the new string; 'ispath' selects
 * the path (B_TRUE) or the FRU (B_FALSE).  Returns ENOENT if the guid is
 * unknown and ENOTSUP if the vdev is not a leaf; otherwise the result of
 * spa_vdev_state_exit().  The vdev is passed to spa_vdev_state_exit() only
 * when the stored value actually changed.
 */
int
spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
    boolean_t ispath)
{
	vdev_t *vd;
	boolean_t sync = B_FALSE;

	ASSERT(spa_writeable(spa));

	spa_vdev_state_enter(spa, SCL_ALL);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	if (ispath) {
		if (strcmp(value, vd->vdev_path) != 0) {
			spa_strfree(vd->vdev_path);
			vd->vdev_path = spa_strdup(value);
			sync = B_TRUE;
		}
	} else {
		/* Unlike the path, the FRU may be unset (NULL). */
		if (vd->vdev_fru == NULL) {
			vd->vdev_fru = spa_strdup(value);
			sync = B_TRUE;
		} else if (strcmp(value, vd->vdev_fru) != 0) {
			spa_strfree(vd->vdev_fru);
			vd->vdev_fru = spa_strdup(value);
			sync = B_TRUE;
		}
	}

	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
}

/*
 * Set the stored path of the leaf vdev identified by 'guid' to 'newpath'.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}

/*
 * Set the stored FRU of the leaf vdev identified by 'guid' to 'newfru'.
 */
int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}

/*
 * ==========================================================================
 * SPA Scanning
 * ==========================================================================
 */

/*
 * Cancel the current scan.  Returns EBUSY if a resilver is in progress,
 * otherwise the result of dsl_scan_cancel().
 */
int
spa_scan_stop(spa_t *spa)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	if (dsl_scan_resilvering(spa->spa_dsl_pool))
		return (EBUSY);
	return (dsl_scan_cancel(spa->spa_dsl_pool));
}

/*
 * Start a scan of type 'func'.  Returns ENOTSUP for POOL_SCAN_NONE or an
 * out-of-range function, 0 if a requested resilver turns out to be
 * unnecessary, and otherwise the result of dsl_scan().
 */
int
spa_scan(spa_t *spa, pool_scan_func_t func)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
		return (ENOTSUP);

	/*
	 * If a resilver was requested, but there is no DTL on a
	 * writeable leaf device, we have nothing to do.
	 */
	if (func == POOL_SCAN_RESILVER &&
	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		return (0);
	}

	return (dsl_scan(spa->spa_dsl_pool, func));
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

/*
 * Walk the vdev tree rooted at 'vd' and move every vdev that has
 * vdev_remove_wanted set into the REMOVED state, clearing its error counters.
 */
static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
	int c;

	if (vd->vdev_remove_wanted) {
		vd->vdev_remove_wanted = B_FALSE;
		vd->vdev_delayed_close = B_FALSE;
		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);

		/*
		 * We want to clear the stats, but we don't want to do a full
		 * vdev_clear() as that will cause us to throw away
		 * degraded/faulted state as well as attempt to reopen the
		 * device, all of which is a waste.
		 */
		vd->vdev_stat.vs_read_errors = 0;
		vd->vdev_stat.vs_write_errors = 0;
		vd->vdev_stat.vs_checksum_errors = 0;

		vdev_state_dirty(vd->vdev_top);
	}

	for (c = 0; c < vd->vdev_children; c++)
		spa_async_remove(spa, vd->vdev_child[c]);
}

/*
 * Walk the vdev tree rooted at 'vd' and reopen every vdev that has
 * vdev_probe_wanted set.
 */
static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
	int c;

	if (vd->vdev_probe_wanted) {
		vd->vdev_probe_wanted = B_FALSE;
		vdev_reopen(vd);	/* vdev_open() does the actual probe */
	}

	for (c = 0; c < vd->vdev_children; c++)
		spa_async_probe(spa, vd->vdev_child[c]);
}

/*
 * If the pool's autoexpand flag is set, walk the vdev tree rooted at 'vd' and
 * post an FM_EREPORT_ZFS_DEVICE_AUTOEXPAND event for every leaf vdev that has
 * a physical path.
 */
static void
spa_async_autoexpand(spa_t *spa, vdev_t *vd)
{
	int c;

	if (!spa->spa_autoexpand)
		return;

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		spa_async_autoexpand(spa, cvd);
	}

	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
		return;

	spa_event_notify(vd->vdev_spa, vd, FM_EREPORT_ZFS_DEVICE_AUTOEXPAND);
}

/*
 * Body of the async worker thread.  Takes a snapshot of the pending task
 * bits, clears them, and services each requested task in turn.  Before
 * exiting it clears spa_async_thread and wakes any waiters on spa_async_cv.
 */
static void
spa_async_thread(spa_t *spa)
{
	int tasks, i;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		uint64_t old_space, new_space;

		mutex_enter(&spa_namespace_lock);
		old_space = metaslab_class_get_space(spa_normal_class(spa));
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		new_space = metaslab_class_get_space(spa_normal_class(spa));
		mutex_exit(&spa_namespace_lock);

		/*
		 * If the pool grew as a result of the config update,
		 * then log an internal history event.
		 */
		if (new_space != old_space) {
			spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
			    spa, NULL,
			    "pool '%s' size: %llu(+%llu)",
			    spa_name(spa), new_space, new_space - old_space);
		}
	}

	/*
	 * See if any devices need to be marked REMOVED.
	 */
	if (tasks & SPA_ASYNC_REMOVE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_remove(spa, spa->spa_root_vdev);
		for (i = 0; i < spa->spa_l2cache.sav_count; i++)
			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
		for (i = 0; i < spa->spa_spares.sav_count; i++)
			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/* Autoexpand notifications are skipped while the pool is suspended. */
	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_async_autoexpand(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	 * See if any devices need to be probed.
	 */
	if (tasks & SPA_ASYNC_PROBE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_probe(spa, spa->spa_root_vdev);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		dsl_resilver_restart(spa->spa_dsl_pool, 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

/*
 * Increment the suspend count, which prevents spa_async_dispatch() from
 * starting a new worker, and wait for any running worker thread to finish.
 */
void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Undo one spa_async_suspend() by decrementing the suspend count.  This does
 * not itself dispatch a worker thread.
 */
void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Start the async worker thread if there are pending tasks, async processing
 * is not suspended, no worker is already running, and a root directory
 * exists that is not read-only.
 */
static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Record that 'task' (one or more SPA_ASYNC_* bits) should be serviced by the
 * async worker.  The bits are only OR-ed into spa_async_tasks here; starting
 * the worker is left to spa_async_dispatch().
 */
void
spa_async_request(spa_t *spa, int task)
{
	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

/*
 * Callback that enqueues 'bp' on the bpobj passed as 'arg'.  Always
 * returns 0.
 */
static int
bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	bpobj_t *bpo = arg;
	bpobj_enqueue(bpo, bp, tx);
	return (0);
}

/*
 * Callback that issues, without waiting, a free of 'bp' in tx's txg as a
 * child of the zio passed as 'arg'.  Always returns 0.
 */
static int
spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	zio_t *zio = arg;

	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
	    zio->io_flags));
	return (0);
}

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information.
This avoids the dbuf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); - packed = kmem_alloc(bufsize, KM_SLEEP); + packed = vmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); - kmem_free(packed, bufsize); + vmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. 
*/ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (sav->sav_count == 0) { VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; if (list_is_empty(&spa->spa_config_dirty_list)) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); spa_config_exit(spa, SCL_STATE, FTAG); if (spa->spa_config_syncing) nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } /* * Set zpool properties. */ static void spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; objset_t *mos = spa->spa_meta_objset; nvlist_t *nvp = arg2; nvpair_t *elem; uint64_t intval; char *strval; zpool_prop_t prop; const char *propname; zprop_type_t proptype; mutex_enter(&spa->spa_props_lock); elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem))) { switch (prop = zpool_name_to_prop(nvpair_name(elem))) { case ZPOOL_PROP_VERSION: /* * Only set version for non-zpool-creation cases * (set/import). spa_create() needs special care * for version setting. 
*/ if (tx->tx_txg != TXG_INITIAL) { VERIFY(nvpair_value_uint64(elem, &intval) == 0); ASSERT(intval <= SPA_VERSION); ASSERT(intval >= spa_version(spa)); spa->spa_uberblock.ub_version = intval; vdev_config_dirty(spa->spa_root_vdev); } break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persisitent * properties. */ break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { VERIFY((spa->spa_pool_props_object = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx)) > 0); VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 8, 1, &spa->spa_pool_props_object, tx) == 0); } /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); VERIFY(nvpair_value_string(elem, &strval) == 0); VERIFY(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx) == 0); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { VERIFY(nvpair_value_uint64(elem, &intval) == 0); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY(zpool_prop_index_to_string( prop, intval, &unused) == 0); } VERIFY(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx) == 0); } else { ASSERT(0); /* not allowed */ } switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_DEDUPDITTO: spa->spa_dedup_ditto = intval; break; default: break; } } 
/* log internal history if this is not a zpool create */ if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && tx->tx_txg != TXG_INITIAL) { spa_history_log_internal(LOG_POOL_PROPSET, spa, tx, "%s %lld %s", nvpair_name(elem), intval, spa_name(spa)); } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { dsl_pool_t *dp = spa->spa_dsl_pool; ASSERT(spa->spa_sync_pass == 1); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; int error; int c; VERIFY(spa_writeable(spa)); /* * Lock out configuration changes. 
*/ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while (list_head(&spa->spa_state_dirty_list) != NULL) { /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. */ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); tx = dmu_tx_create_assigned(dp, txg); /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY(0 == zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } /* * If anything has changed in this txg, or if someone is waiting * for this txg to sync (eg, spa_vdev_remove()), push the * deferred frees from the previous txg. If not, leave them * alone so that we don't generate work on an otherwise idle * system. 
*/ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || !txg_list_empty(&dp->dp_dirty_dirs, txg) || !txg_list_empty(&dp->dp_sync_tasks, txg) || ((dsl_scan_active(dp->dp_scan) || txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(defer_bpo, spa_free_sync_cb, zio, tx), ==, 0); VERIFY3U(zio_wait(zio), ==, 0); } /* * Iterate to convergence. */ do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass <= SYNC_PASS_DEFERRED_FREE) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(free_bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } else { bplist_iterate(free_bpl, bpobj_enqueue_cb, defer_bpo, tx); } ddt_sync(spa, txg); dsl_scan_sync(dp, tx); while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))) vdev_sync(vd, txg); if (pass == 1) spa_sync_upgrades(spa, tx); } while (dmu_objset_is_dirty(mos, txg)); /* * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few * random top-level vdevs that are known to be visible in the * config cache (see spa_vdev_add() for a complete description). * If there *are* dirty vdevs, sync the uberblock to all vdevs. */ for (;;) { /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. 
*/ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_DVAS_PER_BP]; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; if (vd->vdev_ms_array == 0 || vd->vdev_islog) continue; svd[svdcount++] = vd; if (svdcount == SPA_DVAS_PER_BP) break; } error = vdev_config_sync(svd, svdcount, txg, B_FALSE); if (error != 0) error = vdev_config_sync(svd, svdcount, txg, B_TRUE); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg, B_FALSE); if (error != 0) error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg, B_TRUE); } spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL); zio_resume_wait(spa); } dmu_tx_commit(tx); /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } spa->spa_ubsync = spa->spa_uberblock; dsl_pool_sync_done(dp, txg); /* * Update usable space statistics. */ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))) vdev_sync_done(vd, txg); spa_update_dspace(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); spa->spa_sync_pass = 0; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); } /* * Sync all pools. 
We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 *
 * Pools that are not POOL_STATE_ACTIVE, not writeable, or suspended are
 * skipped.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE ||
		    !spa_writeable(spa) || spa_suspended(spa))
			continue;
		/* The reference is held while the namespace lock is dropped. */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	/* Always restart from the head: each pass removes the spa it found. */
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * Find the vdev with the given guid.  The root vdev tree is searched first;
 * when 'aux' is set, the l2cache devices and then the spares are searched
 * as well.  Returns NULL if nothing matches.
 */
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

/*
 * Set the in-core uberblock version to 'version', dirty the root vdev's
 * config and wait for the change to be synced.
 */
void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

/*
 * Return B_TRUE if 'guid' matches one of the pool's spare vdevs or the
 * ZPOOL_CONFIG_GUID of one of its pending spare configs.
 */
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
* Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } /* * Post a FM_EREPORT_ZFS_* event from sys/fm/fs/zfs.h. The payload will be * filled in from the spa and (optionally) the vdev. This doesn't do anything * in the userland libzpool, as we don't want consumers to misinterpret ztest * or zdb as real changes. */ void spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) { #ifdef _KERNEL zfs_ereport_post(name, spa, vd, NULL, 0, 0); #endif } diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 1cf3950d450d..b7ef12a8fa38 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -1,487 +1,487 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/

/*
 * NOTE(review): the header names after each #include appear to have been
 * stripped from this copy of the patch -- restore them from the original
 * source before building.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef _KERNEL
#include
#include
#endif

/*
 * Pool configuration repository.
 *
 * Pool configuration is stored as a packed nvlist on the filesystem.  By
 * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
 * (when the ZFS module is loaded).  Pools can also have the 'cachefile'
 * property set that allows them to be stored in an alternate location under
 * the control of external software.
 *
 * For each cache file, we have a single nvlist which holds all the
 * configuration information.  When the module loads, we read this information
 * from /etc/zfs/zpool.cache and populate the SPA namespace.  This namespace is
 * maintained independently in spa.c.  Whenever the namespace is modified, or
 * the configuration of a pool is changed, we call spa_config_sync(), which
 * walks through all the active pools and writes the configuration to disk.
 */

/* Incremented by every spa_config_sync(); compared by spa_all_configs(). */
static uint64_t spa_config_generation = 1;

/*
 * This can be overridden in userland to preserve an alternate namespace for
 * userland pools when doing testing.
 */
const char *spa_config_path = ZPOOL_CACHE;

/*
 * Called when the module is first loaded, this routine loads the configuration
 * file into the SPA namespace.  It does not actually open or load the pools; it
 * only populates the namespace.
 *
 * NOTE(review): the '-'/'+' line pair below is markup from the enclosing
 * patch (adds KM_NODEBUG to the allocation flags), not C source.
 */
void
spa_config_load(void)
{
	void *buf = NULL;
	nvlist_t *nvlist, *child;
	nvpair_t *nvpair;
	char *pathname;
	struct _buf *file;
	uint64_t fsize;

	/*
	 * Open the configuration file.
	 */
	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	/* The path is prefixed with "./" whenever rootdir is set. */
	(void) snprintf(pathname, MAXPATHLEN, "%s%s",
	    (rootdir != NULL) ? "./" : "", spa_config_path);

	file = kobj_open_file(pathname);

	kmem_free(pathname, MAXPATHLEN);

	/* A missing or unopenable cache file is silently ignored. */
	if (file == (struct _buf *)-1)
		return;

	if (kobj_get_filesize(file, &fsize) != 0)
		goto out;

-	buf = kmem_alloc(fsize, KM_SLEEP);
+	buf = kmem_alloc(fsize, KM_SLEEP | KM_NODEBUG);

	/*
	 * Read the nvlist from the file.
	 */
	if (kobj_read_file(file, buf, fsize, 0) < 0)
		goto out;

	/*
	 * Unpack the nvlist.
	 */
	if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
		goto out;

	/*
	 * Iterate over all elements in the nvlist, creating a new spa_t for
	 * each one with the specified configuration.
	 */
	mutex_enter(&spa_namespace_lock);
	nvpair = NULL;
	while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
		if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
			continue;

		VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);

		/* Pools already present in the namespace are left alone. */
		if (spa_lookup(nvpair_name(nvpair)) != NULL)
			continue;
		(void) spa_add(nvpair_name(nvpair), child, NULL);
	}
	mutex_exit(&spa_namespace_lock);

	nvlist_free(nvlist);

out:
	if (buf != NULL)
		kmem_free(buf, fsize);

	kobj_close_file(file);
}

/*
 * Write the packed form of 'nvl' to the cache file named by dp->scd_path,
 * or remove that file when 'nvl' is NULL.  Failures to open, write, sync or
 * rename are not reported to the caller.
 *
 * NOTE(review): the '-'/'+' line pair below is markup from the enclosing
 * patch (adds KM_NODEBUG to the allocation flags), not C source.
 */
static void
spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
{
	size_t buflen;
	char *buf;
	vnode_t *vp;
	int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
	char *temp;

	/*
	 * If the nvlist is empty (NULL), then remove the old cachefile.
	 */
	if (nvl == NULL) {
		(void) vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE);
		return;
	}

	/*
	 * Pack the configuration into a buffer.
	 */
	VERIFY(nvlist_size(nvl, &buflen, NV_ENCODE_XDR) == 0);

-	buf = kmem_alloc(buflen, KM_SLEEP);
+	buf = kmem_alloc(buflen, KM_SLEEP | KM_NODEBUG);
	temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	/*
	 * Write the configuration to disk.  We need to do the traditional
	 * 'write to temporary file, sync, move over original' to make sure we
	 * always have a consistent view of the data.
	 */
	(void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path);

	if (vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) == 0) {
		/* Rename only after both the write and the fsync succeed. */
		if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
		    0, RLIM64_INFINITY, kcred, NULL) == 0 &&
		    VOP_FSYNC(vp, FSYNC, kcred, NULL) == 0) {
			(void) vn_rename(temp, dp->scd_path, UIO_SYSSPACE);
		}
		(void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL);
		VN_RELE(vp);
	}

	/* The temporary file is removed whether or not it was renamed. */
	(void) vn_remove(temp, UIO_SYSSPACE, RMFILE);

	kmem_free(buf, buflen);
	kmem_free(temp, MAXPATHLEN);
}

/*
 * Synchronize pool configuration to disk.  This must be called with the
 * namespace lock held.
 *
 * 'removing' leaves 'target' itself out of the files being rewritten;
 * 'postsysevent' posts FM_EREPORT_ZFS_CONFIG_SYNC once the sync is done.
 * Does nothing when rootdir is NULL or spa_mode_global lacks FWRITE.
 */
void
spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
{
	spa_config_dirent_t *dp, *tdp;
	nvlist_t *nvl;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	if (rootdir == NULL || !(spa_mode_global & FWRITE))
		return;

	/*
	 * Iterate over all cachefiles for the pool, past or present.  When the
	 * cachefile is changed, the new one is pushed onto this list, allowing
	 * us to update previous cachefiles that no longer contain this pool.
	 */
	for (dp = list_head(&target->spa_config_list); dp != NULL;
	    dp = list_next(&target->spa_config_list, dp)) {
		spa_t *spa = NULL;
		if (dp->scd_path == NULL)
			continue;

		/*
		 * Iterate over all pools, adding any matching pools to 'nvl'.
		 */
		nvl = NULL;
		while ((spa = spa_next(spa)) != NULL) {
			if (spa == target && removing)
				continue;

			mutex_enter(&spa->spa_props_lock);
			/* Only a pool's head (current) cachefile is compared. */
			tdp = list_head(&spa->spa_config_list);
			if (spa->spa_config == NULL ||
			    tdp->scd_path == NULL ||
			    strcmp(tdp->scd_path, dp->scd_path) != 0) {
				mutex_exit(&spa->spa_props_lock);
				continue;
			}

			if (nvl == NULL)
				VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME,
				    KM_SLEEP) == 0);

			VERIFY(nvlist_add_nvlist(nvl, spa->spa_name,
			    spa->spa_config) == 0);
			mutex_exit(&spa->spa_props_lock);
		}

		/* nvl is still NULL here if no pool uses this cachefile. */
		spa_config_write(dp, nvl);
		nvlist_free(nvl);
	}

	/*
	 * Remove any config entries older than the current one.
	 */
	dp = list_head(&target->spa_config_list);
	while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) {
		list_remove(&target->spa_config_list, tdp);
		if (tdp->scd_path != NULL)
			spa_strfree(tdp->scd_path);
		kmem_free(tdp, sizeof (spa_config_dirent_t));
	}

	spa_config_generation++;

	if (postsysevent)
		spa_event_notify(target, NULL, FM_EREPORT_ZFS_CONFIG_SYNC);
}

/*
 * Sigh.  Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
 * and we don't want to allow the local zone to see all the pools anyway.
 * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
 * information for all pool visible within the zone.
 *
 * Returns NULL when *generation already equals the current config
 * generation; otherwise returns a newly allocated nvlist and updates
 * *generation.
 */
nvlist_t *
spa_all_configs(uint64_t *generation)
{
	nvlist_t *pools;
	spa_t *spa = NULL;

	if (*generation == spa_config_generation)
		return (NULL);

	VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (INGLOBALZONE(curproc) ||
		    zone_dataset_visible(spa_name(spa), NULL)) {
			mutex_enter(&spa->spa_props_lock);
			VERIFY(nvlist_add_nvlist(pools, spa_name(spa),
			    spa->spa_config) == 0);
			mutex_exit(&spa->spa_props_lock);
		}
	}
	*generation = spa_config_generation;
	mutex_exit(&spa_namespace_lock);

	return (pools);
}

/*
 * Replace the pool's cached config with 'config' under spa_props_lock,
 * freeing the previous one.  The nvlist is stored as-is, not copied.
 */
void
spa_config_set(spa_t *spa, nvlist_t *config)
{
	mutex_enter(&spa->spa_props_lock);
	if (spa->spa_config != NULL)
		nvlist_free(spa->spa_config);
	spa->spa_config = config;
	mutex_exit(&spa->spa_props_lock);
}

/*
 * Generate the pool's configuration based on the current in-core state.
 * We infer whether to generate a complete config or just one top-level config
 * based on whether vd is the root vdev.
*/
nvlist_t *
spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
{
	nvlist_t *config, *nvroot;
	vdev_t *rvd = spa->spa_root_vdev;
	unsigned long hostid = 0;
	boolean_t locked = B_FALSE;	/* B_TRUE if we took the config lock */
	uint64_t split_guid;

	/* A NULL vd means the root vdev; only then is the lock taken here. */
	if (vd == NULL) {
		vd = rvd;
		locked = B_TRUE;
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}

	ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
	    (SCL_CONFIG | SCL_STATE));

	/*
	 * If txg is -1, report the current value of spa->spa_config_txg.
	 */
	if (txg == -1ULL)
		txg = spa->spa_config_txg;

	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
	    spa_version(spa)) == 0);
	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
	    spa_name(spa)) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    spa_state(spa)) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    txg) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    spa_guid(spa)) == 0);
#ifdef	_KERNEL
	hostid = zone_get_hostid(NULL);
#else	/* _KERNEL */
	/*
	 * We're emulating the system's hostid in userland, so we can't use
	 * zone_get_hostid().
	 */
	(void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
#endif	/* _KERNEL */
	/* A zero hostid is not recorded. */
	if (hostid != 0) {
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
		    hostid) == 0);
	}
	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
	    utsname.nodename) == 0);

	if (vd != rvd) {
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
		    vd->vdev_top->vdev_guid) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
		    vd->vdev_guid) == 0);
		if (vd->vdev_isspare)
			VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE,
			    1ULL) == 0);
		if (vd->vdev_islog)
			VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG,
			    1ULL) == 0);
		vd = vd->vdev_top;		/* label contains top config */
	} else {
		/*
		 * Only add the (potentially large) split information
		 * in the mos config, and not in the vdev labels
		 */
		if (spa->spa_config_splitting != NULL)
			VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
			    spa->spa_config_splitting) == 0);
	}

	/*
	 * Add the top-level config.  We even add this on pools which
	 * don't support holes in the namespace.
	 */
	vdev_top_config_generate(spa, config);

	/*
	 * If we're splitting, record the original pool's guid.
	 */
	if (spa->spa_config_splitting != NULL &&
	    nvlist_lookup_uint64(spa->spa_config_splitting,
	    ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
		    split_guid) == 0);
	}

	nvroot = vdev_config_generate(spa, vd, getstats, 0);
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);

	/*
	 * With stats requested and no load in progress, attach the dedup
	 * histogram, object stats and stats, each as a uint64 array.
	 */
	if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
		ddt_histogram_t *ddh;
		ddt_stat_t *dds;
		ddt_object_t *ddo;

		ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
		ddt_get_dedup_histogram(spa, ddh);
		VERIFY(nvlist_add_uint64_array(config,
		    ZPOOL_CONFIG_DDT_HISTOGRAM,
		    (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0);
		kmem_free(ddh, sizeof (ddt_histogram_t));

		ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
		ddt_get_dedup_object_stats(spa, ddo);
		VERIFY(nvlist_add_uint64_array(config,
		    ZPOOL_CONFIG_DDT_OBJ_STATS,
		    (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0);
		kmem_free(ddo, sizeof (ddt_object_t));

		dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
		ddt_get_dedup_stats(spa, dds);
		VERIFY(nvlist_add_uint64_array(config,
		    ZPOOL_CONFIG_DDT_STATS,
		    (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0);
		kmem_free(dds, sizeof (ddt_stat_t));
	}

	if (locked)
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);

	return (config);
}

/*
 * Update all disk labels, generate a fresh config based on the current
 * in-core state, and sync the global config cache (do not sync the config
 * cache if this is a booting rootpool).
 *
 * A SPA_CONFIG_UPDATE_POOL request finishes by calling itself once more
 * with SPA_CONFIG_UPDATE_VDEVS.
 */
void
spa_config_update(spa_t *spa, int what)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t txg;
	int c;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	txg = spa_last_synced_txg(spa) + 1;
	if (what == SPA_CONFIG_UPDATE_POOL) {
		vdev_config_dirty(rvd);
	} else {
		/*
		 * If we have top-level vdevs that were added but have
		 * not yet been prepared for allocation, do that now.
		 * (It's safe now because the config cache is up to date,
		 * so it will be able to translate the new DVAs.)
		 * See comments in spa_vdev_add() for full details.
		 */
		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			if (tvd->vdev_ms_array == 0)
				vdev_metaslab_set_size(tvd);
			vdev_expand(tvd, txg);
		}
	}
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Wait for the mosconfig to be regenerated and synced.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	/*
	 * Update the global config cache to reflect the new mosconfig.
	 */
	if (!spa->spa_is_root)
		spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);

	if (what == SPA_CONFIG_UPDATE_POOL)
		spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
}
diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c
index a65f16bccdde..ce7d378c6ff9 100644
--- a/module/zfs/spa_history.c
+++ b/module/zfs/spa_history.c
@@ -1,502 +1,501 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * NOTE(review): the header names after each bare #include appear to have
 * been stripped from this copy of the patch -- restore them from the
 * original source before building.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "zfs_comutil.h"
#ifdef _KERNEL
#include
#endif

/*
 * Routines to manage the on-disk history log.
 * * The history log is stored as a dmu object containing * <packed record length, record nvlist> tuples. * * Where "record nvlist" is an nvlist containing uint64_ts and strings, and * "packed record length" is the packed length of the "record nvlist" stored * as a little endian uint64_t. * * The log is implemented as a ring buffer, though the original creation * of the pool ('zpool create') is never overwritten. * * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer * of 'spa_history' stores the offsets for logging/retrieving history as * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of * where the 'zpool create' record is stored. This allows us to never * overwrite the original creation of the pool. 'sh_phys_max_off' is the * physical ending offset in bytes of the log. This tells you the length of * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record * is added, 'sh_eof' is incremented by the size of the record. * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). * This is where the consumer should start reading from after reading in * the 'zpool create' portion of the log. * * 'sh_records_lost' keeps track of how many records have been overwritten * and permanently lost.
*/ /* convert a logical offset to physical */ static uint64_t spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) { uint64_t phys_len; phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; return ((log_off - shpp->sh_pool_create_len) % phys_len + shpp->sh_pool_create_len); } void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) { dmu_buf_t *dbp; spa_history_phys_t *shpp; objset_t *mos = spa->spa_meta_objset; ASSERT(spa->spa_history == 0); spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, sizeof (spa_history_phys_t), tx); VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, sizeof (uint64_t), 1, &spa->spa_history, tx) == 0); VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); shpp = dbp->db_data; dmu_buf_will_dirty(dbp, tx); /* * Figure out maximum size of history log. We set it at * 1% of pool size, with a max of 32MB and min of 128KB. */ shpp->sh_phys_max_off = metaslab_class_get_dspace(spa_normal_class(spa)) / 100; shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); dmu_buf_rele(dbp, FTAG); } /* * Change 'sh_bof' to the beginning of the next record. 
*/ static int spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) { objset_t *mos = spa->spa_meta_objset; uint64_t firstread, reclen, phys_bof; char buf[sizeof (reclen)]; int err; phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, buf, DMU_READ_PREFETCH)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, buf + firstread, DMU_READ_PREFETCH)) != 0) return (err); } reclen = LE_64(*((uint64_t *)buf)); shpp->sh_bof += reclen + sizeof (reclen); shpp->sh_records_lost++; return (0); } static int spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, dmu_tx_t *tx) { uint64_t firstwrite, phys_eof; objset_t *mos = spa->spa_meta_objset; int err; ASSERT(MUTEX_HELD(&spa->spa_history_lock)); /* see if we need to reset logical BOF */ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - (shpp->sh_eof - shpp->sh_bof) <= len) { if ((err = spa_history_advance_bof(spa, shpp)) != 0) { return (err); } } phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); shpp->sh_eof += len; dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); len -= firstwrite; if (len > 0) { /* write out the rest at the beginning of physical file */ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, len, (char *)buf + firstwrite, tx); } return (0); } static char * spa_history_zone(void) { #ifdef _KERNEL return (curproc->p_zone->zone_name); #else return ("global"); #endif } /* * Write out a history event. 
 */
/*
 * Sync-task body.  'arg1' is the spa_t and 'arg2' a history_arg_t that
 * this function consumes: the record is built as an nvlist, packed with
 * XDR encoding, and appended to the history object as a little-endian
 * length followed by the packed bytes.  'hap' and its strings are freed
 * before returning.  Errors from spa_history_write() are not reported to
 * the caller (the function returns void).
 */
/*ARGSUSED*/
static void
spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t		*spa = arg1;
	history_arg_t	*hap = arg2;
	const char	*history_str = hap->ha_history_str;
	objset_t	*mos = spa->spa_meta_objset;
	dmu_buf_t	*dbp;
	spa_history_phys_t *shpp;
	size_t		reclen;
	uint64_t	le_len;
	nvlist_t	*nvrecord;
	char		*record_packed = NULL;
	int		ret;

	/*
	 * If we have an older pool that doesn't have a command
	 * history object, create it now.
	 */
	mutex_enter(&spa->spa_history_lock);
	if (!spa->spa_history)
		spa_history_create_obj(spa, tx);
	mutex_exit(&spa->spa_history_lock);

	/*
	 * Get the offset of where we need to write via the bonus buffer.
	 * Update the offset when the write completes.
	 */
	VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
	shpp = dbp->db_data;

	dmu_buf_will_dirty(dbp, tx);

#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbp, &doi);
		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
	}
#endif

	/* Fields common to every record: time, uid, optional zone, host. */
	VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
	    gethrestime_sec()) == 0);
	VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0);
	if (hap->ha_zone != NULL)
		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE,
		    hap->ha_zone) == 0);
#ifdef _KERNEL
	VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST,
	    utsname.nodename) == 0);
#endif
	/* Command records carry the string; other types add event and txg. */
	if (hap->ha_log_type == LOG_CMD_POOL_CREATE ||
	    hap->ha_log_type == LOG_CMD_NORMAL) {
		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD,
		    history_str) == 0);

		zfs_dbgmsg("command: %s", history_str);
	} else {
		VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT,
		    hap->ha_event) == 0);
		VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG,
		    tx->tx_txg) == 0);
		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR,
		    history_str) == 0);

		zfs_dbgmsg("internal %s pool:%s txg:%llu %s",
		    zfs_history_event_names[hap->ha_event], spa_name(spa),
		    (longlong_t)tx->tx_txg, history_str);

	}

	VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0);
	record_packed = kmem_alloc(reclen, KM_SLEEP);

	VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen,
	    NV_ENCODE_XDR, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_history_lock);
	/* The pool-create record must be the first thing in the log. */
	if (hap->ha_log_type == LOG_CMD_POOL_CREATE)
		VERIFY(shpp->sh_eof == shpp->sh_pool_create_len);

	/* write out the packed length as little endian */
	le_len = LE_64((uint64_t)reclen);
	ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
	if (!ret)
		ret = spa_history_write(spa, record_packed, reclen, shpp, tx);

	/* Grow the never-overwritten prefix to cover the create record. */
	if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) {
		shpp->sh_pool_create_len += sizeof (le_len) + reclen;
		shpp->sh_bof = shpp->sh_pool_create_len;
	}

	mutex_exit(&spa->spa_history_lock);
	nvlist_free(nvrecord);
	kmem_free(record_packed, reclen);
	dmu_buf_rele(dbp, FTAG);

	strfree(hap->ha_history_str);
	if (hap->ha_zone != NULL)
		strfree(hap->ha_zone);
	kmem_free(hap, sizeof (history_arg_t));
}

/*
 * Write out a history event.
 *
 * Assigns a tx, duplicates 'history_str' and the zone name into a
 * history_arg_t, and queues spa_history_log_sync() as a nowait sync task.
 * 'what' must not be LOG_INTERNAL.  Returns the dmu_tx_assign() error if
 * the tx could not be assigned, otherwise 0.
 */
int
spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what)
{
	history_arg_t *ha;
	int err = 0;
	dmu_tx_t *tx;

	ASSERT(what != LOG_INTERNAL);

	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
	ha->ha_history_str = strdup(history_str);
	ha->ha_zone = strdup(spa_history_zone());
	ha->ha_log_type = what;
	ha->ha_uid = crgetuid(CRED());

	/* Kick this off asynchronously; errors are ignored. */
	dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
	    spa_history_log_sync, spa, ha, 0, tx);
	dmu_tx_commit(tx);

	/* spa_history_log_sync will free ha and strings */
	return (err);
}

/*
 * Read out the command history.
 *
 * '*offp' is the offset to read from and is advanced past the bytes
 * returned; '*len' is the capacity of 'buf' on entry and the number of
 * bytes made available on return.  Returns ENOENT if the pool has no
 * history object, otherwise 0 or an error from dmu_bonus_hold() /
 * dmu_read().
 */
int
spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
{
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *dbp;
	uint64_t read_len, phys_read_off, phys_eof;
	uint64_t leftover = 0;
	spa_history_phys_t *shpp;
	int err;

	/*
	 * If the command history doesn't exist (older pool),
	 * that's ok, just return ENOENT.
	 */
	if (!spa->spa_history)
		return (ENOENT);

	/*
	 * The history is logged asynchronously, so when they request
	 * the first chunk of history, make sure everything has been
	 * synced to disk so that we get it.
	 */
	if (*offp == 0 && spa_writeable(spa))
		txg_wait_synced(spa_get_dsl(spa), 0);

	if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
		return (err);
	shpp = dbp->db_data;

#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbp, &doi);
		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
	}
#endif

	mutex_enter(&spa->spa_history_lock);
	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);

	if (*offp < shpp->sh_pool_create_len) {
		/* read in just the zpool create history */
		phys_read_off = *offp;
		read_len = MIN(*len, shpp->sh_pool_create_len -
		    phys_read_off);
	} else {
		/*
		 * Need to reset passed in offset to BOF if the passed in
		 * offset has since been overwritten.
		 */
		*offp = MAX(*offp, shpp->sh_bof);
		phys_read_off = spa_history_log_to_phys(*offp, shpp);

		/*
		 * Read up to the minimum of what the user passed down or
		 * the EOF (physical or logical).  If we hit physical EOF,
		 * use 'leftover' to read from the physical BOF.
		 */
		if (phys_read_off <= phys_eof) {
			read_len = MIN(*len, phys_eof - phys_read_off);
		} else {
			read_len = MIN(*len,
			    shpp->sh_phys_max_off - phys_read_off);
			if (phys_read_off + *len > shpp->sh_phys_max_off) {
				leftover = MIN(*len - read_len,
				    phys_eof - shpp->sh_pool_create_len);
			}
		}
	}

	/* offset for consumer to use next */
	*offp += read_len + leftover;

	/* tell the consumer how much you actually read */
	*len = read_len + leftover;

	/* Nothing to read from the first segment: done, report success. */
	if (read_len == 0) {
		mutex_exit(&spa->spa_history_lock);
		dmu_buf_rele(dbp, FTAG);
		return (0);
	}

	err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
	    DMU_READ_PREFETCH);
	if (leftover && err == 0) {
		err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
		    leftover, buf + read_len, DMU_READ_PREFETCH);
	}
	mutex_exit(&spa->spa_history_lock);

	dmu_buf_rele(dbp, FTAG);
	return (err);
}

/*
 * Format an internal event message and log it as a LOG_INTERNAL record,
 * either directly (when 'tx' is already syncing) or via a nowait sync
 * task.  Nothing is logged for TXG_INITIAL.
 *
 * NOTE(review): the lines prefixed with '+' and '-' below are unified-diff
 * hunk markers carried in this text: '-' lines are the implementation
 * being removed, '+' lines its replacement.
 */
static void
log_internal(history_internal_events_t event, spa_t *spa,
    dmu_tx_t *tx, const char *fmt, va_list adx)
{
	history_arg_t *ha;
+	va_list adx_copy;

	/*
	 * If this is part of creating a pool, not everything is
	 * initialized yet, so don't bother logging the internal events.
	 */
	if (tx->tx_txg == TXG_INITIAL)
		return;

	ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
-	ha->ha_history_str = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1,
-	    KM_SLEEP);
-
-	(void) vsprintf(ha->ha_history_str, fmt, adx);
-
+	va_copy(adx_copy, adx);
+	ha->ha_history_str = kmem_vasprintf(fmt, adx_copy);
+	va_end(adx_copy);
	ha->ha_log_type = LOG_INTERNAL;
	ha->ha_event = event;
	ha->ha_zone = NULL;
	ha->ha_uid = 0;

	if (dmu_tx_is_syncing(tx)) {
		spa_history_log_sync(spa, ha, tx);
	} else {
		dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
		    spa_history_log_sync, spa, ha, 0, tx);
	}
	/* spa_history_log_sync() will free ha and strings */
}

/*
 * printf-style front end to log_internal().  If 'tx' is NULL a tx is
 * created, assigned and committed here; if that assignment fails the
 * event is silently dropped.
 */
void
spa_history_log_internal(history_internal_events_t event, spa_t *spa,
    dmu_tx_t *tx, const char *fmt, ...)
{
	dmu_tx_t *htx = tx;
	va_list adx;

	/* create a tx if we didn't get one */
	if (tx == NULL) {
		htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
		if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
			dmu_tx_abort(htx);
			return;
		}
	}

	va_start(adx, fmt);
	log_internal(event, spa, htx, fmt, adx);
	va_end(adx);

	/* if we didn't get a tx from the caller, commit the one we made */
	if (tx == NULL)
		dmu_tx_commit(htx);
}

/*
 * Kernel only: record the pool/software versions as an internal history
 * event (when the pool version is at least SPA_VERSION_ZPOOL_HISTORY) and
 * emit a cmn_err() message describing the event.  Compiles to an empty
 * function outside the kernel.
 */
void
spa_history_log_version(spa_t *spa, history_internal_events_t event)
{
#ifdef _KERNEL
	uint64_t current_vers = spa_version(spa);

	if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) {
		spa_history_log_internal(event, spa, NULL,
		    "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s",
		    (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
		    utsname.nodename, utsname.release, utsname.version,
		    utsname.machine);
	}
	cmn_err(CE_CONT, "!%s version %llu pool %s using %llu",
	    event == LOG_POOL_IMPORT ? "imported" :
	    event == LOG_POOL_CREATE ? "created" : "accessed",
	    (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION);
#endif
}
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 6fb3f90e327a..31e82d879adf 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1,1681 +1,1681 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" /* * SPA locking * * There are four basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * * This lock must be acquired to do any of the following: * * - Lookup a spa_t by name * - Add or remove a spa_t from the namespace * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices * - Held for the duration of create/destroy/import/export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by * definition they must have an existing reference, and will never need * to lookup a spa_t by name. * * spa_refcount (per-spa refcount_t protected by mutex) * * This reference count keep track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. 
* * spa_config_lock[] (per-spa array of rwlocks) * * This protects the spa_t from config changes, and must be held in * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount * * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock can be acquired directly and is globally visible. * * The namespace is manipulated using the following functions, all of which * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * * spa_add() Create a new spa_t in the namespace. * * spa_remove() Remove a spa_t from the namespace. This also * frees up any memory associated with the spa_t. * * spa_next() Returns the next spa_t in the system, or the * first if NULL is passed. * * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * * spa_open_ref() Adds a reference to the given spa_t. Must be * called with spa_namespace_lock held if the * refcount is currently zero. * * spa_close() Remove a reference from the spa_t. This will * not free the spa_t or remove it from the * namespace. No locking is required. * * spa_refcount_zero() Returns true if the refcount is currently * zero. Must be called with spa_namespace_lock * held. * * The spa_config_lock[] is an array of rwlocks, ordered as follows: * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). 
* * To read the configuration, it suffices to hold one of these locks as reader. * To modify the configuration, you must hold all locks as writer. To modify * vdev state without altering the vdev tree's topology (e.g. online/offline), * you must hold SCL_STATE and SCL_ZIO as writer. * * We use these distinct config locks to avoid recursive lock entry. * For example, spa_sync() (which holds SCL_CONFIG as reader) induces * block allocations (SCL_ALLOC), which may require reading space maps * from disk (dmu_read() -> zio_read() -> SCL_ZIO). * * The spa config locks cannot be normal rwlocks because we need the * ability to hand off ownership. For example, SCL_ZIO is acquired * by the issuing thread and later released by an interrupt thread. * They do, however, obey the usual write-wanted semantics to prevent * writer (i.e. system administrator) starvation. * * The lock acquisition rules are as follows: * * SCL_CONFIG * Protects changes to the vdev tree topology, such as vdev * add/remove/attach/detach. Protects the dirty config list * (spa_config_dirty_list) and the set of spares and l2arc devices. * * SCL_STATE * Protects changes to pool state and vdev state, such as vdev * online/offline/fault/degrade/clear. Protects the dirty state list * (spa_state_dirty_list) and global pool state (spa_state). * * SCL_ALLOC * Protects changes to metaslab groups and classes. * Held as reader by metaslab_alloc() and metaslab_claim(). * * SCL_ZIO * Held by bp-level zios (those which have no io_vd upon entry) * to prevent changes to the vdev tree. The bp-level zio implicitly * protects all of its vdev child zios, which do not hold SCL_ZIO. * * SCL_FREE * Protects changes to metaslab groups and classes. * Held as reader by metaslab_free(). SCL_FREE is distinct from * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free * blocks in zio_done() while another i/o that holds either * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. 
 * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * * In addition, the following rules apply: * * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. * The lock ordering is SCL_CONFIG > spa_props_lock. * * (b) I/O operations on leaf vdevs. For any zio operation that takes * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), * or zio_write_phys() -- the caller must ensure that the config cannot * change in the interim, and that the vdev cannot be reopened. * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O * to complete, sync the updated configs to the * cache, and release the namespace lock. * * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. * * spa_rename() is also implemented within this file since it requires * manipulation of the namespace.
*/ static avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; static kcondvar_t spa_namespace_cv; static int spa_active_count; int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; int spa_mode_global; #ifdef ZFS_DEBUG /* Everything except dprintf is on by default in debug builds */ int zfs_flags = ~ZFS_DEBUG_DPRINTF; #else int zfs_flags = 0; #endif /* * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. */ int zfs_recover = 0; /* * ========================================================================== * SPA config locking * ========================================================================== */ static void spa_config_lock_init(spa_t *spa) { int i; for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); refcount_create(&scl->scl_count); scl->scl_writer = NULL; scl->scl_write_wanted = 0; } } static void spa_config_lock_destroy(spa_t *spa) { int i; for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); refcount_destroy(&scl->scl_count); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); } } int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) { int i; for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks ^ (1 << i), tag); return (0); } } else { ASSERT(scl->scl_writer != 
curthread); if (!refcount_is_zero(&scl->scl_count)) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks ^ (1 << i), tag); return (0); } scl->scl_writer = curthread; } (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } return (1); } void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) { int wlocks_held = 0; int i; for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || scl->scl_write_wanted) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (!refcount_is_zero(&scl->scl_count)) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } ASSERT(wlocks_held <= locks); } void spa_config_exit(spa_t *spa, int locks, void *tag) { int i; for (i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); ASSERT(!refcount_is_zero(&scl->scl_count)); if (refcount_remove(&scl->scl_count, tag) == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ cv_broadcast(&scl->scl_cv); } mutex_exit(&scl->scl_lock); } } int spa_config_held(spa_t *spa, int locks, krw_t rw) { int i, locks_held = 0; for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } return (locks_held); } /* * ========================================================================== * SPA namespace functions * 
========================================================================== */ /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. */ spa_t * spa_lookup(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; char c = 0; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * If it's a full dataset name, figure out the pool name and * just use that. */ cp = strpbrk(name, "/@"); if (cp) { c = *cp; *cp = '\0'; } (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); spa = avl_find(&spa_namespace_avl, &search, &where); if (cp) *cp = c; return (spa); } /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. */ spa_t * spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; int t; ASSERT(MUTEX_HELD(&spa_namespace_lock)); - spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); + spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP | KM_NODEBUG); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); for (t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); 
(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); avl_add(&spa_namespace_avl, spa); /* * Set the alternate root, if there is one. */ if (altroot) { spa->spa_root = spa_strdup(altroot); spa_active_count++; } /* * Every pool starts with the default cachefile */ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (config != NULL) VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); return (spa); } /* * Removes a spa_t from the namespace, freeing up any memory used. Requires * spa_namespace_lock. This is called only after the spa_t has been closed and * deactivated. 
*/ void spa_remove(spa_t *spa) { spa_config_dirent_t *dp; int t; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); if (spa->spa_root) { spa_strfree(spa->spa_root); spa_active_count--; } while ((dp = list_head(&spa->spa_config_list)) != NULL) { list_remove(&spa->spa_config_list, dp); if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); } list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); refcount_destroy(&spa->spa_refcount); spa_config_lock_destroy(spa); for (t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); kmem_free(spa, sizeof (spa_t)); } /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. */ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); else return (avl_first(&spa_namespace_avl)); } /* * ========================================================================== * SPA refcount functions * ========================================================================== */ /* * Add a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. 
*/ void spa_open_ref(spa_t *spa, void *tag) { ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_close(spa_t *spa, void *tag) { ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. We really compare against spa_minref, which is the * number of references acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); return (refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== * SPA spare and l2cache tracking * ========================================================================== */ /* * Hot spares and cache devices are tracked using the same code below, * for 'auxiliary' devices. 
*/ typedef struct spa_aux { uint64_t aux_guid; uint64_t aux_pool; avl_node_t aux_avl; int aux_count; } spa_aux_t; static int spa_aux_compare(const void *a, const void *b) { const spa_aux_t *sa = a; const spa_aux_t *sb = b; if (sa->aux_guid < sb->aux_guid) return (-1); else if (sa->aux_guid > sb->aux_guid) return (1); else return (0); } void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; spa_aux_t search; spa_aux_t *aux; search.aux_guid = vd->vdev_guid; if ((aux = avl_find(avl, &search, &where)) != NULL) { aux->aux_count++; } else { aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); aux->aux_guid = vd->vdev_guid; aux->aux_count = 1; avl_insert(avl, aux, where); } } void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; spa_aux_t *aux; avl_index_t where; search.aux_guid = vd->vdev_guid; aux = avl_find(avl, &search, &where); ASSERT(aux != NULL); if (--aux->aux_count == 0) { avl_remove(avl, aux); kmem_free(aux, sizeof (spa_aux_t)); } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { aux->aux_pool = 0ULL; } } boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; search.aux_guid = guid; found = avl_find(avl, &search, NULL); if (pool) { if (found) *pool = found->aux_pool; else *pool = 0ULL; } if (refcnt) { if (found) *refcnt = found->aux_count; else *refcnt = 0; } return (found != NULL); } void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; avl_index_t where; search.aux_guid = vd->vdev_guid; found = avl_find(avl, &search, &where); ASSERT(found != NULL); ASSERT(found->aux_pool == 0ULL); found->aux_pool = spa_guid(vd->vdev_spa); } /* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. * - A spare may be added to a pool even if it's actively in use within * another pool. * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. 
*
 * We keep track of all spares on the system through the use of a reference
 * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
 * spare, then we bump the reference count in the AVL tree.  In addition, we set
 * the 'vdev_isspare' member to indicate that the device is a spare (active or
 * inactive).  When a spare is made active (used to replace a device in the
 * pool), we also keep track of which pool it's been made a part of.
 *
 * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
 * called under the spa_namespace lock as part of vdev reconfiguration.  The
 * separate spare lock exists for the status query path, which does not need to
 * be completely consistent with respect to other vdev configuration changes.
 */

/* AVL comparator for the spare tree; defers to spa_aux_compare(). */
static int
spa_spare_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

/* Add a reference for vd in the spare tree and mark vd as a spare. */
void
spa_spare_add(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(!vd->vdev_isspare);
	spa_aux_add(vd, &spa_spare_avl);
	vd->vdev_isspare = B_TRUE;
	mutex_exit(&spa_spare_lock);
}

/* Drop vd's reference in the spare tree and clear its spare flag. */
void
spa_spare_remove(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_remove(vd, &spa_spare_avl);
	vd->vdev_isspare = B_FALSE;
	mutex_exit(&spa_spare_lock);
}

/*
 * Locked wrapper around spa_aux_exists() for the spare tree; 'pool' and
 * 'refcnt' are passed through unchanged.
 */
boolean_t
spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
{
	boolean_t found;

	mutex_enter(&spa_spare_lock);
	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);

	return (found);
}

/* Record vd's pool as the active user of this spare. */
void
spa_spare_activate(vdev_t *vd)
{
	mutex_enter(&spa_spare_lock);
	ASSERT(vd->vdev_isspare);
	spa_aux_activate(vd, &spa_spare_avl);
	mutex_exit(&spa_spare_lock);
}

/*
 * Level 2 ARC devices are tracked globally for the same reasons as spares.
 * Cache devices currently only support one pool per cache device, and so
 * for these devices the aux reference count is currently unused beyond 1.
*/

/* AVL comparator for the l2cache tree; defers to spa_aux_compare(). */
static int
spa_l2cache_compare(const void *a, const void *b)
{
	return (spa_aux_compare(a, b));
}

/* Add a reference for vd in the l2cache tree and mark vd as a cache device. */
void
spa_l2cache_add(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(!vd->vdev_isl2cache);
	spa_aux_add(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_TRUE;
	mutex_exit(&spa_l2cache_lock);
}

/* Drop vd's reference in the l2cache tree and clear its cache-device flag. */
void
spa_l2cache_remove(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_remove(vd, &spa_l2cache_avl);
	vd->vdev_isl2cache = B_FALSE;
	mutex_exit(&spa_l2cache_lock);
}

/*
 * Locked wrapper around spa_aux_exists() for the l2cache tree.  No reference
 * count is reported (NULL is passed for 'refcnt').
 */
boolean_t
spa_l2cache_exists(uint64_t guid, uint64_t *pool)
{
	boolean_t found;

	mutex_enter(&spa_l2cache_lock);
	found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);

	return (found);
}

/* Record vd's pool as the active user of this cache device. */
void
spa_l2cache_activate(vdev_t *vd)
{
	mutex_enter(&spa_l2cache_lock);
	ASSERT(vd->vdev_isl2cache);
	spa_aux_activate(vd, &spa_l2cache_avl);
	mutex_exit(&spa_l2cache_lock);
}

/*
 * ==========================================================================
 * SPA vdev locking
 * ==========================================================================
 */

/*
 * Lock the given spa_t for the purpose of adding or removing a vdev.
 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
 * It returns the next transaction group for the spa_t.
 */
uint64_t
spa_vdev_enter(spa_t *spa)
{
	/* Lock order: per-pool spa_vdev_top_lock first, then the namespace. */
	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	return (spa_vdev_config_enter(spa));
}

/*
 * Internal implementation for spa_vdev_enter().  Used when a vdev
 * operation requires multiple syncs (i.e. removing a device) while
 * keeping the spa_namespace_lock held.
 */
uint64_t
spa_vdev_config_enter(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

	return (spa_last_synced_txg(spa) + 1);
}

/*
 * Used in combination with spa_vdev_config_enter() to allow the syncing
 * of multiple transactions without releasing the spa_namespace_lock.
*/
void
spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
{
	int config_changed = B_FALSE;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(txg > spa_last_synced_txg(spa));

	spa->spa_pending_vdev = NULL;

	/*
	 * Reassess the DTLs.
	 */
	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);

	/* Only a successful operation that dirtied the config counts. */
	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
		config_changed = B_TRUE;
		spa->spa_config_generation++;
	}

	/*
	 * Verify the metaslab classes.
	 */
	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);

	spa_config_exit(spa, SCL_ALL, spa);

	/*
	 * Panic the system if the specified tag requires it.  This
	 * is useful for ensuring that configurations are updated
	 * transactionally.
	 */
	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, tag, 0);

	/*
	 * Note: this txg_wait_synced() is important because it ensures
	 * that there won't be more than one config change per txg.
	 * This allows us to use the txg as the generation number.
	 */
	if (error == 0)
		txg_wait_synced(spa->spa_dsl_pool, txg);

	/* Free the caller-supplied vdev, re-taking the config lock to do so. */
	if (vd != NULL) {
		ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
		vdev_free(vd);
		spa_config_exit(spa, SCL_ALL, spa);
	}

	/*
	 * If the config changed, update the config cache.
	 */
	if (config_changed)
		spa_config_sync(spa, B_FALSE, B_TRUE);
}

/*
 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
 * locking of spa_vdev_enter(), we also want to make sure the transactions have
 * synced to disk, and then update the global configuration cache with the new
 * information.  Returns 'error' unchanged.
 */
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
	/* Release in the reverse of the order taken by spa_vdev_enter(). */
	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Lock the given spa_t for the purpose of changing vdev state.
*/
void
spa_vdev_state_enter(spa_t *spa, int oplocks)
{
	/* Always take the state locks, plus whatever the caller asked for. */
	int locks = SCL_STATE_ALL | oplocks;

	/*
	 * Root pools may need to read from the underlying devfs filesystem
	 * when opening up a vdev.  Unfortunately if we're holding the
	 * SCL_ZIO lock it will result in a deadlock when we try to issue
	 * the read from the root filesystem.  Instead we "prefetch"
	 * the associated vnodes that we need prior to opening the
	 * underlying devices and cache them so that we can prevent
	 * any I/O when we are doing the actual open.
	 */
	if (spa_is_root(spa)) {
		/*
		 * 'low' keeps the bits of 'locks' at or above SCL_ZIO and
		 * 'high' the bits below it; vdev_hold() runs after 'high' is
		 * taken and before 'low' is.
		 */
		int low = locks & ~(SCL_ZIO - 1);
		int high = locks & ~low;

		spa_config_enter(spa, high, spa, RW_WRITER);
		vdev_hold(spa->spa_root_vdev);
		spa_config_enter(spa, low, spa, RW_WRITER);
	} else {
		spa_config_enter(spa, locks, spa, RW_WRITER);
	}
	/* Remembered so spa_vdev_state_exit() releases the same set. */
	spa->spa_vdev_locks = locks;
}

/*
 * Undo spa_vdev_state_enter().  If 'vd' is non-NULL its top-level vdev is
 * marked state-dirty, the config generation is bumped, and we wait for the
 * change to sync before updating the config cache.  Returns 'error' unchanged.
 */
int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
	boolean_t config_changed = B_FALSE;

	if (vd != NULL || error == 0)
		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
		    0, 0, B_FALSE);

	if (vd != NULL) {
		vdev_state_dirty(vd->vdev_top);
		config_changed = B_TRUE;
		spa->spa_config_generation++;
	}

	/* Pairs with the vdev_hold() done for root pools on entry. */
	if (spa_is_root(spa))
		vdev_rele(spa->spa_root_vdev);

	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
	spa_config_exit(spa, spa->spa_vdev_locks, spa);

	/*
	 * If anything changed, wait for it to sync.  This ensures that,
	 * from the system administrator's perspective, zpool(1M) commands
	 * are synchronous.  This is important for things like zpool offline:
	 * when the command completes, you expect no further I/O from ZFS.
	 */
	if (vd != NULL)
		txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	 * If the config changed, update the config cache.
	 */
	if (config_changed) {
		mutex_enter(&spa_namespace_lock);
		spa_config_sync(spa, B_FALSE, B_TRUE);
		mutex_exit(&spa_namespace_lock);
	}

	return (error);
}

/*
 * ==========================================================================
 * Miscellaneous functions
 * ==========================================================================
 */

/*
 * Rename a spa_t.
*/
int
spa_rename(const char *name, const char *newname)
{
	spa_t *spa;
	int err;

	/*
	 * Lookup the spa_t and grab the config lock for writing.  We need to
	 * actually open the pool so that we can sync out the necessary labels.
	 * It's OK to call spa_open() with the namespace lock held because we
	 * allow recursive calls for other reasons.
	 */
	mutex_enter(&spa_namespace_lock);
	if ((err = spa_open(name, &spa, FTAG)) != 0) {
		mutex_exit(&spa_namespace_lock);
		return (err);
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * The namespace tree is sorted by pool name, so the entry is removed
	 * before the name changes and re-added afterwards.  strlcpy() bounds
	 * the copy to sizeof (spa->spa_name).
	 */
	avl_remove(&spa_namespace_avl, spa);
	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
	avl_add(&spa_namespace_avl, spa);

	/*
	 * Sync all labels to disk with the new names by marking the root vdev
	 * dirty and waiting for it to sync.  It will pick up the new pool name
	 * during the sync.
	 */
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa->spa_dsl_pool, 0);

	/*
	 * Sync the updated config cache.
	 */
	spa_config_sync(spa, B_FALSE, B_TRUE);

	spa_close(spa, FTAG);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Return the spa_t associated with given pool_guid, if it exists.  If
 * device_guid is non-zero, determine whether the pool exists *and* contains
 * a device with the specified device_guid.  Returns NULL when nothing
 * matches (the loop runs off the end of the namespace tree).
 */
spa_t *
spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
{
	spa_t *spa;
	avl_tree_t *t = &spa_namespace_avl;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
		/* Skip pools that are uninitialized or have no root vdev. */
		if (spa->spa_state == POOL_STATE_UNINITIALIZED)
			continue;
		if (spa->spa_root_vdev == NULL)
			continue;
		if (spa_guid(spa) == pool_guid) {
			if (device_guid == 0)
				break;

			if (vdev_lookup_by_guid(spa->spa_root_vdev,
			    device_guid) != NULL)
				break;

			/*
			 * Check any devices we may be in the process of adding.
			 */
			if (spa->spa_pending_vdev) {
				if (vdev_lookup_by_guid(spa->spa_pending_vdev,
				    device_guid) != NULL)
					break;
			}
		}
	}

	return (spa);
}

/*
 * Determine whether a pool with the given pool_guid exists.
*/ boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { return (spa_by_guid(pool_guid, device_guid) != NULL); } char * spa_strdup(const char *s) { size_t len; char *new; len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); bcopy(s, new, len); new[len] = '\0'; return (new); } void spa_strfree(char *s) { kmem_free(s, strlen(s) + 1); } uint64_t spa_get_random(uint64_t range) { uint64_t r; ASSERT(range != 0); (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); return (r % range); } uint64_t spa_generate_guid(spa_t *spa) { uint64_t guid = spa_get_random(-1ULL); if (spa != NULL) { while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) guid = spa_get_random(-1ULL); } else { while (guid == 0 || spa_guid_exists(guid, 0)) guid = spa_get_random(-1ULL); } return (guid); } void sprintf_blkptr(char *buf, const blkptr_t *bp) { char *type = NULL; char *checksum = NULL; char *compress = NULL; if (bp != NULL) { type = dmu_ot[BP_GET_TYPE(bp)].ot_name; checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress); } void spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } spa_config_exit(spa, SCL_ALL, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } void zfs_panic_recover(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); va_end(adx); } /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexidecimal numbers that don't overflow. 
*/ uint64_t strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; int digit; while ((c = *str) != '\0') { if (c >= '0' && c <= '9') digit = c - '0'; else if (c >= 'a' && c <= 'f') digit = 10 + c - 'a'; else break; val *= 16; val += digit; str++; } if (nptr) *nptr = (char *)str; return (val); } /* * ========================================================================== * Accessor functions * ========================================================================== */ boolean_t spa_shutting_down(spa_t *spa) { return (spa->spa_async_suspended); } dsl_pool_t * spa_get_dsl(spa_t *spa) { return (spa->spa_dsl_pool); } blkptr_t * spa_get_rootblkptr(spa_t *spa) { return (&spa->spa_ubsync.ub_rootbp); } void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) { spa->spa_uberblock.ub_rootbp = *bp; } void spa_altroot(spa_t *spa, char *buf, size_t buflen) { if (spa->spa_root == NULL) buf[0] = '\0'; else (void) strncpy(buf, spa->spa_root, buflen); } int spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); } char * spa_name(spa_t *spa) { return (spa->spa_name); } uint64_t spa_guid(spa_t *spa) { /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. We stash the original pool guid in 'spa_load_guid' to handle * this case. 
*/
	if (spa->spa_root_vdev != NULL)
		return (spa->spa_root_vdev->vdev_guid);
	else
		return (spa->spa_load_guid);
}

/* Returns the txg of the last synced uberblock. */
uint64_t
spa_last_synced_txg(spa_t *spa)
{
	return (spa->spa_ubsync.ub_txg);
}

/* Returns the pool's spa_first_txg value. */
uint64_t
spa_first_txg(spa_t *spa)
{
	return (spa->spa_first_txg);
}

/* Returns the pool's spa_syncing_txg value. */
uint64_t
spa_syncing_txg(spa_t *spa)
{
	return (spa->spa_syncing_txg);
}

/* Returns the pool's state. */
pool_state_t
spa_state(spa_t *spa)
{
	return (spa->spa_state);
}

/* Returns the pool's load state. */
spa_load_state_t
spa_load_state(spa_t *spa)
{
	return (spa->spa_load_state);
}

/* Returns the pool's freeze txg (see spa_freeze()). */
uint64_t
spa_freeze_txg(spa_t *spa)
{
	return (spa->spa_freeze_txg);
}

/*
 * Worst-case allocated size for a logical size of 'lsize'.  The spa argument
 * is not used.
 */
/* ARGSUSED */
uint64_t
spa_get_asize(spa_t *spa, uint64_t lsize)
{
	/*
	 * The worst case is single-sector max-parity RAID-Z blocks, in which
	 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
	 * times the size; so just assume that.  Add to this the fact that
	 * we can have up to 3 DVAs per bp, and one more factor of 2 because
	 * the block may be dittoed with up to 3 DVAs by ddt_sync().
	 */
	return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
}

/* Returns the value cached by spa_update_dspace(). */
uint64_t
spa_get_dspace(spa_t *spa)
{
	return (spa->spa_dspace);
}

/*
 * Recompute spa_dspace as the normal class's dspace plus the dedup dspace.
 */
void
spa_update_dspace(spa_t *spa)
{
	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
	    ddt_get_dedup_dspace(spa);
}

/*
 * Return the failure mode that has been set to this pool. The default
 * behavior will be to block all I/Os when a complete failure occurs.
 */
uint8_t
spa_get_failmode(spa_t *spa)
{
	return (spa->spa_failmode);
}

/* Returns the pool's spa_suspended value. */
boolean_t
spa_suspended(spa_t *spa)
{
	return (spa->spa_suspended);
}

/* Returns the version recorded in the last synced uberblock. */
uint64_t
spa_version(spa_t *spa)
{
	return (spa->spa_ubsync.ub_version);
}

/* Returns the pool's spa_deflate value. */
boolean_t
spa_deflate(spa_t *spa)
{
	return (spa->spa_deflate);
}

/* Returns the pool's normal metaslab class. */
metaslab_class_t *
spa_normal_class(spa_t *spa)
{
	return (spa->spa_normal_class);
}

/* Returns the pool's log metaslab class. */
metaslab_class_t *
spa_log_class(spa_t *spa)
{
	return (spa->spa_log_class);
}

int
spa_max_replication(spa_t *spa)
{
	/*
	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
	 * handle BPs with more than one DVA allocated.  Set our max
	 * replication level accordingly.
*/ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) return (1); return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } int spa_prev_software_version(spa_t *spa) { return (spa->spa_prev_software_version); } uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } return (dsize); } uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; int d; for (d = 0; d < SPA_DVAS_PER_BP; d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); } uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; int d; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (d = 0; d < SPA_DVAS_PER_BP; d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); return (dsize); } /* * ========================================================================== * Initialization and Termination * ========================================================================== */ static int spa_name_compare(const void *a1, const void *a2) { const spa_t *s1 = a1; const spa_t *s2 = a2; int s; s = strcmp(s1->spa_name, s2->spa_name); if (s > 0) return (1); if (s < 0) return (-1); return (0); } void spa_boot_init(void) { spa_config_load(); } void spa_init(int mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof 
(spa_aux_t),
	    offsetof(spa_aux_t, aux_avl));

	spa_mode_global = mode;

	/* Bring up the subsystems; spa_fini() tears them down in reverse. */
	fm_init();
	refcount_init();
	unique_init();
	zio_init();
	dmu_init();
	zil_init();
	vdev_cache_stat_init();
	zfs_prop_init();
	zpool_prop_init();
	spa_config_load();
	l2arc_start();
}

/*
 * Tear down everything spa_init() set up: stop the L2ARC, evict all pools,
 * shut the subsystems down, then destroy the global trees, cv and mutexes.
 */
void
spa_fini(void)
{
	l2arc_stop();

	spa_evict_all();

	vdev_cache_stat_fini();
	zil_fini();
	dmu_fini();
	zio_fini();
	unique_fini();
	refcount_fini();
	fm_fini();

	avl_destroy(&spa_namespace_avl);
	avl_destroy(&spa_spare_avl);
	avl_destroy(&spa_l2cache_avl);

	cv_destroy(&spa_namespace_cv);
	mutex_destroy(&spa_namespace_lock);
	mutex_destroy(&spa_spare_lock);
	mutex_destroy(&spa_l2cache_lock);
}

/*
 * Return whether this pool has slogs. No locking needed.
 * It's not a problem if the wrong answer is returned as it's only for
 * performance and not correctness.
 */
boolean_t
spa_has_slogs(spa_t *spa)
{
	return (spa->spa_log_class->mc_rotor != NULL);
}

/* Returns the pool's log state. */
spa_log_state_t
spa_get_log_state(spa_t *spa)
{
	return (spa->spa_log_state);
}

/* Sets the pool's log state. */
void
spa_set_log_state(spa_t *spa, spa_log_state_t state)
{
	spa->spa_log_state = state;
}

/* Returns the pool's spa_is_root value. */
boolean_t
spa_is_root(spa_t *spa)
{
	return (spa->spa_is_root);
}

/* Returns whether the pool's mode has FWRITE set (normalized by !!). */
boolean_t
spa_writeable(spa_t *spa)
{
	return (!!(spa->spa_mode & FWRITE));
}

/* Returns the pool's mode flags. */
int
spa_mode(spa_t *spa)
{
	return (spa->spa_mode);
}

/* Returns the pool's spa_bootfs value. */
uint64_t
spa_bootfs(spa_t *spa)
{
	return (spa->spa_bootfs);
}

/* Returns the pool's spa_delegation value. */
uint64_t
spa_delegation(spa_t *spa)
{
	return (spa->spa_delegation);
}

/* Returns the pool's meta objset. */
objset_t *
spa_meta_objset(spa_t *spa)
{
	return (spa->spa_meta_objset);
}

/* Returns the pool's dedup checksum setting. */
enum zio_checksum
spa_dedup_checksum(spa_t *spa)
{
	return (spa->spa_dedup_checksum);
}

/*
 * Reset pool scan stat per scan pass (or reboot).
 */
void
spa_scan_stat_init(spa_t *spa)
{
	/* data not stored on disk */
	spa->spa_scan_pass_start = gethrestime_sec();
	spa->spa_scan_pass_exam = 0;
	vdev_scan_stat_init(spa->spa_root_vdev);
}

/*
 * Get scan stats for zpool status reports
 */
int
spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
{
	dsl_scan_t *scn = spa->spa_dsl_pool ?
spa->spa_dsl_pool->dp_scan : NULL; if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) return (ENOENT); bzero(ps, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_to_process = scn->scn_phys.scn_to_process; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; ps->pss_state = scn->scn_phys.scn_state; /* data not stored on disk */ ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_exam = spa->spa_scan_pass_exam; return (0); } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index f9f24dd0a945..5fc086e5de69 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -1,724 +1,724 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include /* * Pool-wide transaction groups. 
*/ static void txg_sync_thread(dsl_pool_t *dp); static void txg_quiesce_thread(dsl_pool_t *dp); int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ /* * Prepare the txg subsystem. */ void txg_init(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; int c; bzero(tx, sizeof (tx_state_t)); - tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); + tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); for (c = 0; c < max_ncpus; c++) { int i; mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); for (i = 0; i < TXG_SIZE; i++) { cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); list_create(&tx->tx_cpu[c].tc_callbacks[i], sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); } } mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); tx->tx_open_txg = txg; } /* * Close down the txg subsystem. */ void txg_fini(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; int c; ASSERT(tx->tx_threads == 0); mutex_destroy(&tx->tx_sync_lock); cv_destroy(&tx->tx_sync_more_cv); cv_destroy(&tx->tx_sync_done_cv); cv_destroy(&tx->tx_quiesce_more_cv); cv_destroy(&tx->tx_quiesce_done_cv); cv_destroy(&tx->tx_exit_cv); for (c = 0; c < max_ncpus; c++) { int i; mutex_destroy(&tx->tx_cpu[c].tc_lock); for (i = 0; i < TXG_SIZE; i++) { cv_destroy(&tx->tx_cpu[c].tc_cv[i]); list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); } } if (tx->tx_commit_cb_taskq != NULL) taskq_destroy(tx->tx_commit_cb_taskq); - kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); + vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); bzero(tx, sizeof (tx_state_t)); } /* * Start syncing transaction groups. 
*/
void
txg_sync_start(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	mutex_enter(&tx->tx_sync_lock);

	dprintf("pool %p\n", dp);

	ASSERT(tx->tx_threads == 0);

	/* One quiesce thread plus one sync thread. */
	tx->tx_threads = 2;

	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
	    dp, 0, &p0, TS_RUN, minclsyspri);

	/*
	 * The sync thread can need a larger-than-default stack size on
	 * 32-bit x86.  This is due in part to nested pools and
	 * scrub_visitbp() recursion.
	 */
	tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
	    dp, 0, &p0, TS_RUN, minclsyspri);

	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Common prologue for the txg threads: set up the CPR callback state and
 * take tx_sync_lock.  The lock is still held on return.
 */
static void
txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
	mutex_enter(&tx->tx_sync_lock);
}

/*
 * Common epilogue for the txg threads: clear the caller's thread pointer,
 * drop the thread count, wake anyone waiting on tx_exit_cv, and exit.
 */
static void
txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
{
	ASSERT(*tpp != NULL);
	*tpp = NULL;
	tx->tx_threads--;
	cv_broadcast(&tx->tx_exit_cv);
	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
	thread_exit();
}

/*
 * Wait on 'cv' under tx_sync_lock, bracketed by the CPR safe-point macros.
 * A non-zero 'time' bounds the wait to that many ticks from now; zero waits
 * indefinitely.
 */
static void
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
{
	CALLB_CPR_SAFE_BEGIN(cpr);

	if (time)
		(void) cv_timedwait(cv, &tx->tx_sync_lock,
		    ddi_get_lbolt() + time);
	else
		cv_wait(cv, &tx->tx_sync_lock);

	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
}

/*
 * Stop syncing transaction groups.
 */
void
txg_sync_stop(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	dprintf("pool %p\n", dp);
	/*
	 * Finish off any work in progress.
	 */
	ASSERT(tx->tx_threads == 2);

	/*
	 * We need to ensure that we've vacated the deferred space_maps.
	 */
	txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);

	/*
	 * Wake all sync threads and wait for them to die.
*/
	mutex_enter(&tx->tx_sync_lock);

	ASSERT(tx->tx_threads == 2);

	tx->tx_exiting = 1;

	cv_broadcast(&tx->tx_quiesce_more_cv);
	cv_broadcast(&tx->tx_quiesce_done_cv);
	cv_broadcast(&tx->tx_sync_more_cv);

	/* Each thread decrements tx_threads in txg_thread_exit(). */
	while (tx->tx_threads != 0)
		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);

	tx->tx_exiting = 0;

	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Take a hold on the currently open txg and return its number.  The hold is
 * counted in the current CPU's tx_cpu_t and recorded in 'th'.
 *
 * NOTE: returns with that tx_cpu_t's tc_lock still held; the lock is dropped
 * by txg_rele_to_quiesce().
 */
uint64_t
txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
{
	tx_state_t *tx = &dp->dp_tx;
	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
	uint64_t txg;

	mutex_enter(&tc->tc_lock);

	txg = tx->tx_open_txg;
	tc->tc_count[txg & TXG_MASK]++;

	th->th_cpu = tc;
	th->th_txg = txg;

	return (txg);
}

/*
 * Drop the tc_lock left held by txg_hold_open().  The hold count itself is
 * not changed here.
 */
void
txg_rele_to_quiesce(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;

	mutex_exit(&tc->tc_lock);
}

/*
 * Move the entries of 'tx_callbacks' onto the held txg's per-CPU callback
 * list.
 */
void
txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
	mutex_exit(&tc->tc_lock);
}

/*
 * Release the hold taken by txg_hold_open().  When the last hold on this
 * CPU's slot for the txg goes away, waiters on the slot's cv are woken.
 */
void
txg_rele_to_sync(txg_handle_t *th)
{
	tx_cpu_t *tc = th->th_cpu;
	int g = th->th_txg & TXG_MASK;

	mutex_enter(&tc->tc_lock);
	ASSERT(tc->tc_count[g] != 0);
	if (--tc->tc_count[g] == 0)
		cv_broadcast(&tc->tc_cv[g]);
	mutex_exit(&tc->tc_lock);

	th->th_cpu = NULL;	/* defensive */
}

static void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;
	int g = txg & TXG_MASK;
	int c;

	/*
	 * Grab all tx_cpu locks so nobody else can get into this txg.
	 */
	for (c = 0; c < max_ncpus; c++)
		mutex_enter(&tx->tx_cpu[c].tc_lock);

	ASSERT(txg == tx->tx_open_txg);
	tx->tx_open_txg++;

	/*
	 * Now that we've incremented tx_open_txg, we can let threads
	 * enter the next transaction group.
	 */
	for (c = 0; c < max_ncpus; c++)
		mutex_exit(&tx->tx_cpu[c].tc_lock);

	/*
	 * Quiesce the transaction group by waiting for everyone to txg_exit().
*/
	for (c = 0; c < max_ncpus; c++) {
		tx_cpu_t *tc = &tx->tx_cpu[c];
		mutex_enter(&tc->tc_lock);
		/* Woken by txg_rele_to_sync() when the count reaches zero. */
		while (tc->tc_count[g] != 0)
			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
		mutex_exit(&tc->tc_lock);
	}
}

/*
 * Taskq entry point: run the callbacks on 'cb_list', then destroy and free
 * the list itself (allocated by txg_dispatch_callbacks()).
 */
static void
txg_do_callbacks(list_t *cb_list)
{
	dmu_tx_do_callbacks(cb_list, 0);

	list_destroy(cb_list);

	kmem_free(cb_list, sizeof (list_t));
}

/*
 * Dispatch the commit callbacks registered on this txg to worker threads.
 */
static void
txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
{
	int c;
	tx_state_t *tx = &dp->dp_tx;
	list_t *cb_list;

	for (c = 0; c < max_ncpus; c++) {
		tx_cpu_t *tc = &tx->tx_cpu[c];
		/* No need to lock tx_cpu_t at this point */

		int g = txg & TXG_MASK;

		if (list_is_empty(&tc->tc_callbacks[g]))
			continue;

		if (tx->tx_commit_cb_taskq == NULL) {
			/*
			 * Commit callback taskq hasn't been created yet.
			 */
			tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
			    100, minclsyspri, max_ncpus, INT_MAX,
			    TASKQ_THREADS_CPU_PCT | TASKQ_PREPOPULATE);
		}

		/*
		 * Move this CPU's callbacks onto a private list that
		 * txg_do_callbacks() runs and then frees.
		 */
		cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(cb_list, sizeof (dmu_tx_callback_t),
		    offsetof(dmu_tx_callback_t, dcb_node));

		list_move_tail(cb_list, &tc->tc_callbacks[g]);

		(void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
		    txg_do_callbacks, cb_list, TQ_SLEEP);
	}
}

static void
txg_sync_thread(dsl_pool_t *dp)
{
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	uint64_t start, delta;

	txg_thread_enter(tx, &cpr);

	start = delta = 0;
	for (;;) {
		uint64_t timer, timeout = zfs_txg_timeout * hz;
		uint64_t txg;

		/*
		 * We sync when we're scanning, there's someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		timer = (delta >= timeout ?
0 : timeout - delta);
		while (!dsl_scan_active(dp->dp_scan) &&
		    !tx->tx_exiting && timer > 0 &&
		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
		    tx->tx_quiesced_txg == 0) {
			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
			/* Recompute how much of the timeout remains. */
			delta = ddi_get_lbolt() - start;
			timer = (delta > timeout ? 0 : timeout - delta);
		}

		/*
		 * Wait until the quiesce thread hands off a txg to us,
		 * prompting it to do so if necessary.
		 */
		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
			cv_broadcast(&tx->tx_quiesce_more_cv);
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
		}

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);

		/*
		 * Consume the quiesced txg which has been handed off to
		 * us.  This may cause the quiescing thread to now be
		 * able to quiesce another txg, so we must signal it.
		 */
		txg = tx->tx_quiesced_txg;
		tx->tx_quiesced_txg = 0;
		tx->tx_syncing_txg = txg;
		cv_broadcast(&tx->tx_quiesce_more_cv);

		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
		/* spa_sync() runs without tx_sync_lock held. */
		mutex_exit(&tx->tx_sync_lock);

		start = ddi_get_lbolt();
		spa_sync(spa, txg);
		delta = ddi_get_lbolt() - start;

		mutex_enter(&tx->tx_sync_lock);
		tx->tx_synced_txg = txg;
		tx->tx_syncing_txg = 0;
		cv_broadcast(&tx->tx_sync_done_cv);

		/*
		 * Dispatch commit callbacks to worker threads.
		 */
		txg_dispatch_callbacks(dp, txg);
	}
}

static void
txg_quiesce_thread(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;

	txg_thread_enter(tx, &cpr);

	for (;;) {
		uint64_t txg;

		/*
		 * We quiesce when there's someone waiting on us.
		 * However, we can only have one txg in "quiescing" or
		 * "quiesced, waiting to sync" state.  So we wait until
		 * the "quiesced, waiting to sync" txg has been consumed
		 * by the sync thread.
*/ while (!tx->tx_exiting && (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || tx->tx_quiesced_txg != 0)) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); if (tx->tx_exiting) txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); txg = tx->tx_open_txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); txg_quiesce(dp, txg); mutex_enter(&tx->tx_sync_lock); /* * Hand this txg off to the sync thread. */ dprintf("quiesce done, handing off txg %llu\n", txg); tx->tx_quiesced_txg = txg; cv_broadcast(&tx->tx_sync_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); } } /* * Delay this thread by 'ticks' if we are still in the open transaction * group and there is already a waiting txg quiesing or quiesced. Abort * the delay if this txg stalls or enters the quiesing state. */ void txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) { tx_state_t *tx = &dp->dp_tx; int timeout = ddi_get_lbolt() + ticks; /* don't delay if this txg could transition to quiesing immediately */ if (tx->tx_open_txg > txg || tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) return; mutex_enter(&tx->tx_sync_lock); if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { mutex_exit(&tx->tx_sync_lock); return; } while (ddi_get_lbolt() < timeout && tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, timeout); mutex_exit(&tx->tx_sync_lock); } void txg_wait_synced(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) txg = tx->tx_open_txg + TXG_DEFER_SIZE; if (tx->tx_sync_txg_waiting < txg) tx->tx_sync_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); while (tx->tx_synced_txg < txg) { dprintf("broadcasting sync more " "tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, 
tx->tx_sync_txg_waiting, dp); cv_broadcast(&tx->tx_sync_more_cv); cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); } mutex_exit(&tx->tx_sync_lock); } void txg_wait_open(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) txg = tx->tx_open_txg + 1; if (tx->tx_quiesce_txg_waiting < txg) tx->tx_quiesce_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); while (tx->tx_open_txg < txg) { cv_broadcast(&tx->tx_quiesce_more_cv); cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); } mutex_exit(&tx->tx_sync_lock); } boolean_t txg_stalled(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); } boolean_t txg_sync_waiting(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || tx->tx_quiesced_txg != 0); } /* * Per-txg object lists. */ void txg_list_create(txg_list_t *tl, size_t offset) { int t; mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); tl->tl_offset = offset; for (t = 0; t < TXG_SIZE; t++) tl->tl_head[t] = NULL; } void txg_list_destroy(txg_list_t *tl) { int t; for (t = 0; t < TXG_SIZE; t++) ASSERT(txg_list_empty(tl, t)); mutex_destroy(&tl->tl_lock); } int txg_list_empty(txg_list_t *tl, uint64_t txg) { return (tl->tl_head[txg & TXG_MASK] == NULL); } /* * Add an entry to the list. * Returns 0 if it's a new entry, 1 if it's already there. */ int txg_list_add(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); int already_on_list; mutex_enter(&tl->tl_lock); already_on_list = tn->tn_member[t]; if (!already_on_list) { tn->tn_member[t] = 1; tn->tn_next[t] = tl->tl_head[t]; tl->tl_head[t] = tn; } mutex_exit(&tl->tl_lock); return (already_on_list); } /* * Add an entry to the end of the list (walks list to find end). 
/*
 * Unlink one particular object from the list for (txg & TXG_MASK).
 *
 * Returns 'p' if its node was found and unlinked (its next pointer and
 * membership flag for that slot are cleared), or NULL if the node is not
 * on that list.
 */
void *
txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
{
	int slot = txg & TXG_MASK;
	txg_node_t *target = (txg_node_t *)((char *)p + tl->tl_offset);
	txg_node_t **link;
	void *removed = NULL;

	mutex_enter(&tl->tl_lock);

	/* Advance 'link' until it points at the pointer that holds target. */
	link = &tl->tl_head[slot];
	while (*link != NULL && *link != target)
		link = &(*link)->tn_next[slot];

	if (*link != NULL) {
		*link = target->tn_next[slot];
		target->tn_next[slot] = NULL;
		target->tn_member[slot] = 0;
		removed = p;
	}

	mutex_exit(&tl->tl_lock);

	return (removed);
}
NULL : (char *)tn - tl->tl_offset); } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index aa86bdecb11e..04369bbc50b3 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -1,1456 +1,1456 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
/*
 * Compute the hash for the key described by 'zn'.
 *
 * - With ZAP_FLAG_PRE_HASHED_KEY the first 64-bit word of the original
 *   key is used directly as the hash.
 * - Otherwise the hash starts from the zap's salt and is folded through
 *   zfs_crc64_table one byte at a time: over every byte of each 64-bit
 *   word for ZAP_FLAG_UINT64_KEY zaps, or over the normalized string
 *   (excluding its terminating null) for string keys.
 *
 * Only the top zap_hashbits() bits of the result are kept; the low bits
 * are cleared before returning.
 */
static uint64_t
zap_hash(zap_name_t *zn)
{
	zap_t *zap = zn->zn_zap;
	uint64_t h = 0;

	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
		h = *(uint64_t *)zn->zn_key_orig;
	} else {
		h = zap->zap_salt;
		ASSERT(h != 0);
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
			int i;
			const uint64_t *wp = zn->zn_key_norm;

			ASSERT(zn->zn_key_intlen == 8);
			for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) {
				int j;
				uint64_t word = *wp;

				/* Consume the word low byte first. */
				for (j = 0; j < zn->zn_key_intlen; j++) {
					h = (h >> 8) ^
					    zfs_crc64_table[(h ^ word) & 0xFF];
					word >>= NBBY;
				}
			}
		} else {
			int i, len;
			const uint8_t *cp = zn->zn_key_norm;

			/*
			 * We previously stored the terminating null on
			 * disk, but didn't hash it, so we need to
			 * continue to not hash it.  (The
			 * zn_key_*_numints includes the terminating
			 * null for non-binary keys.)
			 */
			len = zn->zn_key_norm_numints - 1;

			ASSERT(zn->zn_key_intlen == 1);
			for (i = 0; i < len; cp++, i++) {
				h = (h >> 8) ^
				    zfs_crc64_table[(h ^ *cp) & 0xFF];
			}
		}
	}
	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);

	return (h);
}
zap_hash(zn); return (zn); } static void mzap_byteswap(mzap_phys_t *buf, size_t size) { int i, max; buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); buf->mz_normflags = BSWAP_64(buf->mz_normflags); max = (size / MZAP_ENT_LEN) - 1; for (i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = BSWAP_64(buf->mz_chunk[i].mze_value); buf->mz_chunk[i].mze_cd = BSWAP_32(buf->mz_chunk[i].mze_cd); } } void zap_byteswap(void *buf, size_t size) { uint64_t block_type; block_type = *(uint64_t *)buf; if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { /* ASSERT(magic == ZAP_LEAF_MAGIC); */ mzap_byteswap(buf, size); } else { fzap_byteswap(buf, size); } } static int mze_compare(const void *arg1, const void *arg2) { const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; if (mze1->mze_hash > mze2->mze_hash) return (+1); if (mze1->mze_hash < mze2->mze_hash) return (-1); if (mze1->mze_cd > mze2->mze_cd) return (+1); if (mze1->mze_cd < mze2->mze_cd) return (-1); return (0); } static void mze_insert(zap_t *zap, int chunkid, uint64_t hash) { mzap_ent_t *mze; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); avl_add(&zap->zap_m.zap_avl, mze); } static mzap_ent_t * mze_find(zap_name_t *zn) { mzap_ent_t mze_tofind; mzap_ent_t *mze; avl_index_t idx; avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); mze_tofind.mze_hash = zn->zn_hash; mze_tofind.mze_cd = 0; again: mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } 
if (zn->zn_matchtype == MT_BEST) { zn->zn_matchtype = MT_FIRST; goto again; } return (NULL); } static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; mzap_ent_t *mze; avl_index_t idx; avl_tree_t *avl = &zap->zap_m.zap_avl; uint32_t cd; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; cd = 0; for (mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { if (mze->mze_cd != cd) break; cd++; } return (cd); } static void mze_remove(zap_t *zap, mzap_ent_t *mze) { ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); avl_remove(&zap->zap_m.zap_avl, mze); kmem_free(mze, sizeof (mzap_ent_t)); } static void mze_destroy(zap_t *zap) { mzap_ent_t *mze; void *avlcookie = NULL; while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))) kmem_free(mze, sizeof (mzap_ent_t)); avl_destroy(&zap->zap_m.zap_avl); } static zap_t * mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) { zap_t *winner; zap_t *zap; int i; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); zap->zap_objset = os; zap->zap_object = obj; zap->zap_dbuf = db; if (*(uint64_t *)db->db_data != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; } else { zap->zap_ismicro = TRUE; } /* * Make sure that zap_ismicro is set before we let others see * it, because zap_lockdir() checks zap_ismicro without the lock * held. 
*/ winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); if (winner != NULL) { rw_exit(&zap->zap_rwlock); rw_destroy(&zap->zap_rwlock); if (!zap->zap_ismicro) mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); return (winner); } if (zap->zap_ismicro) { zap->zap_salt = zap->zap_m.zap_phys->mz_salt; zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; if (mze->mze_name[0]) { zap_name_t *zn; zap->zap_m.zap_num_entries++; zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); mze_insert(zap, i, zn->zn_hash); zap_name_free(zn); } } } else { zap->zap_salt = zap->zap_f.zap_phys->zap_salt; zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); /* * The embedded pointer table should not overlap the * other members. */ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, &zap->zap_f.zap_phys->zap_salt); /* * The embedded pointer table should end at the end of * the block */ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 1<zap_f.zap_phys, ==, zap->zap_dbuf->db_size); } rw_exit(&zap->zap_rwlock); return (zap); } int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { zap_t *zap; dmu_buf_t *db; krw_t lt; int err; *zapp = NULL; err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH); if (err) return (err); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); } #endif zap = dmu_buf_get_user(db); if (zap == NULL) zap = mzap_open(os, obj, db); /* * We're checking zap_ismicro without the lock held, in order to * tell what type of lock we want. 
Once we have some sort of * lock, see if it really is the right type. In practice this * can only be different if it was upgraded from micro to fat, * and micro wanted WRITER but fat only needs READER. */ lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; rw_enter(&zap->zap_rwlock, lt); if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { /* it was upgraded, now we only need reader */ ASSERT(lt == RW_WRITER); ASSERT(RW_READER == (!zap->zap_ismicro && fatreader) ? RW_READER : lti); rw_downgrade(&zap->zap_rwlock); lt = RW_READER; } zap->zap_objset = os; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3P(zap->zap_dbuf, ==, db); ASSERT(!zap->zap_ismicro || zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; if (newsz > MZAP_MAX_BLKSZ) { dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; return (mzap_upgrade(zapp, tx, 0)); } err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); ASSERT3U(err, ==, 0); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; } *zapp = zap; return (0); } void zap_unlockdir(zap_t *zap) { rw_exit(&zap->zap_rwlock); dmu_buf_rele(zap->zap_dbuf, NULL); } static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) { mzap_phys_t *mzp; int i, sz, nchunks; int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); sz = zap->zap_dbuf->db_size; - mzp = kmem_alloc(sz, KM_SLEEP); + mzp = vmem_alloc(sz, KM_SLEEP); bcopy(zap->zap_dbuf->db_data, mzp, sz); nchunks = zap->zap_m.zap_num_chunks; if (!flags) { err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); if (err) { - kmem_free(mzp, sz); + vmem_free(mzp, sz); return (err); } } dprintf("upgrading obj=%llu with %u chunks\n", zap->zap_object, nchunks); /* XXX destroy the avl later, so we can use the 
stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx, flags); for (i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; zap_name_t *zn; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, mze->mze_value); zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ zap_name_free(zn); if (err) break; } - kmem_free(mzp, sz); + vmem_free(mzp, sz); *zapp = zap; return (err); } static void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_tx_t *tx) { dmu_buf_t *db; mzap_phys_t *zp; VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); } #endif dmu_buf_will_dirty(db, tx); zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; dmu_buf_rele(db, FTAG); if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. 
*/ VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags)); zap_unlockdir(zap); } } int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_norm(os, obj, 0, ot, bonustype, bonuslen, tx)); } int zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { int err; err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); if (err != 0) return (err); mzap_create_impl(os, obj, normflags, 0, tx); return (0); } uint64_t zap_create(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); } uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); mzap_create_impl(os, obj, normflags, 0, tx); return (obj); } uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && leaf_blockshift <= SPA_MAXBLOCKSHIFT && indirect_blockshift >= SPA_MINBLOCKSHIFT && indirect_blockshift <= SPA_MAXBLOCKSHIFT); VERIFY(dmu_object_set_blocksize(os, obj, 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); mzap_create_impl(os, obj, normflags, flags, tx); return (obj); } int zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) { /* * dmu_object_free will free the object number and free the * data. Freeing the data will cause our pageout function to be * called, which will destroy our data (zap_leaf_t's and zap_t). 
*/ return (dmu_object_free(os, zapobj, tx)); } _NOTE(ARGSUSED(0)) void zap_evict(dmu_buf_t *db, void *vzap) { zap_t *zap = vzap; rw_destroy(&zap->zap_rwlock); if (zap->zap_ismicro) mze_destroy(zap); else mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); } int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); if (!zap->zap_ismicro) { err = fzap_count(zap, count); } else { *count = zap->zap_m.zap_num_entries; } zap_unlockdir(zap); return (err); } /* * zn may be NULL; if not specified, it will be computed if needed. * See also the comment above zap_entry_normalization_conflict(). */ static boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) { mzap_ent_t *other; int direction = AVL_BEFORE; boolean_t allocdzn = B_FALSE; if (zap->zap_normflags == 0) return (B_FALSE); again: for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); other && other->mze_hash == mze->mze_hash; other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { if (zn == NULL) { zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, MT_FIRST); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } if (direction == AVL_BEFORE) { direction = AVL_AFTER; goto again; } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for manipulating attributes. 
/*
 * Look up 'name' in the zap object 'zapobj' using match type 'mt'.
 *
 * integer_size, num_integers: shape of the caller's buffer 'buf'.
 * realname, rn_len: optional buffer that receives the stored name of the
 *     matching entry; realname may be NULL (zap_lookup() and
 *     zap_contains() pass NULL with rn_len 0).
 * ncp: optional; on a microzap hit it is set to the result of
 *     mzap_normalization_conflict().
 *
 * Returns 0 on success; an error from zap_lockdir(); ENOTSUP if
 * zap_name_alloc() fails; for a microzap ENOENT if there is no entry,
 * EOVERFLOW if num_integers < 1, or EINVAL if integer_size != 8; for a
 * fat zap whatever fzap_lookup() returns.
 */
int
zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	zap_t *zap;
	int err;
	mzap_ent_t *mze;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc(zap, name, mt);
	if (zn == NULL) {
		zap_unlockdir(zap);
		return (ENOTSUP);
	}

	if (!zap->zap_ismicro) {
		err = fzap_lookup(zn, integer_size, num_integers, buf,
		    realname, rn_len, ncp);
	} else {
		mze = mze_find(zn);
		if (mze == NULL) {
			err = ENOENT;
		} else {
			/* Microzap values are always a single 8-byte int. */
			if (num_integers < 1) {
				err = EOVERFLOW;
			} else if (integer_size != 8) {
				err = EINVAL;
			} else {
				*(uint64_t *)buf =
				    MZE_PHYS(zap, mze)->mze_value;
				/*
				 * Callers that do not want the stored name
				 * pass realname == NULL; never hand a NULL
				 * destination or non-positive size to
				 * strlcpy().
				 */
				if (realname != NULL && rn_len > 0) {
					(void) strlcpy(realname,
					    MZE_PHYS(zap, mze)->mze_name,
					    rn_len);
				}
				if (ncp) {
					*ncp = mzap_normalization_conflict(zap,
					    zn, mze);
				}
			}
		}
	}
	zap_name_free(zn);
	zap_unlockdir(zap);
	return (err);
}
num_integers, buf, NULL, 0, NULL); zap_name_free(zn); zap_unlockdir(zap); return (err); } int zap_contains(objset_t *os, uint64_t zapobj, const char *name) { int err = (zap_lookup_norm(os, zapobj, name, 0, 0, NULL, MT_EXACT, NULL, 0, NULL)); if (err == EOVERFLOW || err == EINVAL) err = 0; /* found, but skipped reading the value */ return (err); } int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err; mzap_ent_t *mze; zap_name_t *zn; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, MT_EXACT); if (zn == NULL) { zap_unlockdir(zap); return (ENOTSUP); } if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { mze = mze_find(zn); if (mze == NULL) { err = ENOENT; } else { if (integer_size) *integer_size = 8; if (num_integers) *num_integers = 1; } } zap_name_free(zn); zap_unlockdir(zap); return (err); } int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err; zap_name_t *zn; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap); return (ENOTSUP); } err = fzap_length(zn, integer_size, num_integers); zap_name_free(zn); zap_unlockdir(zap); return (err); } static void mzap_addent(zap_name_t *zn, uint64_t value) { int i; zap_t *zap = zn->zn_zap; int start = zap->zap_m.zap_alloc_next; uint32_t cd; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { ASSERTV(mzap_ent_phys_t *mze=&zap->zap_m.zap_phys->mz_chunk[i]); ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd < zap_maxcd(zap)); again: for (i = 
/*
 * Add a new entry 'key' with the given value to the zap object 'zapobj'.
 *
 * The zap is locked as writer with adding == TRUE.  A fat zap is updated
 * through fzap_add().  A microzap is upgraded with mzap_upgrade() first
 * when the value is not a single 8-byte integer or the key does not fit
 * in MZAP_NAME_LEN; otherwise the entry is stored in the microzap.
 *
 * Returns 0 on success, an error from zap_lockdir(), ENOTSUP if
 * zap_name_alloc() fails, EEXIST if the microzap already has the key, or
 * the error from mzap_upgrade()/fzap_add().
 */
int
zap_add(objset_t *os, uint64_t zapobj, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;
	mzap_ent_t *mze;
	const uint64_t *intval = val;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc(zap, key, MT_EXACT);
	if (zn == NULL) {
		zap_unlockdir(zap);
		return (ENOTSUP);
	}
	if (!zap->zap_ismicro) {
		err = fzap_add(zn, integer_size, num_integers, val, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(key) >= MZAP_NAME_LEN) {
		/* The entry cannot live in a microzap; convert to fat. */
		err = mzap_upgrade(&zn->zn_zap, tx, 0);
		if (err == 0)
			err = fzap_add(zn, integer_size, num_integers, val, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		mze = mze_find(zn);
		if (mze != NULL) {
			err = EEXIST;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	/* 'zap' was re-read from zn above, so it must be released last. */
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap);
	return (err);
}
/*
 * Set the value stored under 'name' in the zap object 'zapobj', creating
 * the entry if it does not exist.
 *
 * The zap is locked as writer with adding == TRUE.  A fat zap is updated
 * through fzap_update().  A microzap is upgraded with mzap_upgrade() first
 * when the value is not a single 8-byte integer or the name does not fit
 * in MZAP_NAME_LEN; otherwise the existing microzap entry is overwritten
 * in place, or a new one is added with mzap_addent().
 *
 * Returns 0 on success, an error from zap_lockdir(), ENOTSUP if
 * zap_name_alloc() fails, or the error from mzap_upgrade()/fzap_update().
 */
int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	mzap_ent_t *mze;
	const uint64_t *intval = val;
	zap_name_t *zn;
	int err;

#ifdef ZFS_DEBUG
	uint64_t oldval;

	/*
	 * If there is an old value, it shouldn't change across the
	 * lockdir (eg, due to bprewrite's xlation).
	 */
	if (integer_size == 8 && num_integers == 1)
		(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
#endif

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc(zap, name, MT_EXACT);
	if (zn == NULL) {
		zap_unlockdir(zap);
		return (ENOTSUP);
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		/* The entry cannot live in a microzap; convert to fat. */
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    zapobj, integer_size, num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, tx, 0);
		if (err == 0)
			err = fzap_update(zn, integer_size, num_integers,
			    val, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		mze = mze_find(zn);
		if (mze != NULL) {
			/* oldval was fetched above under ZFS_DEBUG. */
			ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap);
	return (err);
}
} err = fzap_update(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap); return (err); } int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); } int zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx) { zap_t *zap; int err; mzap_ent_t *mze; zap_name_t *zn; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, mt); if (zn == NULL) { zap_unlockdir(zap); return (ENOTSUP); } if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { mze = mze_find(zn); if (mze == NULL) { err = ENOENT; } else { zap->zap_m.zap_num_entries--; bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], sizeof (mzap_ent_phys_t)); mze_remove(zap, mze); } } zap_name_free(zn); zap_unlockdir(zap); return (err); } int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; int err; zap_name_t *zn; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap); return (ENOTSUP); } err = fzap_remove(zn, tx); zap_name_free(zn); zap_unlockdir(zap); return (err); } /* * Routines for iterating over the attributes. 
/*
 * Pack the cursor position into a single 64-bit cookie.
 *
 * Returns -1ULL when the cursor's hash is -1ULL, and the cookie the cursor
 * was initialized with when it has not locked a zap yet.  Otherwise the
 * cookie is laid out as
 *
 *	[ collision differentiator | zap_hashbits()-bit hash value ]
 *
 * i.e. the top zap_hashbits() bits of the hash moved down to bit 0 with
 * zc_cd placed directly above them.  With the usual small (28-bit) hash
 * this keeps the high 32 bits of the cookie zero whenever the cd fits in
 * 4 bits, so that 32-bit programs can use it.
 */
uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
	int hashbits;
	uint64_t hash_part, cd_part;

	if (zc->zc_hash == -1ULL)
		return (-1ULL);
	if (zc->zc_zap == NULL)
		return (zc->zc_serialized);

	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));

	hashbits = zap_hashbits(zc->zc_zap);
	hash_part = zc->zc_hash >> (64 - hashbits);
	cd_part = (uint64_t)zc->zc_cd << hashbits;

	return (hash_part | cd_part);
}
*/ ASSERT(zc->zc_hash == 0); hb = zap_hashbits(zc->zc_zap); zc->zc_hash = zc->zc_serialized << (64 - hb); zc->zc_cd += zc->zc_serialized >> hb; if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { err = ENOENT; mze_tofind.mze_hash = zc->zc_hash; mze_tofind.mze_cd = zc->zc_cd; mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, idx, AVL_AFTER); } if (mze) { mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; (void) strcpy(za->za_name, mzep->mze_name); zc->zc_hash = mze->mze_hash; zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; } } rw_exit(&zc->zc_zap->zap_rwlock); return (err); } void zap_cursor_advance(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return; zc->zc_cd++; } int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) { int err = 0; mzap_ent_t *mze; zap_name_t *zn; if (zc->zc_zap == NULL) { err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, &zc->zc_zap); if (err) return (err); } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } zn = zap_name_alloc(zc->zc_zap, name, mt); if (zn == NULL) { rw_exit(&zc->zc_zap->zap_rwlock); return (ENOTSUP); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_move_to_key(zc, zn); } else { mze = mze_find(zn); if (mze == NULL) { err = ENOENT; goto out; } zc->zc_hash = mze->mze_hash; zc->zc_cd = mze->mze_cd; } out: zap_name_free(zn); rw_exit(&zc->zc_zap->zap_rwlock); return (err); } int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) { int err; zap_t *zap; err = zap_lockdir(os, zapobj, NULL, 
RW_READER, TRUE, FALSE, &zap); if (err) return (err); bzero(zs, sizeof (zap_stats_t)); if (zap->zap_ismicro) { zs->zs_blocksize = zap->zap_dbuf->db_size; zs->zs_num_entries = zap->zap_m.zap_num_entries; zs->zs_num_blocks = 1; } else { fzap_get_stats(zap, zs); } zap_unlockdir(zap); return (0); } int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, uint64_t *towrite, uint64_t *tooverwrite) { zap_t *zap; int err = 0; /* * Since, we don't have a name, we cannot figure out which blocks will * be affected in this operation. So, account for the worst case : * - 3 blocks overwritten: target leaf, ptrtbl block, header block * - 4 new blocks written if adding: * - 2 blocks for possibly split leaves, * - 2 grown ptrtbl blocks * * This also accomodates the case where an add operation to a fairly * large microzap results in a promotion to fatzap. */ if (name == NULL) { *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; return (err); } /* * We lock the zap with adding == FALSE. Because, if we pass * the actual value of add, it could trigger a mzap_upgrade(). * At present we are just evaluating the possibility of this operation * and hence we donot want to trigger an upgrade. */ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); if (!zap->zap_ismicro) { zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT); if (zn) { err = fzap_count_write(zn, add, towrite, tooverwrite); zap_name_free(zn); } else { /* * We treat this case as similar to (name == NULL) */ *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; } } else { /* * We are here if (name != NULL) and this is a micro-zap. * We account for the header block depending on whether it * is freeable. * * Incase of an add-operation it is hard to find out * if this add will promote this microzap to fatzap. * Hence, we consider the worst case and account for the * blocks assuming this microzap would be promoted to a * fatzap. 
* * 1 block overwritten : header block * 4 new blocks written : 2 new split leaf, 2 grown * ptrtbl blocks */ if (dmu_buf_freeable(zap->zap_dbuf)) *tooverwrite += SPA_MAXBLOCKSIZE; else *towrite += SPA_MAXBLOCKSIZE; if (add) { *towrite += 4 * SPA_MAXBLOCKSIZE; } } zap_unlockdir(zap); return (err); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 902c2342a718..221b1e335925 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1,5272 +1,5272 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" kmutex_t zfsdev_state_lock; list_t zfsdev_state_list; extern void zfs_init(void); extern void zfs_fini(void); typedef int zfs_ioc_func_t(zfs_cmd_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); typedef enum { NO_NAME, POOL_NAME, DATASET_NAME } zfs_ioc_namecheck_t; typedef enum { POOL_CHECK_NONE = 1 << 0, POOL_CHECK_SUSPENDED = 1 << 1, POOL_CHECK_READONLY = 1 << 2 } zfs_ioc_poolcheck_t; typedef struct zfs_ioc_vec { zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_his_log; zfs_ioc_poolcheck_t zvec_pool_check; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **); /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; char buf[512]; va_list adx; /* * Get rid of annoying "../common/" prefix to filename. 
*/ newfile = strrchr(file, '/'); if (newfile != NULL) { newfile = newfile + 1; /* Get rid of leading / */ } else { newfile = file; } va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); /* * To get this data, use the zfs-dprintf probe as so: * dtrace -q -n 'zfs-dprintf \ * /stringof(arg0) == "dbuf.c"/ \ * {printf("%s: %s", stringof(arg1), stringof(arg3))}' * arg0 = file name * arg1 = function name * arg2 = line number * arg3 = message */ DTRACE_PROBE4(zfs__dprintf, char *, newfile, char *, func, int, line, char *, buf); } static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); - buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP | KM_NODEBUG); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Check to see if the named dataset is currently defined as bootable */ static boolean_t zfs_is_bootfs(const char *name) { objset_t *os; if (dmu_objset_hold(name, FTAG, &os) == 0) { boolean_t ret; ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); dmu_objset_rele(os, FTAG); return (ret); } return (B_FALSE); } /* * zfs_earlier_version * * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * zpl_earlier_version * * Return TRUE if the ZPL version is less than requested version. 
*/ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf, LOG_CMD_NORMAL); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ /* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) { return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ /* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) { if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (ENOENT); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (ENOENT); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (EPERM); } else { /* * If we are in a local zone, the 'zoned' property must be set. 
*/ if (!zoned) return (EPERM); /* must be writable by this zone */ if (!writable) return (EPERM); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) return (ENOENT); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL)) { rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); return (ENOENT); } rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck(name, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error) error = dsl_deleg_access(name, perm, cr); } return (error); } int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. 
*/ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error) return (EPERM); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (EINVAL); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (EPERM); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (EPERM); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (EPERM); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). 
*/ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, setsl_tag, &os); if (error) return (EPERM); dmu_objset_disown(os, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (EPERM); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); #else return ENOTSUP; #endif /* HAVE_MLSLABEL */ } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { default: break; case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (EPERM); break; case ZFS_PROP_QUOTA: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[MAXNAMELEN]; /* * Unprivileged users are allowed to modify the * quota on things *under* (ie. contained by) * the thing they own. 
*/ if (dsl_prop_get_integer(dsname, "zoned", &zoned, setpoint)) return (EPERM); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (EPERM); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (EPERM); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } int zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) { int error; error = zfs_dozonecheck(zc->zc_name, cr); if (error) return (error); /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ return (0); } int zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } int zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) { spa_t *spa; dsl_pool_t *dp; dsl_dataset_t *ds; char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. 
*/ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (EINVAL); error = spa_open(zc->zc_name, &spa, FTAG); if (error) return (error); dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); rw_exit(&dp->dp_config_rwlock); spa_close(spa, FTAG); if (error) return (error); dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); return (error); } #ifdef HAVE_ZPL static int zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) { vnode_t *vp; int error; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ if (vp->v_vfsp->vfs_fstype != zfsfstype || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); return (EPERM); } VN_RELE(vp); return (dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_SHARE, cr)); } #endif /* HAVE_ZPL */ int zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) { #ifdef HAVE_ZPL if (!INGLOBALZONE(curproc)) return (EPERM); if (secpolicy_nfs(cr) == 0) { return (0); } else { return (zfs_secpolicy_deleg_share(zc, cr)); } #else return (ENOTSUP); #endif /* HAVE_ZPL */ } int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) { #ifdef HAVE_ZPL if (!INGLOBALZONE(curproc)) return (EPERM); if (secpolicy_smb(cr) == 0) { return (0); } else { return (zfs_secpolicy_deleg_share(zc, cr)); } #else return (ENOTSUP); #endif /* HAVE_ZPL */ } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. 
*/ (void) strncpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (ENOENT); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } static int zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) { return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendent mount and destroy permissions. * Reassemble the full filesystem@snap name so dsl_deleg_access() * can do the correct permission check. * * Since this routine is used when doing a recursive destroy of snapshots * and destroying snapshots requires descendent permissions, a successfull * check of the top level snapshot applies to snapshots of all descendent * datasets as well. 
*/ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr) { int error; char *dsname; dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); error = zfs_secpolicy_destroy_perms(dsname, cr); strfree(dsname); return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[MAXNAMELEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } static int zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) { return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } static int zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) { char parentname[MAXNAMELEN]; objset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error) return (error); error = dmu_objset_hold(zc->zc_name, FTAG, &clone); if (error == 0) { dsl_dataset_t *pclone = NULL; dsl_dir_t *dd; dd = clone->os_dsl_dataset->ds_dir; rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_origin_obj, FTAG, &pclone); rw_exit(&dd->dd_pool->dp_config_rwlock); if (error) { dmu_objset_rele(clone, FTAG); return (error); } error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(pclone, parentname); dmu_objset_rele(clone, FTAG); dsl_dataset_rele(pclone, FTAG); if (error == 0) error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_PROMOTE, cr); } return (error); } static int zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) { int error; if ((error = 
zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) { return (zfs_secpolicy_snapshot_perms(zc->zc_name, cr)); } static int zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) { char parentname[MAXNAMELEN]; int error; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (zc->zc_value[0] != '\0') { if ((error = zfs_secpolicy_write_perms(zc->zc_value, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); } if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr); return (error); } #ifdef HAVE_ZPL static int zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr) { int error; error = secpolicy_fs_unmount(cr, NULL); if (error) { error = dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr); } return (error); } #endif /* HAVE_ZPL */ /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ /* ARGSUSED */ static int zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (EPERM); return (0); } /* * Policy for object to name lookups. */ /* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr) { int error; if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. 
*/ /* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) { return (secpolicy_zinject(cr)); } static int zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) { zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(zc->zc_value)) return (EINVAL); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) { int err = zfs_secpolicy_read(zc, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else { if (groupmember(zc->zc_guid, cr)) return (0); } } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) { int err = zfs_secpolicy_read(zc, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) { return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } static int zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_HOLD, cr)); } static int zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RELEASE, cr)); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. 
* Delegated diff alone is sufficient that we allow this. */ int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr)) == 0) return (0); error = zfs_secpolicy_snapshot(zc, cr); if (!error) error = zfs_secpolicy_hold(zc, cr); if (!error) error = zfs_secpolicy_release(zc, cr); if (!error) error = zfs_secpolicy_destroy(zc, cr); return (error); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (EINVAL); - packed = kmem_alloc(size, KM_SLEEP); + packed = kmem_alloc(size, KM_SLEEP | KM_NODEBUG); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { kmem_free(packed, size); return (error); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { kmem_free(packed, size); return (error); } kmem_free(packed, size); *nvp = list; return (0); } static int fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) { size_t size; VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); if (size > zc->zc_nvlist_dst_size) { nvpair_t *more_errors; int n = 0; if (zc->zc_nvlist_dst_size < 1024) return (ENOMEM); VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0); more_errors = nvlist_prev_nvpair(*errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(*errors, more_errors); VERIFY(nvlist_remove_nvpair(*errors, pair) == 0); n++; VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); } while (size > zc->zc_nvlist_dst_size); VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0); VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0); ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); ASSERT(size <= zc->zc_nvlist_dst_size); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; VERIFY(nvlist_size(nvl, &size, 
NV_ENCODE_NATIVE) == 0); if (size > zc->zc_nvlist_dst_size) { error = ENOMEM; } else { - packed = kmem_alloc(size, KM_SLEEP); + packed = kmem_alloc(size, KM_SLEEP | KM_NODEBUG); VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, KM_SLEEP) == 0); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = EFAULT; kmem_free(packed, size); } zc->zc_nvlist_dst_size = size; return (error); } #ifdef HAVE_ZPL static int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (EINVAL); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); if (*zfvp) { VFS_HOLD((*zfvp)->z_vfs); } else { error = ESRCH; } mutex_exit(&os->os_user_ptr_lock); dmu_objset_rele(os, FTAG); return (error); } #endif /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_vfs will be NULL, and it will be opened as the owner. */ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { #ifdef HAVE_ZPL int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, zfvp); if (error == 0) { rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : RW_READER, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. 
*/ rrw_exit(&(*zfvp)->z_teardown_lock, tag); return (EBUSY); } } return (error); #else return ENOTSUP; #endif } static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { #ifdef HAVE_ZPL rrw_exit(&zfsvfs->z_teardown_lock, tag); if (zfsvfs->z_vfs) { VFS_RELE(zfsvfs->z_vfs); } else { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } #endif } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; char *buf; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; uint64_t version = SPA_VERSION; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) { error = EINVAL; goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) { nvlist_free(config); nvlist_free(props); return (error); } (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error) goto pool_props_bad; } buf = history_str_get(zc); error = spa_create(zc->zc_name, config, props, buf, zplprops); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) (void) spa_destroy(zc->zc_name); if (buf != NULL) history_str_free(buf); pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); if 
(error == 0) zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = EINVAL; else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; } if (error == 0) zvol_create_minors(zc->zc_name); nvlist_free(config); if (props) nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); if (error == 0) zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (EEXIST); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. 
*/ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (EINVAL); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { spa_close(spa, FTAG); return (EINVAL); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (EINVAL); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (ENOTSUP); } hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); kmem_free(hist_buf, size); return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { int error; if ((error = 
dsl_dsobj_to_dsname(zc->zc_name,zc->zc_obj,zc->zc_value))) return (error); return (0); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (EINVAL); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (EINVAL); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele(os, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config, **l2cache, **spares; uint_t nl2cache = 0, nspares = 0; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, &spares, &nspares); /* * A root pool with concatenated devices is not supported. * Thus, can not add a device to a root pool. * * Intent log device can not be added to a rootpool because * during mountroot, zil is replayed, a seperated log device * can not be accessed during the mountroot time. * * l2cache and spare devices are ok to be added to a rootpool. 
 */
	if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
		nvlist_free(config);
		spa_close(spa, FTAG);
		return (EDOM);
	}

	/* 'error' here is still the result of the get_nvlist() call. */
	if (error == 0) {
		error = spa_vdev_add(spa, config);
		nvlist_free(config);
	}
	spa_close(spa, FTAG);
	return (error);
}

/*
 * inputs:
 * zc_name		name of the pool
 * zc_guid		guid handed to spa_vdev_remove()
 *
 * NOTE(review): earlier documentation listed zc_nvlist_conf ("nvlist of
 * devices to remove") and zc_cookie ("to stop the remove?") as inputs,
 * but neither field is read by this function.
 */
static int
zfs_ioc_vdev_remove(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0)
		return (error);
	error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
	spa_close(spa, FTAG);
	return (error);
}

/*
 * inputs:
 * zc_name		name of the pool
 * zc_guid		guid of the vdev
 * zc_cookie		requested state (VDEV_STATE_*)
 * zc_obj		extra argument handed to the vdev_*() routine
 *
 * outputs:
 * zc_cookie		'newstate'; only vdev_online() is given a chance to
 *			change it from VDEV_STATE_UNKNOWN
 *
 * Unrecognized states yield EINVAL.
 */
static int
zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;
	vdev_state_t newstate = VDEV_STATE_UNKNOWN;

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
		return (error);
	switch (zc->zc_cookie) {
	case VDEV_STATE_ONLINE:
		error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
		break;

	case VDEV_STATE_OFFLINE:
		error = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
		break;

	case VDEV_STATE_FAULTED:
		/* Anything but the two accepted aux values is coerced. */
		if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
		    zc->zc_obj != VDEV_AUX_EXTERNAL)
			zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;

		error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
		break;

	case VDEV_STATE_DEGRADED:
		/* Same coercion as for VDEV_STATE_FAULTED. */
		if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
		    zc->zc_obj != VDEV_AUX_EXTERNAL)
			zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;

		error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
		break;

	default:
		error = EINVAL;
	}
	zc->zc_cookie = newstate;
	spa_close(spa, FTAG);
	return (error);
}

/*
 * inputs:
 * zc_name		name of the pool
 * zc_guid		guid handed to spa_vdev_attach()
 * zc_cookie		'replacing' flag handed to spa_vdev_attach()
 * zc_nvlist_conf{_size}	nvlist describing the new device
 */
static int
zfs_ioc_vdev_attach(zfs_cmd_t *zc)
{
	spa_t *spa;
	int replacing = zc->zc_cookie;
	nvlist_t *config;
	int error;

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
		return (error);

	if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &config)) == 0) {
		error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
		nvlist_free(config);
	}

	spa_close(spa, FTAG);
	return (error);
}

/*
 * inputs:
 * zc_name		name of the pool
 * zc_guid		guid handed to spa_vdev_detach()
 */
static int
zfs_ioc_vdev_detach(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;

	if ((error = spa_open(zc->zc_name,
&spa, FTAG)) != 0)
		return (error);

	error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);

	spa_close(spa, FTAG);
	return (error);
}

/*
 * inputs:
 * zc_name		name of the pool
 * zc_string		passed to spa_vdev_split_mirror() as its second argument
 * zc_nvlist_conf{_size}	config nvlist
 * zc_nvlist_src{_size}	optional property nvlist (read only if size != 0)
 * zc_cookie		ZPOOL_EXPORT_AFTER_SPLIT selects the 'exp' argument
 */
static int
zfs_ioc_vdev_split(zfs_cmd_t *zc)
{
	spa_t *spa;
	nvlist_t *config, *props = NULL;
	int error;
	boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
		return (error);

	if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
	    zc->zc_iflags, &config))) {
		spa_close(spa, FTAG);
		return (error);
	}

	if (zc->zc_nvlist_src_size != 0 && (error =
	    get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	    zc->zc_iflags, &props))) {
		spa_close(spa, FTAG);
		nvlist_free(config);
		return (error);
	}

	error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);

	spa_close(spa, FTAG);

	nvlist_free(config);
	/* 'props' is still NULL here when no property nvlist was supplied. */
	nvlist_free(props);

	return (error);
}

/*
 * inputs:
 * zc_name		name of the pool
 * zc_guid		guid of the vdev
 * zc_value		new path, handed to spa_vdev_setpath()
 */
static int
zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
{
	spa_t *spa;
	char *path = zc->zc_value;
	uint64_t guid = zc->zc_guid;
	int error;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0)
		return (error);

	error = spa_vdev_setpath(spa, guid, path);
	spa_close(spa, FTAG);
	return (error);
}

/*
 * inputs:
 * zc_name		name of the pool
 * zc_guid		guid of the vdev
 * zc_value		new FRU string, handed to spa_vdev_setfru()
 */
static int
zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
{
	spa_t *spa;
	char *fru = zc->zc_value;
	uint64_t guid = zc->zc_guid;
	int error;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error != 0)
		return (error);

	error = spa_vdev_setfru(spa, guid, fru);
	spa_close(spa, FTAG);
	return (error);
}

/*
 * Fills zc_objset_stats for 'os' and, when zc_nvlist_dst is non-zero,
 * gathers the property nvlist and hands it to put_nvlist().  The caller
 * supplies (and keeps) the hold on 'os'.
 */
static int
zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
{
	int error = 0;
	nvlist_t *nv;

	dmu_objset_fast_stat(os, &zc->zc_objset_stats);

	if (zc->zc_nvlist_dst != 0 &&
	    (error = dsl_prop_get_all(os, &nv)) == 0) {
		dmu_objset_stats(os, nv);
		/*
		 * NB: zvol_get_stats() will read the objset contents,
		 * which we aren't supposed to do with a
		 * DS_MODE_USER hold, because it could be
		 * inconsistent.  So this is a bit of a workaround...
* XXX reading without owning
		 */
		if (!zc->zc_objset_stats.dds_inconsistent) {
			if (dmu_objset_type(os) == DMU_OST_ZVOL)
				error = zvol_get_stats(os, nv);
		}
		/* The nvlist is only handed out if nothing failed so far. */
		if (error == 0)
			error = put_nvlist(zc, nv);
		nvlist_free(nv);
	}

	return (error);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_nvlist_dst_size	size of buffer for property nvlist
 *
 * outputs:
 * zc_objset_stats	stats
 * zc_nvlist_dst	property nvlist
 * zc_nvlist_dst_size	size of property nvlist
 */
static int
zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
	objset_t *os = NULL;
	int error;

	if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)))
		return (error);

	/* The hold taken above covers the whole _impl() call. */
	error = zfs_ioc_objset_stats_impl(zc, os);

	dmu_objset_rele(os, FTAG);

	return (error);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_nvlist_dst_size	size of buffer for property nvlist
 *
 * outputs:
 * zc_nvlist_dst	received property nvlist
 * zc_nvlist_dst_size	size of received property nvlist
 *
 * Gets received properties (distinct from local properties on or after
 * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
 * local property values.
 */
static int
zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
{
	objset_t *os = NULL;
	int error;
	nvlist_t *nv;

	if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)))
		return (error);

	/*
	 * Without this check, we would return local property values if the
	 * caller has not already received properties on or after
	 * SPA_VERSION_RECVD_PROPS.
	 */
	if (!dsl_prop_get_hasrecvd(os)) {
		dmu_objset_rele(os, FTAG);
		return (ENOTSUP);
	}

	/*
	 * When zc_nvlist_dst is zero nothing is fetched and 'error' keeps
	 * the zero from the successful hold above.
	 */
	if (zc->zc_nvlist_dst != 0 &&
	    (error = dsl_prop_get_received(os, &nv)) == 0) {
		error = put_nvlist(zc, nv);
		nvlist_free(nv);
	}

	dmu_objset_rele(os, FTAG);
	return (error);
}

/*
 * Looks up ZPL property 'prop' on 'os' and adds it to 'props' as a
 * uint64 under the property's name.  Returns the zfs_get_zplprop()
 * error, or 0 on success.
 */
static int
nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
{
	uint64_t value;
	int error;

	/*
	 * zfs_get_zplprop() will either find a value or give us
	 * the default value (if there is one).
 */
	if ((error = zfs_get_zplprop(os, prop, &value)) != 0)
		return (error);
	VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
	return (0);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_nvlist_dst_size	size of buffer for zpl property nvlist
 *
 * outputs:
 * zc_nvlist_dst	zpl property nvlist
 * zc_nvlist_dst_size	size of zpl property nvlist
 *
 * ENOENT is returned when no destination buffer was supplied, when the
 * dataset is flagged inconsistent, or when it is not a DMU_OST_ZFS objset.
 */
static int
zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
{
	objset_t *os;
	int err;

	/* XXX reading without owning */
	if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os)))
		return (err);

	dmu_objset_fast_stat(os, &zc->zc_objset_stats);

	/*
	 * NB: nvl_add_zplprop() will read the objset contents,
	 * which we aren't supposed to do with a DS_MODE_USER
	 * hold, because it could be inconsistent.
	 */
	if (zc->zc_nvlist_dst != 0 &&
	    !zc->zc_objset_stats.dds_inconsistent &&
	    dmu_objset_type(os) == DMU_OST_ZFS) {
		nvlist_t *nv;

		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		/* The chain stops at the first property that fails. */
		if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 &&
		    (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 &&
		    (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 &&
		    (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0)
			err = put_nvlist(zc, nv);
		nvlist_free(nv);
	} else {
		err = ENOENT;
	}
	dmu_objset_rele(os, FTAG);
	return (err);
}

/*
 * Returns B_TRUE when 'name' should be skipped by the dataset listing
 * code, B_FALSE otherwise.
 */
static boolean_t
dataset_name_hidden(const char *name)
{
	/*
	 * Skip over datasets that are not visible in this zone,
	 * internal datasets (which have a $ in their name), and
	 * temporary datasets (which have a % in their name).
 */
	if (strchr(name, '$') != NULL)
		return (B_TRUE);
	if (strchr(name, '%') != NULL)
		return (B_TRUE);
	if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL))
		return (B_TRUE);
	return (B_FALSE);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_cookie		zap cursor
 * zc_nvlist_dst_size	size of buffer for property nvlist
 *
 * outputs:
 * zc_name		name of next filesystem
 * zc_cookie		zap cursor
 * zc_objset_stats	stats
 * zc_nvlist_dst	property nvlist
 * zc_nvlist_dst_size	size of property nvlist
 *
 * ENOENT from the hold or from the directory walk is reported as ESRCH.
 */
static int
zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
{
	objset_t *os;
	int error;
	char *p;
	/* Length of the parent name, used to restore zc_name on retry. */
	size_t orig_len = strlen(zc->zc_name);

top:
	if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) {
		if (error == ENOENT)
			error = ESRCH;
		return (error);
	}

	/* Make sure zc_name ends in '/', then point p at the child slot. */
	p = strrchr(zc->zc_name, '/');
	if (p == NULL || p[1] != '\0')
		(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
	p = zc->zc_name + strlen(zc->zc_name);

	/*
	 * Pre-fetch the datasets.  dmu_objset_prefetch() always returns 0
	 * but is not declared void because it's called by dmu_objset_find().
	 */
	if (zc->zc_cookie == 0) {
		uint64_t cookie = 0;
		int len = sizeof (zc->zc_name) - (p - zc->zc_name);

		while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0)
			(void) dmu_objset_prefetch(p, NULL);
	}

	/*
	 * Advance until a visible child is found; hidden names are not
	 * skipped when FKIOCTL is set in zc_iflags.
	 */
	do {
		error = dmu_dir_list_next(os,
		    sizeof (zc->zc_name) - (p - zc->zc_name), p,
		    NULL, &zc->zc_cookie);
		if (error == ENOENT)
			error = ESRCH;
	} while (error == 0 && dataset_name_hidden(zc->zc_name) &&
	    !(zc->zc_iflags & FKIOCTL));
	dmu_objset_rele(os, FTAG);

	/*
	 * If it's an internal dataset (ie. with a '$' in its name),
	 * don't try to get stats for it, otherwise we'll return ENOENT.
	 */
	if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
		if (error == ENOENT) {
			/* We lost a race with destroy, get the next one.
 */
			/* Truncate zc_name back to the parent before retrying. */
			zc->zc_name[orig_len] = '\0';
			goto top;
		}
	}
	return (error);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_cookie		zap cursor
 * zc_nvlist_dst_size	size of buffer for property nvlist
 *
 * outputs:
 * zc_name		name of next snapshot
 * zc_objset_stats	stats
 * zc_nvlist_dst	property nvlist
 * zc_nvlist_dst_size	size of property nvlist
 *
 * zc_obj and zc_cookie are also updated by dmu_snapshot_list_next().
 * ENOENT from the hold or from the listing is reported as ESRCH.
 */
static int
zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
{
	objset_t *os;
	int error;

top:
	/* A zero cursor means a fresh walk: prefetch the snapshots first. */
	if (zc->zc_cookie == 0)
		(void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
		    NULL, DS_FIND_SNAPSHOTS);

	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (error)
		return (error == ENOENT ? ESRCH : error);

	/*
	 * A dataset name of maximum length cannot have any snapshots,
	 * so exit immediately.
	 */
	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
		dmu_objset_rele(os, FTAG);
		return (ESRCH);
	}

	/* The snapshot name is written directly after the '@' in zc_name. */
	error = dmu_snapshot_list_next(os,
	    sizeof (zc->zc_name) - strlen(zc->zc_name),
	    zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie,
	    NULL);

	if (error == 0) {
		dsl_dataset_t *ds;
		dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;

		/*
		 * Since we probably don't have a hold on this snapshot,
		 * it's possible that the objsetid could have been destroyed
		 * and reused for a new objset. It's OK if this happens during
		 * a zfs send operation, since the new createtxg will be
		 * beyond the range we're interested in.
		 */
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
		rw_exit(&dp->dp_config_rwlock);

		if (error) {
			if (error == ENOENT) {
				/* Racing with destroy, get the next one.
 */
				/* Strip the '@' and drop the hold before retrying. */
				*strchr(zc->zc_name, '@') = '\0';
				dmu_objset_rele(os, FTAG);
				goto top;
			}
		} else {
			objset_t *ossnap;

			error = dmu_objset_from_ds(ds, &ossnap);
			if (error == 0)
				error = zfs_ioc_objset_stats_impl(zc, ossnap);
			dsl_dataset_rele(ds, FTAG);
		}
	} else if (error == ENOENT) {
		error = ESRCH;
	}

	dmu_objset_rele(os, FTAG);
	/* if we failed, undo the @ that we tacked on to zc_name */
	if (error)
		*strchr(zc->zc_name, '@') = '\0';
	return (error);
}

/*
 * Applies a userquota-style property to 'dsname'.  The value must be a
 * uint64 array of exactly three elements, read as { type, rid, quota };
 * the domain is the part of the property name after the first '-'.
 * Without HAVE_ZPL this always returns ENOTSUP.
 */
static int
zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
{
#ifdef HAVE_ZPL
	const char *propname = nvpair_name(pair);
	uint64_t *valary;
	unsigned int vallen;
	const char *domain;
	char *dash;
	zfs_userquota_prop_t type;
	uint64_t rid;
	uint64_t quota;
	zfsvfs_t *zfsvfs;
	int err;

	/* An nvlist-typed pair wraps the real value under ZPROP_VALUE. */
	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
		nvlist_t *attrs;
		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
		if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
		    &pair) != 0)
			return (EINVAL);
	}

	/*
	 * A correctly constructed propname is encoded as
	 * userquota@<rid>-<domain>.
	 */
	if ((dash = strchr(propname, '-')) == NULL ||
	    nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
	    vallen != 3)
		return (EINVAL);

	domain = dash + 1;
	type = valary[0];
	rid = valary[1];
	quota = valary[2];

	err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
	if (err == 0) {
		err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
		zfsvfs_rele(zfsvfs, FTAG);
	}

	return (err);
#else
	return ENOTSUP;
#endif
}

/*
 * If the named property is one that has a special function to set its value,
 * return 0 on success and a positive error code on failure; otherwise if it is
 * not one of the special properties handled by this function, return -1.
 *
 * XXX: It would be better for callers of the property interface if we handled
 * these special cases in dsl_prop.c (in the dsl layer).
*/ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; int err; if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) return (-1); VERIFY(0 == nvpair_value_uint64(pair, &intval)); switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_quota(dsname, source, intval); break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_reservation(dsname, source, intval); break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; #ifdef HAVE_ZPL err = zfs_set_version(zfsvfs, intval); #endif zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dsname); (void) zfs_ioc_userspace_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the first error * encountered. If the caller provides a non-NULL errlist, it also gives the * complete list of names of all the properties it failed to set along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. 
 *
 * If every property is set successfully, zero is returned and the list pointed
 * at by errlist is NULL.
 */
int
zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
    nvlist_t **errlist)
{
	nvpair_t *pair;
	nvpair_t *propval;
	int rv = 0;
	uint64_t intval;
	char *strval;
	nvlist_t *genericnvl;	/* properties left for dsl_props_set() */
	nvlist_t *errors;	/* property name -> errno */
	nvlist_t *retrynvl;	/* special properties to try a second time */

	VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);

retry:
	pair = NULL;
	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
		const char *propname = nvpair_name(pair);
		zfs_prop_t prop = zfs_name_to_prop(propname);
		int err = 0;

		/* decode the property value */
		propval = pair;
		if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
			nvlist_t *attrs;
			VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
			if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
			    &propval) != 0)
				err = EINVAL;
		}

		/* Validate value type */
		if (err == 0 && prop == ZPROP_INVAL) {
			/* Not a native property: user property or userquota. */
			if (zfs_prop_user(propname)) {
				if (nvpair_type(propval) != DATA_TYPE_STRING)
					err = EINVAL;
			} else if (zfs_prop_userquota(propname)) {
				if (nvpair_type(propval) !=
				    DATA_TYPE_UINT64_ARRAY)
					err = EINVAL;
			}
		} else if (err == 0) {
			/* Native property: value type must fit the prop type. */
			if (nvpair_type(propval) == DATA_TYPE_STRING) {
				if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
					err = EINVAL;
			} else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
				const char *unused;

				VERIFY(nvpair_value_uint64(propval,
				    &intval) == 0);

				switch (zfs_prop_get_type(prop)) {
				case PROP_TYPE_NUMBER:
					break;
				case PROP_TYPE_STRING:
					err = EINVAL;
					break;
				case PROP_TYPE_INDEX:
					/* The index must map to a known string. */
					if (zfs_prop_index_to_string(prop,
					    intval, &unused) != 0)
						err = EINVAL;
					break;
				default:
					cmn_err(CE_PANIC,
					    "unknown property type");
				}
			} else {
				err = EINVAL;
			}
		}

		/* Validate permissions */
		if (err == 0)
			err = zfs_check_settable(dsname, pair, CRED());

		if (err == 0) {
			err = zfs_prop_set_special(dsname, source, pair);
			if (err == -1) {
				/*
				 * For better performance we build up a list of
				 * 
properties to set in a single transaction.
				 */
				err = nvlist_add_nvpair(genericnvl, pair);
			} else if (err != 0 && nvl != retrynvl) {
				/*
				 * This may be a spurious error caused by
				 * receiving quota and reservation out of order.
				 * Try again in a second pass.
				 */
				err = nvlist_add_nvpair(retrynvl, pair);
			}
		}

		/*
		 * On the first pass a failed special property that was
		 * queued for retry leaves err == 0 here, so it is only
		 * recorded if the retry pass fails as well.
		 */
		if (err != 0)
			VERIFY(nvlist_add_int32(errors, propname, err) == 0);
	}

	/* Run the loop once more over the queued retries, if any. */
	if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
		nvl = retrynvl;
		goto retry;
	}

	if (!nvlist_empty(genericnvl) &&
	    dsl_props_set(dsname, source, genericnvl) != 0) {
		/*
		 * If this fails, we still want to set as many properties as we
		 * can, so try setting them individually.
		 */
		pair = NULL;
		while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
			const char *propname = nvpair_name(pair);
			int err = 0;

			propval = pair;
			if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
				nvlist_t *attrs;
				VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
				VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
				    &propval) == 0);
			}

			if (nvpair_type(propval) == DATA_TYPE_STRING) {
				VERIFY(nvpair_value_string(propval,
				    &strval) == 0);
				/* Length passed includes the terminating NUL. */
				err = dsl_prop_set(dsname, propname, source, 1,
				    strlen(strval) + 1, strval);
			} else {
				VERIFY(nvpair_value_uint64(propval,
				    &intval) == 0);
				err = dsl_prop_set(dsname, propname, source, 8,
				    1, &intval);
			}

			if (err != 0) {
				VERIFY(nvlist_add_int32(errors, propname,
				    err) == 0);
			}
		}
	}
	nvlist_free(genericnvl);
	nvlist_free(retrynvl);

	/* rv becomes the errno stored in the first entry of 'errors'. */
	if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
		nvlist_free(errors);
		errors = NULL;
	} else {
		VERIFY(nvpair_value_int32(pair, &rv) == 0);
	}

	/* Ownership of 'errors' passes to the caller when errlist is given. */
	if (errlist == NULL)
		nvlist_free(errors);
	else
		*errlist = errors;

	return (rv);
}

/*
 * Check that all the properties are valid user properties.
 */
static int
zfs_check_userprops(char *fsname, nvlist_t *nvl)
{
	nvpair_t *pair = NULL;
	int error = 0;

	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
		const char *propname = nvpair_name(pair);
		char *valstr;

		/* Every entry must be a string-valued user property. */
		if (!zfs_prop_user(propname) ||
		    nvpair_type(pair) != DATA_TYPE_STRING)
			return (EINVAL);

		/* The permission check is repeated for each property. */
		if ((error = zfs_secpolicy_write_perms(fsname,
		    ZFS_DELEG_PERM_USERPROP, CRED())))
			return (error);

		if (strlen(propname) >= ZAP_MAXNAMELEN)
			return (ENAMETOOLONG);

		VERIFY(nvpair_value_string(pair, &valstr) == 0);
		if (strlen(valstr) >= ZAP_MAXVALUELEN)
			return (E2BIG);
	}
	return (0);
}

/*
 * Allocates *newprops and fills it with every pair of 'props' whose name
 * does not exist in 'skipped'.  The caller frees *newprops.
 */
static void
props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
{
	nvpair_t *pair;

	VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	pair = NULL;
	while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
		if (nvlist_exists(skipped, nvpair_name(pair)))
			continue;

		VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
	}
}

/*
 * Passes the pairs of 'props' that are not named in 'skipped' to
 * zfs_set_prop_nvlist() for dataset 'fs' and returns its result (0 when
 * there is nothing to pass).
 */
static int
clear_received_props(objset_t *os, const char *fs, nvlist_t *props,
    nvlist_t *skipped)
{
	int err = 0;
	nvlist_t *cleared_props = NULL;
	props_skip(props, skipped, &cleared_props);
	if (!nvlist_empty(cleared_props)) {
		/*
		 * Acts on local properties until the dataset has received
		 * properties at least once on or after SPA_VERSION_RECVD_PROPS.
		 */
		zprop_source_t flags = (ZPROP_SRC_NONE |
		    (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0));
		err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL);
	}
	nvlist_free(cleared_props);
	return (err);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_value		name of property to set
 * zc_nvlist_src{_size}	nvlist of properties to apply
 * zc_cookie		received properties flag
 *
 * outputs:
 * zc_nvlist_dst{_size} error for each unapplied received property
 */
static int
zfs_ioc_set_prop(zfs_cmd_t *zc)
{
	nvlist_t *nvl;
	boolean_t received = zc->zc_cookie;
	zprop_source_t source = (received ?
ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL);
	nvlist_t *errors = NULL;
	int error;

	if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	    zc->zc_iflags, &nvl)) != 0)
		return (error);

	/*
	 * For a receive, first clear the previously received properties
	 * that are absent from the incoming list (failures are ignored),
	 * then mark the dataset as having received properties.
	 */
	if (received) {
		nvlist_t *origprops;
		objset_t *os;

		if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) {
			if (dsl_prop_get_received(os, &origprops) == 0) {
				(void) clear_received_props(os,
				    zc->zc_name, origprops, nvl);
				nvlist_free(origprops);
			}

			dsl_prop_set_hasrecvd(os);
			dmu_objset_rele(os, FTAG);
		}
	}

	error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors);

	/* The per-property error list is copied out on a best-effort basis. */
	if (zc->zc_nvlist_dst != 0 && errors != NULL) {
		(void) put_nvlist(zc, errors);
	}

	nvlist_free(errors);
	nvlist_free(nvl);
	return (error);
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_value		name of property to inherit
 * zc_cookie		revert to received value if TRUE
 *
 * outputs:		none
 */
static int
zfs_ioc_inherit_prop(zfs_cmd_t *zc)
{
	const char *propname = zc->zc_value;
	zfs_prop_t prop = zfs_name_to_prop(propname);
	boolean_t received = zc->zc_cookie;
	zprop_source_t source = (received
	    ? ZPROP_SRC_NONE		/* revert to received value, if any */
	    : ZPROP_SRC_INHERITED);	/* explicitly inherit */

	if (received) {
		nvlist_t *dummy;
		nvpair_t *pair;
		zprop_type_t type;
		int err;

		/*
		 * zfs_prop_set_special() expects properties in the form of an
		 * nvpair with type info.
 */
		if (prop == ZPROP_INVAL) {
			if (!zfs_prop_user(propname))
				return (EINVAL);

			type = PROP_TYPE_STRING;
		} else if (prop == ZFS_PROP_VOLSIZE ||
		    prop == ZFS_PROP_VERSION) {
			/* These two cannot be reverted to a received value. */
			return (EINVAL);
		} else {
			type = zfs_prop_get_type(prop);
		}

		VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		/* Build a placeholder pair whose value type matches 'type'. */
		switch (type) {
		case PROP_TYPE_STRING:
			VERIFY(0 == nvlist_add_string(dummy, propname, ""));
			break;
		case PROP_TYPE_NUMBER:
		case PROP_TYPE_INDEX:
			VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
			break;
		default:
			nvlist_free(dummy);
			return (EINVAL);
		}

		pair = nvlist_next_nvpair(dummy, NULL);
		err = zfs_prop_set_special(zc->zc_name, source, pair);
		nvlist_free(dummy);
		if (err != -1)
			return (err); /* special property already handled */
	} else {
		/*
		 * Only check this in the non-received case. We want to allow
		 * 'inherit -S' to revert non-inheritable properties like quota
		 * and reservation to the received or default values even though
		 * they are not considered inheritable.
		 */
		if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
			return (EINVAL);
	}

	/* the property name has been validated by zfs_secpolicy_inherit() */
	return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL));
}

/*
 * inputs:
 * zc_name		name of the pool
 * zc_nvlist_src{_size}	nvlist of pool properties to set
 */
static int
zfs_ioc_pool_set_props(zfs_cmd_t *zc)
{
	nvlist_t *props;
	spa_t *spa;
	int error;
	nvpair_t *pair;

	if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	    zc->zc_iflags, &props)))
		return (error);

	/*
	 * If the only property is the configfile, then just do a spa_lookup()
	 * to handle the faulted case.
 */
	pair = nvlist_next_nvpair(props, NULL);
	if (pair != NULL && strcmp(nvpair_name(pair),
	    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
	    nvlist_next_nvpair(props, pair) == NULL) {
		mutex_enter(&spa_namespace_lock);
		if ((spa = spa_lookup(zc->zc_name)) != NULL) {
			spa_configfile_set(spa, props, B_FALSE);
			spa_config_sync(spa, B_FALSE, B_TRUE);
		}
		mutex_exit(&spa_namespace_lock);
		/* Done if the lookup found the pool; otherwise fall through. */
		if (spa != NULL) {
			nvlist_free(props);
			return (0);
		}
	}

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
		nvlist_free(props);
		return (error);
	}

	error = spa_prop_set(spa, props);

	nvlist_free(props);
	spa_close(spa, FTAG);

	return (error);
}

/*
 * inputs:
 * zc_name		name of the pool
 *
 * outputs:
 * zc_nvlist_dst	pool property nvlist, via put_nvlist()
 *
 * NOTE(review): whenever the lookup fails, or zc_nvlist_dst is zero, the
 * final 'else' replaces the earlier error with EFAULT -- confirm that
 * discarding the original errno is intended.
 */
static int
zfs_ioc_pool_get_props(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;
	nvlist_t *nvp = NULL;

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
		/*
		 * If the pool is faulted, there may be properties we can still
		 * get (such as altroot and cachefile), so attempt to get them
		 * anyway.
		 */
		mutex_enter(&spa_namespace_lock);
		if ((spa = spa_lookup(zc->zc_name)) != NULL)
			error = spa_prop_get(spa, &nvp);
		mutex_exit(&spa_namespace_lock);
	} else {
		error = spa_prop_get(spa, &nvp);
		spa_close(spa, FTAG);
	}

	if (error == 0 && zc->zc_nvlist_dst != 0)
		error = put_nvlist(zc, nvp);
	else
		error = EFAULT;

	/* nvp is still NULL here if no property list was obtained. */
	nvlist_free(nvp);
	return (error);
}

/*
 * inputs:
 * zc_name		name of volume
 *
 * outputs:		none
 */
static int
zfs_ioc_create_minor(zfs_cmd_t *zc)
{
	return (zvol_create_minor(zc->zc_name));
}

/*
 * inputs:
 * zc_name		name of volume
 *
 * outputs:		none
 */
static int
zfs_ioc_remove_minor(zfs_cmd_t *zc)
{
	return (zvol_remove_minor(zc->zc_name));
}

/*
 * inputs:
 * zc_name		name of filesystem
 * zc_nvlist_src{_size}	nvlist of delegated permissions
 * zc_perm_action	allow/unallow flag
 *
 * outputs:		none
 */
static int
zfs_ioc_set_fsacl(zfs_cmd_t *zc)
{
	int error;
	nvlist_t *fsaclnv = NULL;

	if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	    zc->zc_iflags, &fsaclnv)) != 0)
		return (error);

	/*
	 * Verify nvlist is constructed correctly
	 */
	if ((error =
zfs_deleg_verify_nvlist(fsaclnv)) != 0) {
		/* Any verification failure is reported as EINVAL. */
		nvlist_free(fsaclnv);
		return (EINVAL);
	}

	/*
	 * If we don't have PRIV_SYS_MOUNT, then validate
	 * that user is allowed to hand out each permission in
	 * the nvlist(s)
	 */

	error = secpolicy_zfs(CRED());
	if (error) {
		/* zc_perm_action == B_FALSE selects the "allow" check. */
		if (zc->zc_perm_action == B_FALSE) {
			error = dsl_deleg_can_allow(zc->zc_name,
			    fsaclnv, CRED());
		} else {
			error = dsl_deleg_can_unallow(zc->zc_name,
			    fsaclnv, CRED());
		}
	}

	if (error == 0)
		error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action);

	nvlist_free(fsaclnv);
	return (error);
}

/*
 * inputs:
 * zc_name		name of filesystem
 *
 * outputs:
 * zc_nvlist_src{_size}	nvlist of delegated permissions
 *
 * NOTE(review): the result is handed to put_nvlist(), whose output other
 * handlers in this file document as zc_nvlist_dst -- confirm which field
 * the line above should name.
 */
static int
zfs_ioc_get_fsacl(zfs_cmd_t *zc)
{
	nvlist_t *nvp;
	int error;

	if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) {
		error = put_nvlist(zc, nvp);
		nvlist_free(nvp);
	}

	return (error);
}

#ifdef HAVE_ZPL
/*
 * Search the vfs list for a specified resource.  Returns a pointer to it
 * or NULL if no suitable entry is found. The caller of this routine
 * is responsible for releasing the returned vfs pointer.
 */
static vfs_t *
zfs_get_vfs(const char *resource)
{
	struct vfs *vfsp;
	struct vfs *vfs_found = NULL;

	vfs_list_read_lock();
	vfsp = rootvfs;
	/* Walk the list from rootvfs until it comes back around. */
	do {
		if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) {
			VFS_HOLD(vfsp);
			vfs_found = vfsp;
			break;
		}
		vfsp = vfsp->vfs_next;
	} while (vfsp != rootvfs);
	vfs_list_unlock();
	return (vfs_found);
}
#endif /* HAVE_ZPL */

/*
 * Objset-creation callback: forwards to zfs_create_fs() with the
 * zplprops carried in the zfs_creat_t passed as 'arg'.
 */
/* ARGSUSED */
static void
zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	zfs_creat_t *zct = arg;

	zfs_create_fs(os, cr, zct->zct_zplprops, tx);
}

/* Sentinel meaning "the creator did not request a value". */
#define	ZFS_PROP_UNDEFINED	((uint64_t)-1)

/*
 * inputs:
 * createprops		list of properties requested by creator
 * default_zplver	zpl version to use if unspecified in createprops
 * fuids_ok		fuids allowed in this version of the spa?
 * os			parent objset pointer (NULL if root fs)
 * sa_ok		whether ZPL_VERSION_SA or later may be requested
 *
 * outputs:
 * zplprops	values for the zplprops we attach to the master node object
 * is_ci	true if requested file system will be purely case-insensitive
 *
 * Determine the settings for utf8only, normalization and
 * casesensitivity.  Specific values may have been requested by the
 * creator and/or we can inherit values from the parent dataset.  If
 * the file system is of too early a vintage, a creator can not
 * request settings for these properties, even if the requested
 * setting is the default value.  We don't actually want to create dsl
 * properties for these, so remove them from the source nvlist after
 * processing.
 */
static int
zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
    boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
    nvlist_t *zplprops, boolean_t *is_ci)
{
	uint64_t sense = ZFS_PROP_UNDEFINED;
	uint64_t norm = ZFS_PROP_UNDEFINED;
	uint64_t u8 = ZFS_PROP_UNDEFINED;

	ASSERT(zplprops != NULL);

	/*
	 * Pull out creator prop choices, if any.
	 */
	if (createprops) {
		/* The version is looked up but left in createprops. */
		(void) nvlist_lookup_uint64(createprops,
		    zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
		(void) nvlist_lookup_uint64(createprops,
		    zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
		(void) nvlist_remove_all(createprops,
		    zfs_prop_to_name(ZFS_PROP_NORMALIZE));
		(void) nvlist_lookup_uint64(createprops,
		    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
		(void) nvlist_remove_all(createprops,
		    zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
		(void) nvlist_lookup_uint64(createprops,
		    zfs_prop_to_name(ZFS_PROP_CASE), &sense);
		(void) nvlist_remove_all(createprops,
		    zfs_prop_to_name(ZFS_PROP_CASE));
	}

	/*
	 * If the zpl version requested is whacky or the file system
	 * or pool version is too "young" to support normalization
	 * and the creator tried to set a value for one of the props,
	 * error out.
 */
	if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
	    (zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
	    (zplver >= ZPL_VERSION_SA && !sa_ok) ||
	    (zplver < ZPL_VERSION_NORMALIZATION &&
	    (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
	    sense != ZFS_PROP_UNDEFINED)))
		return (ENOTSUP);

	/*
	 * Put the version in the zplprops
	 */
	VERIFY(nvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);

	/* Values the creator left unset are fetched via zfs_get_zplprop(). */
	if (norm == ZFS_PROP_UNDEFINED)
		VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);

	/*
	 * If we're normalizing, names must always be valid UTF-8 strings.
	 */
	if (norm)
		u8 = 1;
	if (u8 == ZFS_PROP_UNDEFINED)
		VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);

	if (sense == ZFS_PROP_UNDEFINED)
		VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
	VERIFY(nvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);

	if (is_ci)
		*is_ci = (sense == ZFS_CASE_INSENSITIVE);

	return (0);
}

/*
 * Fills 'zplprops' for a new dataset named 'dataset'.  The zpl version
 * is derived from the pool version with zfs_zpl_version_map(), and the
 * parent dataset (the name up to the last '/') is held while
 * zfs_fill_zplprops_impl() runs.
 */
static int
zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
    nvlist_t *zplprops, boolean_t *is_ci)
{
	boolean_t fuids_ok, sa_ok;
	uint64_t zplver = ZPL_VERSION;
	objset_t *os = NULL;
	char parentname[MAXNAMELEN];
	char *cp;
	spa_t *spa;
	uint64_t spa_vers;
	int error;

	/* Derive the parent's name by cutting at the last '/'. */
	(void) strlcpy(parentname, dataset, sizeof (parentname));
	cp = strrchr(parentname, '/');
	ASSERT(cp != NULL);
	cp[0] = '\0';

	if ((error = spa_open(dataset, &spa, FTAG)) != 0)
		return (error);

	spa_vers = spa_version(spa);
	spa_close(spa, FTAG);

	zplver = zfs_zpl_version_map(spa_vers);
	fuids_ok = (zplver >= ZPL_VERSION_FUID);
	sa_ok = (zplver >= ZPL_VERSION_SA);

	/*
	 * Open parent object set so we can inherit zplprop values.
 */
	if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
		return (error);

	error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
	    zplprops, is_ci);
	dmu_objset_rele(os, FTAG);
	return (error);
}

/*
 * Variant of zfs_fill_zplprops() that takes the pool version directly
 * and passes a NULL parent objset to zfs_fill_zplprops_impl().
 */
static int
zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
    nvlist_t *zplprops, boolean_t *is_ci)
{
	boolean_t fuids_ok;
	boolean_t sa_ok;
	uint64_t zplver = ZPL_VERSION;
	int error;

	zplver = zfs_zpl_version_map(spa_vers);
	fuids_ok = (zplver >= ZPL_VERSION_FUID);
	sa_ok = (zplver >= ZPL_VERSION_SA);

	error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
	    createprops, zplprops, is_ci);
	return (error);
}

/*
 * inputs:
 * zc_objset_type	type of objset to create (fs vs zvol)
 * zc_name		name of new objset
 * zc_value		name of snapshot to clone from (may be empty)
 * zc_nvlist_src{_size}	nvlist of properties to apply
 *
 * outputs: none
 */
static int
zfs_ioc_create(zfs_cmd_t *zc)
{
	objset_t *clone;
	int error = 0;
	zfs_creat_t zct;
	nvlist_t *nvprops = NULL;
	void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
	dmu_objset_type_t type = zc->zc_objset_type;

	/* Pick the creation callback; unknown types leave it NULL. */
	switch (type) {

	case DMU_OST_ZFS:
		cbfunc = zfs_create_cb;
		break;

	case DMU_OST_ZVOL:
		cbfunc = zvol_create_cb;
		break;

	default:
		cbfunc = NULL;
		break;
	}
	/* Names containing '@' or '%' are rejected. */
	if (strchr(zc->zc_name, '@') ||
	    strchr(zc->zc_name, '%'))
		return (EINVAL);

	if (zc->zc_nvlist_src != 0 &&
	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	    zc->zc_iflags, &nvprops)) != 0)
		return (error);

	zct.zct_zplprops = NULL;
	zct.zct_props = nvprops;

	if (zc->zc_value[0] != '\0') {
		/*
		 * We're creating a clone of an existing snapshot.
*/ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { nvlist_free(nvprops); return (EINVAL); } error = dmu_objset_hold(zc->zc_value, FTAG, &clone); if (error) { nvlist_free(nvprops); return (error); } error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0); dmu_objset_rele(clone, FTAG); if (error) { nvlist_free(nvprops); return (error); } } else { boolean_t is_insensitive = B_FALSE; if (cbfunc == NULL) { nvlist_free(nvprops); return (EINVAL); } if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL || nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) { nvlist_free(nvprops); return (EINVAL); } if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) { nvlist_free(nvprops); return (EINVAL); } if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize( volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) { nvlist_free(nvprops); return (error); } } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(zc->zc_name, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(nvprops); nvlist_free(zct.zct_zplprops); return (error); } } error = dmu_objset_create(zc->zc_name, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); nvlist_free(zct.zct_zplprops); } /* * It would be nice to do this atomically. 
*/ if (error == 0) { error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, nvprops, NULL); if (error != 0) (void) dmu_objset_destroy(zc->zc_name, B_FALSE); } nvlist_free(nvprops); return (error); } /* * inputs: * zc_name name of filesystem * zc_value short name of snapshot * zc_cookie recursive flag * zc_nvlist_src[_size] property list * * outputs: * zc_value short snapname (i.e. part after the '@') */ static int zfs_ioc_snapshot(zfs_cmd_t *zc) { nvlist_t *nvprops = NULL; int error; boolean_t recursive = zc->zc_cookie; if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) return (EINVAL); if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvprops)) != 0) return (error); error = zfs_check_userprops(zc->zc_name, nvprops); if (error) goto out; if (!nvlist_empty(nvprops) && zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) { error = ENOTSUP; goto out; } error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, NULL, nvprops, recursive, B_FALSE, -1); out: nvlist_free(nvprops); return (error); } int zfs_unmount_snap(const char *name, void *arg) { #ifdef HAVE_ZPL vfs_t *vfsp = NULL; if (arg) { char *snapname = arg; char *fullname = kmem_asprintf("%s@%s", name, snapname); vfsp = zfs_get_vfs(fullname); strfree(fullname); } else if (strchr(name, '@')) { vfsp = zfs_get_vfs(name); } if (vfsp) { /* * Always force the unmount for snapshots. 
*/ int flag = MS_FORCE; int err; if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) { VFS_RELE(vfsp); return (err); } VFS_RELE(vfsp); if ((err = dounmount(vfsp, flag, kcred)) != 0) return (err); } #endif /* HAVE_ZPL */ return (0); } /* * inputs: * zc_name name of filesystem * zc_value short name of snapshot * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy_snaps(zfs_cmd_t *zc) { int err; if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) return (EINVAL); err = dmu_objset_find(zc->zc_name, zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); if (err) return (err); return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value, zc->zc_defer_destroy)); } /* * inputs: * zc_name name of dataset to destroy * zc_objset_type type of objset * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { int err; if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { err = zfs_unmount_snap(zc->zc_name, NULL); if (err) return (err); } err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) (void) zvol_remove_minor(zc->zc_name); return (err); } /* * inputs: * zc_name name of dataset to rollback (to most recent snapshot) * * outputs: none */ static int zfs_ioc_rollback(zfs_cmd_t *zc) { #ifdef HAVE_ZPL dsl_dataset_t *ds, *clone; int error; zfsvfs_t *zfsvfs; char *clone_name; error = dsl_dataset_hold(zc->zc_name, FTAG, &ds); if (error) return (error); /* must not be a snapshot */ if (dsl_dataset_is_snapshot(ds)) { dsl_dataset_rele(ds, FTAG); return (EINVAL); } /* must have a most recent snapshot */ if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { dsl_dataset_rele(ds, FTAG); return (EINVAL); } /* * Create clone of most recent snapshot. 
*/ clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name); error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT); if (error) goto out; error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone); if (error) goto out; /* * Do clone swap. */ if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { error = dsl_dataset_clone_swap(clone, ds, B_TRUE); dsl_dataset_disown(ds, FTAG); ds = NULL; } else { error = EBUSY; } resume_err = zfs_resume_fs(zfsvfs, zc->zc_name); error = error ? error : resume_err; } VFS_RELE(zfsvfs->z_vfs); } else { if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { error = dsl_dataset_clone_swap(clone, ds, B_TRUE); dsl_dataset_disown(ds, FTAG); ds = NULL; } else { error = EBUSY; } } /* * Destroy clone (which also closes it). */ (void) dsl_dataset_destroy(clone, FTAG, B_FALSE); out: strfree(clone_name); if (ds) dsl_dataset_rele(ds, FTAG); return (error); #else return (ENOTSUP); #endif /* HAVE_ZPL */ } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { boolean_t recursive = zc->zc_cookie & 1; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '%')) return (EINVAL); /* * Unmount snapshot unless we're doing a recursive rename, * in which case the dataset code figures out which snapshots * to unmount. 
*/ if (!recursive && strchr(zc->zc_name, '@') != NULL && zc->zc_objset_type == DMU_OST_ZFS) { int err = zfs_unmount_snap(zc->zc_name, NULL); if (err) return (err); } if (zc->zc_objset_type == DMU_OST_ZVOL) (void) zvol_remove_minor(zc->zc_name); return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; int err; if (prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else { /* USERUSED and GROUPUSED are read-only */ return (EINVAL); } if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) return (err); return (0); } return (EINVAL); } if (issnap) return (EINVAL); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. 
*/ if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval >= ZIO_COMPRESS_GZIP_1 && intval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (ENOTSUP); } if (intval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (ENOTSUP); /* * If this is a bootable dataset then * verify that the compression algorithm * is supported for booting. We must return * something other than ENOTSUP since it * implies a downrev pool version. */ if (zfs_is_bootfs(dsname) && !BOOTFS_COMPRESS_VALID(intval)) { return (ERANGE); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (ENOTSUP); break; case ZFS_PROP_DEDUP: if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (ENOTSUP); break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (ENOTSUP); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (ENOTSUP); } break; default: break; } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. 
*/ static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dataset); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strcpy(zc->zc_value, nvpair_name(pair)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit(zc, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). 
Remove them from origprops as well, since we
 * do not need to clear or restore properties that won't change.
 */
static void
props_reduce(nvlist_t *props, nvlist_t *origprops)
{
	nvpair_t *pair, *next_pair;

	if (origprops == NULL)
		return; /* all props need to be received */

	pair = nvlist_next_nvpair(props, NULL);
	while (pair != NULL) {
		const char *propname = nvpair_name(pair);
		nvpair_t *match;

		/* Fetch the successor first: 'pair' may be removed below. */
		next_pair = nvlist_next_nvpair(props, pair);

		if ((nvlist_lookup_nvpair(origprops, propname,
		    &match) != 0) || !propval_equals(pair, match))
			goto next; /* need to set received value */

		/* don't clear the existing received value */
		(void) nvlist_remove_nvpair(origprops, match);
		/* don't bother receiving the property */
		(void) nvlist_remove_nvpair(props, pair);
next:
		pair = next_pair;
	}
}

#ifdef	DEBUG
/* When set, zfs_ioc_recv() clears it and fails that receive with error 1. */
static boolean_t zfs_ioc_recv_inject_err;
#endif

/*
 * inputs:
 * zc_name		name of containing filesystem
 * zc_nvlist_src{_size}	nvlist of properties to apply
 * zc_value		name of snapshot to create
 * zc_string		name of clone origin (if DRR_FLAG_CLONE)
 * zc_cookie		file descriptor to recv from
 * zc_begin_record	the BEGIN record of the stream (not byteswapped)
 * zc_guid		force flag
 * zc_cleanup_fd	cleanup-on-exit file descriptor
 * zc_action_handle	handle for this guid/ds mapping (or zero on first call)
 *
 * outputs:
 * zc_cookie		number of bytes read
 * zc_nvlist_dst{_size} error for each unapplied received property
 * zc_obj		zprop_errflags_t
 * zc_action_handle	handle for this guid/ds mapping
 */
static int
zfs_ioc_recv(zfs_cmd_t *zc)
{
	file_t *fp;
	objset_t *os;
	dmu_recv_cookie_t drc;
	boolean_t force = (boolean_t)zc->zc_guid;
	int fd;
	int error = 0;
	int props_error = 0;
	nvlist_t *errors;
	offset_t off;
	nvlist_t *props = NULL; /* sent properties */
	nvlist_t *origprops = NULL; /* existing properties */
	objset_t *origin = NULL;
	char *tosnap;
	char tofs[ZFS_MAXNAMELEN];
	boolean_t first_recvd_props = B_FALSE;

	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
	    strchr(zc->zc_value, '@') == NULL ||
	    strchr(zc->zc_value, '%'))
		return (EINVAL);

	/*
	 * Split "fs@snap" into tofs and tosnap.  The '@' is known to be
	 * present from the check above.
	 * NOTE(review): this strcpy() relies on dataset_namecheck() having
	 * bounded the name to fit in tofs -- confirm.
	 */
	(void) strcpy(tofs, zc->zc_value);
	tosnap = strchr(tofs, '@');
	*tosnap++ = '\0';

	if (zc->zc_nvlist_src != 0 &&
	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
	    zc->zc_iflags, &props)) != 0)
		return (error);

	fd = zc->zc_cookie;
	fp = getf(fd);
	if (fp == NULL) {
		nvlist_free(props);
		return (EBADF);
	}

	VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) {
		if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) &&
		    !dsl_prop_get_hasrecvd(os)) {
			first_recvd_props = B_TRUE;
		}

		/*
		 * If new received properties are supplied, they are to
		 * completely replace the existing received properties, so stash
		 * away the existing ones.
		 */
		if (dsl_prop_get_received(os, &origprops) == 0) {
			nvlist_t *errlist = NULL;
			/*
			 * Don't bother writing a property if its value won't
			 * change (and avoid the unnecessary security checks).
			 *
			 * The first receive after SPA_VERSION_RECVD_PROPS is a
			 * special case where we blow away all local properties
			 * regardless.
			 */
			if (!first_recvd_props)
				props_reduce(props, origprops);
			if (zfs_check_clearable(tofs, origprops,
			    &errlist) != 0)
				(void) nvlist_merge(errors, errlist, 0);
			nvlist_free(errlist);
		}

		dmu_objset_rele(os, FTAG);
	}

	if (zc->zc_string[0]) {
		error = dmu_objset_hold(zc->zc_string, FTAG, &origin);
		if (error)
			goto out;
	}

	error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds,
	    &zc->zc_begin_record, force, origin, &drc);
	if (origin)
		dmu_objset_rele(origin, FTAG);
	if (error)
		goto out;

	/*
	 * Set properties before we receive the stream so that they are applied
	 * to the new data. Note that we must call dmu_recv_stream() if
	 * dmu_recv_begin() succeeds.
	 */
	if (props) {
		nvlist_t *errlist;

		if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) {
			if (drc.drc_newfs) {
				if (spa_version(os->os_spa) >=
				    SPA_VERSION_RECVD_PROPS)
					first_recvd_props = B_TRUE;
			} else if (origprops != NULL) {
				if (clear_received_props(os, tofs, origprops,
				    first_recvd_props ? NULL : props) != 0)
					zc->zc_obj |= ZPROP_ERR_NOCLEAR;
			} else {
				zc->zc_obj |= ZPROP_ERR_NOCLEAR;
			}
			dsl_prop_set_hasrecvd(os);
		} else if (!drc.drc_newfs) {
			zc->zc_obj |= ZPROP_ERR_NOCLEAR;
		}

		/* Per-property failures are reported through 'errors'. */
		(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
		    props, &errlist);
		(void) nvlist_merge(errors, errlist, 0);
		nvlist_free(errlist);
	}

	if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) {
		/*
		 * Caller made zc->zc_nvlist_dst less than the minimum expected
		 * size or supplied an invalid address.
		 */
		props_error = EINVAL;
	}

	off = fp->f_offset;
	error = dmu_recv_stream(&drc, fp->f_vnode, &off, zc->zc_cleanup_fd,
	    &zc->zc_action_handle);

	if (error == 0) {
#ifdef HAVE_ZPL
		zfsvfs_t *zfsvfs = NULL;

		if (getzfsvfs(tofs, &zfsvfs) == 0) {
			/* online recv */
			int end_err;

			error = zfs_suspend_fs(zfsvfs);
			/*
			 * If the suspend fails, then the recv_end will
			 * likely also fail, and clean up after itself.
			 */
			end_err = dmu_recv_end(&drc);
			if (error == 0)
				error = zfs_resume_fs(zfsvfs, tofs);
			error = error ? error : end_err;
			VFS_RELE(zfsvfs->z_vfs);
		} else {
			error = dmu_recv_end(&drc);
		}
#else
		error = dmu_recv_end(&drc);
#endif /* HAVE_ZPL */
	}

	/* Report the bytes consumed and advance the file offset to match. */
	zc->zc_cookie = off - fp->f_offset;
	if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
		fp->f_offset = off;

#ifdef	DEBUG
	if (zfs_ioc_recv_inject_err) {
		zfs_ioc_recv_inject_err = B_FALSE;
		error = 1;
	}
#endif
	/*
	 * On error, restore the original props.
	 */
	if (error && props) {
		if (dmu_objset_hold(tofs, FTAG, &os) == 0) {
			if (clear_received_props(os, tofs, props, NULL) != 0) {
				/*
				 * We failed to clear the received properties.
				 * Since we may have left a $recvd value on the
				 * system, we can't clear the $hasrecvd flag.
				 */
				zc->zc_obj |= ZPROP_ERR_NORESTORE;
			} else if (first_recvd_props) {
				dsl_prop_unset_hasrecvd(os);
			}
			dmu_objset_rele(os, FTAG);
		} else if (!drc.drc_newfs) {
			/* We failed to clear the received properties. */
			zc->zc_obj |= ZPROP_ERR_NORESTORE;
		}

		if (origprops == NULL && !drc.drc_newfs) {
			/* We failed to stash the original properties. */
			zc->zc_obj |= ZPROP_ERR_NORESTORE;
		}

		/*
		 * dsl_props_set() will not convert RECEIVED to LOCAL on or
		 * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
		 * explicitly if we're restoring local properties cleared in the
		 * first new-style receive.
		 */
		if (origprops != NULL &&
		    zfs_set_prop_nvlist(tofs, (first_recvd_props ?
		    ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
		    origprops, NULL) != 0) {
			/*
			 * We stashed the original properties but failed to
			 * restore them.
			 */
			zc->zc_obj |= ZPROP_ERR_NORESTORE;
		}
	}
out:
	nvlist_free(props);
	nvlist_free(origprops);
	nvlist_free(errors);
	releasef(fd);

	/* A stream error takes precedence over a property-reporting error. */
	if (error == 0)
		error = props_error;

	return (error);
}

/*
 * inputs:
 * zc_name	name of snapshot to send
 * zc_cookie	file descriptor to send stream to
 * zc_obj	fromorigin flag (mutually exclusive with zc_fromobj)
 * zc_sendobj	objsetid of snapshot to send
 * zc_fromobj	objsetid of incremental fromsnap (may be zero)
 *
 * outputs: none
 */
static int
zfs_ioc_send(zfs_cmd_t *zc)
{
	objset_t *fromsnap = NULL;
	objset_t *tosnap;
	file_t *fp;
	int error;
	offset_t off;
	dsl_dataset_t *ds;
	dsl_dataset_t *dsfrom = NULL;
	spa_t *spa;
	dsl_pool_t *dp;

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error)
		return (error);

	/* The dataset is looked up by object id under the pool config lock. */
	dp = spa_get_dsl(spa);
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
	rw_exit(&dp->dp_config_rwlock);
	if (error) {
		spa_close(spa, FTAG);
		return (error);
	}

	error = dmu_objset_from_ds(ds, &tosnap);
	if (error) {
		dsl_dataset_rele(ds, FTAG);
		spa_close(spa, FTAG);
		return (error);
	}

	if (zc->zc_fromobj != 0) {
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom);
		rw_exit(&dp->dp_config_rwlock);
		/* The pool hold is not needed past this point. */
		spa_close(spa, FTAG);
		if (error) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}
		error = dmu_objset_from_ds(dsfrom, &fromsnap);
		if (error) {
			dsl_dataset_rele(dsfrom, FTAG);
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}
	} else {
		spa_close(spa, FTAG);
	}

	fp = getf(zc->zc_cookie);
	if (fp == NULL) {
		dsl_dataset_rele(ds, FTAG);
		if (dsfrom)
			dsl_dataset_rele(dsfrom, FTAG);
		return (EBADF);
	}

	off = fp->f_offset;
	error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp->f_vnode, &off);

	/* Advance the file offset past whatever was written. */
	if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
		fp->f_offset = off;
	releasef(zc->zc_cookie);
	if (dsfrom)
		dsl_dataset_rele(dsfrom, FTAG);
	dsl_dataset_rele(ds, FTAG);
	return (error);
}

/*
 * Pass zc_name, zc_guid (cast to int) and zc_inject_record to
 * zio_inject_fault(); on success the id it produced is stored in zc_guid.
 */
static int
zfs_ioc_inject_fault(zfs_cmd_t *zc)
{
	int id, error;

	error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
	    &zc->zc_inject_record);

	if (error == 0)
		zc->zc_guid = (uint64_t)id;

	return (error);
}

/*
 * Pass zc_guid (cast to int) to zio_clear_fault() and return its result.
 */
static int
zfs_ioc_clear_fault(zfs_cmd_t *zc)
{
	return (zio_clear_fault((int)zc->zc_guid));
}

/*
 * Call zio_inject_list_next() with zc_guid as the id, using zc_name and
 * zc_inject_record as its buffers.  The id is written back to zc_guid whether
 * or not the call succeeds.
 */
static int
zfs_ioc_inject_list_next(zfs_cmd_t *zc)
{
	int id = (int)zc->zc_guid;
	int error;

	error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
	    &zc->zc_inject_record);

	zc->zc_guid = id;

	return (error);
}

/*
 * Fetch the error log of pool zc_name into the buffer at zc_nvlist_dst.
 *
 * On success zc_nvlist_dst_size is set to the count returned by
 * spa_get_errlog(); on failure it is set to spa_get_errlog_size().
 */
static int
zfs_ioc_error_log(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;
	size_t count = (size_t)zc->zc_nvlist_dst_size;

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
		return (error);

	error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
	    &count);
	if (error == 0)
		zc->zc_nvlist_dst_size = count;
	else
		zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);

	spa_close(spa, FTAG);

	return (error);
}

/*
 * Clear errors on pool zc_name: on one vdev if zc_guid is non-zero (ENODEV
 * if it is not found), otherwise vdev_clear() is called with a NULL vdev.
 *
 * Unless ZPOOL_NO_REWIND is set in zc_cookie, a policy nvlist must be
 * supplied in zc_nvlist_src; the pool is then opened with spa_open_rewind()
 * and any resulting config is copied out with put_nvlist().
 */
static int
zfs_ioc_clear(zfs_cmd_t *zc)
{
	spa_t *spa;
	vdev_t *vd;
	int error;

	/*
	 * On zpool clear we also fix up missing slogs
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_lookup(zc->zc_name);
	if (spa == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EIO);
	}
	if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
		/* we need to let spa_open/spa_load clear the chains */
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	}
	spa->spa_last_open_failed = 0;
	mutex_exit(&spa_namespace_lock);

	if (zc->zc_cookie & ZPOOL_NO_REWIND) {
		error = spa_open(zc->zc_name, &spa, FTAG);
	} else {
		nvlist_t *policy;
		nvlist_t *config = NULL;

		if (zc->zc_nvlist_src == 0)
			return (EINVAL);

		if ((error = get_nvlist(zc->zc_nvlist_src,
		    zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
			error = spa_open_rewind(zc->zc_name, &spa, FTAG,
			    policy, &config);
			if (config != NULL) {
				int err;

				/*
				 * NOTE(review): if spa_open_rewind()
				 * succeeded and put_nvlist() fails, the
				 * return below leaves without spa_close()
				 * -- possible hold leak, confirm.
				 */
				if ((err = put_nvlist(zc, config)) != 0)
					error = err;
				nvlist_free(config);
			}
			nvlist_free(policy);
		}
	}

	if (error)
		return (error);

	spa_vdev_state_enter(spa, SCL_NONE);

	if (zc->zc_guid == 0) {
		vd = NULL;
	} else {
		vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
		if (vd == NULL) {
			(void) spa_vdev_state_exit(spa, NULL, ENODEV);
			spa_close(spa, FTAG);
			return (ENODEV);
		}
	}

	vdev_clear(spa, vd);

	(void) spa_vdev_state_exit(spa, NULL, 0);

	/*
	 * Resume any suspended I/Os.
	 */
	if (zio_resume(spa) != 0)
		error = EIO;

	spa_close(spa, FTAG);

	return (error);
}

/*
 * inputs:
 * zc_name	name of filesystem
 * zc_value	name of origin snapshot
 *
 * outputs:
 * zc_string	name of conflicting snapshot, if there is one
 */
static int
zfs_ioc_promote(zfs_cmd_t *zc)
{
	char *cp;

	/*
	 * We don't need to unmount *all* the origin fs's snapshots, but
	 * it's easier.
	 */
	/* Truncate zc_value at '@' so that it names the origin filesystem. */
	cp = strchr(zc->zc_value, '@');
	if (cp)
		*cp = '\0';
	(void) dmu_objset_find(zc->zc_value,
	    zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
	return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
}

/*
 * Retrieve a single {user|group}{used|quota}@... property.
 *
 * inputs:
 * zc_name	name of filesystem
 * zc_objset_type zfs_userquota_prop_t
 * zc_value	domain name (eg.
"S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { #ifdef HAVE_ZPL zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); #else return (ENOTSUP); #endif /* HAVE_ZPL */ } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { #ifdef HAVE_ZPL zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (ENOMEM); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error) return (error); void *buf = kmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } kmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); #else return (ENOTSUP); #endif /* HAVE_ZPL */ } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { #ifdef HAVE_ZPL objset_t *os; int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. 
*/ error = zfs_suspend_fs(zfsvfs); if (error == 0) error = zfs_resume_fs(zfsvfs, zc->zc_name); } if (error == 0) error = dmu_objset_userspace_upgrade(zfsvfs->z_os); VFS_RELE(zfsvfs->z_vfs); } else { /* XXX kind of reading contents without owning */ error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error) return (error); error = dmu_objset_userspace_upgrade(os); dmu_objset_rele(os, FTAG); } return (error); #else return (ENOTSUP); #endif /* HAVE_ZPL */ } /* * We don't want to have a hard dependency * against some special symbols in sharefs * nfs, and smbsrv. Determine them if needed when * the first file system is shared. * Neither sharefs, nfs or smbsrv are unloadable modules. */ #ifdef HAVE_ZPL int (*znfsexport_fs)(void *arg); int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); int (*zsmbexport_fs)(void *arg, boolean_t add_share); int zfs_nfsshare_inited; int zfs_smbshare_inited; ddi_modhandle_t nfs_mod; ddi_modhandle_t sharefs_mod; ddi_modhandle_t smbsrv_mod; kmutex_t zfs_share_lock; static int zfs_init_sharefs() { int error; ASSERT(MUTEX_HELD(&zfs_share_lock)); /* Both NFS and SMB shares also require sharetab support. 
*/ if (sharefs_mod == NULL && ((sharefs_mod = ddi_modopen("fs/sharefs", KRTLD_MODE_FIRST, &error)) == NULL)) { return (ENOSYS); } if (zshare_fs == NULL && ((zshare_fs = (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { return (ENOSYS); } return (0); } #endif /* HAVE_ZPL */ static int zfs_ioc_share(zfs_cmd_t *zc) { #ifdef HAVE_ZPL int error; int opcode; switch (zc->zc_share.z_sharetype) { case ZFS_SHARE_NFS: case ZFS_UNSHARE_NFS: if (zfs_nfsshare_inited == 0) { mutex_enter(&zfs_share_lock); if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (ENOSYS); } if (znfsexport_fs == NULL && ((znfsexport_fs = (int (*)(void *)) ddi_modsym(nfs_mod, "nfs_export", &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (ENOSYS); } error = zfs_init_sharefs(); if (error) { mutex_exit(&zfs_share_lock); return (ENOSYS); } zfs_nfsshare_inited = 1; mutex_exit(&zfs_share_lock); } break; case ZFS_SHARE_SMB: case ZFS_UNSHARE_SMB: if (zfs_smbshare_inited == 0) { mutex_enter(&zfs_share_lock); if (smbsrv_mod == NULL && ((smbsrv_mod = ddi_modopen("drv/smbsrv", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (ENOSYS); } if (zsmbexport_fs == NULL && ((zsmbexport_fs = (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, "smb_server_share", &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (ENOSYS); } error = zfs_init_sharefs(); if (error) { mutex_exit(&zfs_share_lock); return (ENOSYS); } zfs_smbshare_inited = 1; mutex_exit(&zfs_share_lock); } break; default: return (EINVAL); } switch (zc->zc_share.z_sharetype) { case ZFS_SHARE_NFS: case ZFS_UNSHARE_NFS: if (error = znfsexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata)) return (error); break; case ZFS_SHARE_SMB: case ZFS_UNSHARE_SMB: if (error = zsmbexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata, zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? 
B_TRUE: B_FALSE)) { return (error); } break; } opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? SHAREFS_ADD : SHAREFS_REMOVE; /* * Add or remove share from sharetab */ error = zshare_fs(opcode, (void *)(uintptr_t)zc->zc_share.z_sharedata, zc->zc_share.z_sharemax); return (error); #else return (ENOTSUP); #endif /* HAVE_ZPL */ } ace_t full_access[] = { {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} }; /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, os->os_dsl_dataset->ds_phys->ds_prev_snap_txg); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; int error; snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); if (strlen(snap_name) >= MAXNAMELEN) { strfree(snap_name); return (E2BIG); } error = dmu_objset_snapshot(zc->zc_name, snap_name, snap_name, NULL, B_FALSE, B_TRUE, zc->zc_cleanup_fd); if (error != 0) { strfree(snap_name); return (error); } (void) strcpy(zc->zc_value, snap_name); strfree(snap_name); return (0); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { objset_t *fromsnap; objset_t *tosnap; file_t *fp; offset_t off; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); if (error) return (error); error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap); if (error) 
{ dmu_objset_rele(tosnap, FTAG); return (error); } fp = getf(zc->zc_cookie); if (fp == NULL) { dmu_objset_rele(fromsnap, FTAG); dmu_objset_rele(tosnap, FTAG); return (EBADF); } off = fp->f_offset; error = dmu_diff(tosnap, fromsnap, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; releasef(zc->zc_cookie); dmu_objset_rele(fromsnap, FTAG); dmu_objset_rele(tosnap, FTAG); return (error); } /* * Remove all ACL files in shares dir */ #ifdef HAVE_ZPL static int zfs_smb_acl_purge(znode_t *dzp) { zap_cursor_t zc; zap_attribute_t zap; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int error; for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); (error = zap_cursor_retrieve(&zc, &zap)) == 0; zap_cursor_advance(&zc)) { if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, NULL, 0)) != 0) break; } zap_cursor_fini(&zc); return (error); } #endif /* HAVE ZPL */ static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { #ifdef HAVE_ZPL vnode_t *vp; znode_t *dzp; vnode_t *resourcevp = NULL; znode_t *sharedir; zfsvfs_t *zfsvfs; nvlist_t *nvlist; char *src, *target; vattr_t vattr; vsecattr_t vsec; int error = 0; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ if (vp->v_vfsp->vfs_fstype != zfsfstype || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); return (EINVAL); } dzp = VTOZ(vp); zfsvfs = dzp->z_zfsvfs; ZFS_ENTER(zfsvfs); /* * Create share dir if its missing. 
*/ mutex_enter(&zfsvfs->z_lock); if (zfsvfs->z_shares_dir == 0) { dmu_tx_t *tx; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, ZFS_SHARES_DIR); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { error = zfs_create_share_dir(zfsvfs, tx); dmu_tx_commit(tx); } if (error) { mutex_exit(&zfsvfs->z_lock); VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } } mutex_exit(&zfsvfs->z_lock); ASSERT(zfsvfs->z_shares_dir); if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } switch (zc->zc_cookie) { case ZFS_SMB_ACL_ADD: vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VREG; vattr.va_mode = S_IFREG|0777; vattr.va_uid = 0; vattr.va_gid = 0; vsec.vsa_mask = VSA_ACE; vsec.vsa_aclentp = &full_access; vsec.vsa_aclentsz = sizeof (full_access); vsec.vsa_aclcnt = 1; error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); if (resourcevp) VN_RELE(resourcevp); break; case ZFS_SMB_ACL_REMOVE: error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, NULL, 0); break; case ZFS_SMB_ACL_RENAME: if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, &target)) { VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); nvlist_free(nvlist); return (error); } error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, kcred, NULL, 0); nvlist_free(nvlist); break; case ZFS_SMB_ACL_PURGE: error = zfs_smb_acl_purge(sharedir); break; default: error = EINVAL; break; } VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); return (error); #else return (ENOTSUP); #endif /* HAVE_ZPL */ } /* * inputs: * zc_name name of filesystem * zc_value short name of snap * zc_string 
user-supplied tag for this hold
 * zc_cookie		recursive flag
 * zc_temphold		set if hold is temporary
 * zc_cleanup_fd	cleanup-on-exit file descriptor for calling process
 * zc_sendobj		if non-zero, the objid for zc_name@zc_value
 * zc_createtxg		if zc_sendobj is non-zero, snap must have zc_createtxg
 *
 * outputs:		none
 */
static int
zfs_ioc_hold(zfs_cmd_t *zc)
{
	boolean_t recursive = zc->zc_cookie;
	spa_t *spa;
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;
	minor_t minor = 0;

	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
		return (EINVAL);

	/* No object id supplied: take the hold by name. */
	if (zc->zc_sendobj == 0) {
		return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
		    zc->zc_string, recursive, zc->zc_temphold,
		    zc->zc_cleanup_fd));
	}

	/* A hold by object id names exactly one snapshot. */
	if (recursive)
		return (EINVAL);

	error = spa_open(zc->zc_name, &spa, FTAG);
	if (error)
		return (error);

	/* Look the dataset up by object id under the pool config lock. */
	dp = spa_get_dsl(spa);
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
	rw_exit(&dp->dp_config_rwlock);
	spa_close(spa, FTAG);
	if (error)
		return (error);

	/*
	 * Until we have a hold on this snapshot, it's possible that
	 * zc_sendobj could've been destroyed and reused as part
	 * of a later txg.  Make sure we're looking at the right object.
	 */
	if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) {
		dsl_dataset_rele(ds, FTAG);
		return (ENOENT);
	}

	/* Temporary holds with a cleanup fd are tied to that fd's minor. */
	if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) {
		error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
		if (error) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}
	}

	error = dsl_dataset_user_hold_for_send(ds, zc->zc_string,
	    zc->zc_temphold);
	/* minor is non-zero only if zfs_onexit_fd_hold() set it above. */
	if (minor != 0) {
		if (error == 0) {
			dsl_register_onexit_hold_cleanup(ds, zc->zc_string,
			    minor);
		}
		zfs_onexit_fd_rele(zc->zc_cleanup_fd);
	}
	dsl_dataset_rele(ds, FTAG);

	return (error);
}

/*
 * inputs:
 * zc_name	name of dataset from which we're releasing a user hold
 * zc_value	short name of snap
 * zc_string	user-supplied tag for this hold
 * zc_cookie	recursive flag
 *
 * outputs:	none
 */
static int
zfs_ioc_release(zfs_cmd_t *zc)
{
	boolean_t recursive = zc->zc_cookie;

	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
		return (EINVAL);

	return (dsl_dataset_user_release(zc->zc_name, zc->zc_value,
	    zc->zc_string, recursive));
}

/*
 * inputs:
 * zc_name		name of filesystem
 *
 * outputs:
 * zc_nvlist_src{_size}	nvlist of snapshot holds
 */
static int
zfs_ioc_get_holds(zfs_cmd_t *zc)
{
	nvlist_t *nvp;
	int error;

	/* The nvlist is handed to put_nvlist() and then freed here. */
	if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) {
		error = put_nvlist(zc, nvp);
		nvlist_free(nvp);
	}

	return (error);
}

/*
 * inputs:
 * zc_guid		flags (ZEVENT_NONBLOCK)
 *
 * outputs:
 * zc_nvlist_dst	next nvlist event
 * zc_cookie		dropped events since last get
 * zc_cleanup_fd	cleanup-on-exit file descriptor
 */
static int
zfs_ioc_events_next(zfs_cmd_t *zc)
{
	zfs_zevent_t *ze;
	nvlist_t *event = NULL;
	minor_t minor;
	uint64_t dropped = 0;
	int error;

	error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
	if (error != 0)
		return (error);

	do {
		error = zfs_zevent_next(ze, &event, &dropped);
		if (event != NULL) {
			zc->zc_cookie = dropped;
			error = put_nvlist(zc, event);
			nvlist_free(event);
		}

		/* Non-blocking callers get whatever this one pass produced. */
		if (zc->zc_guid & ZEVENT_NONBLOCK)
			break;

		/* Only ENOENT falls through to the wait below. */
		if ((error == 0) || (error != ENOENT))
			break;

		error = zfs_zevent_wait(ze);
		if (error)
			break;
	} while (1);

	zfs_zevent_fd_rele(zc->zc_cleanup_fd);

	return (error);
}

/*
 * outputs:
 * zc_cookie		cleared events count
 */
static int
zfs_ioc_events_clear(zfs_cmd_t *zc)
{
	int count;

	zfs_zevent_drain_all(&count);
	zc->zc_cookie = count;

	return 0;
}

/*
 * pool create, destroy, and export don't log the history as part of
 * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export
 * do the logging of those commands.
 *
 * The table is indexed by (cmd - ZFS_IOC) in zfsdev_ioctl(), so entry
 * order is significant.  Each entry is: handler, security policy
 * callback, name check type, history-logging flag, pool status checks.
 */
static zfs_ioc_vec_t zfs_ioc_vec[] = {
	{ zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
	    POOL_CHECK_READONLY },
	{ zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
	    POOL_CHECK_SUSPENDED },
	{ zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
	    POOL_CHECK_SUSPENDED },
	{ zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
	    POOL_CHECK_SUSPENDED },
	{ zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_create_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_remove_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME,
	    B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_obj_to_path, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
	    POOL_CHECK_SUSPENDED },
	{ zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, DATASET_NAME,
	    B_FALSE, POOL_CHECK_NONE },
	{ zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, DATASET_NAME,
	    B_FALSE, POOL_CHECK_NONE },
	{ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
	    DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
	    POOL_CHECK_SUSPENDED },
	{ zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE,
	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_next_obj, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_diff, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME,
	    B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
	{ zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE,
	    POOL_CHECK_SUSPENDED },
	{ zfs_ioc_events_next, zfs_secpolicy_config, NO_NAME, B_FALSE,
	    POOL_CHECK_NONE },
	{ zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE,
	    POOL_CHECK_NONE },
};

/*
 * Check the state of the pool backing 'name' against the requested
 * 'check' flags.  Returns 0 when POOL_CHECK_NONE is set or no requested
 * condition trips, EAGAIN when POOL_CHECK_SUSPENDED is requested and the
 * pool is suspended, EROFS when POOL_CHECK_READONLY is requested and the
 * pool is not writeable, or the error from spa_open().
 */
int
pool_status_check(const char *name, zfs_ioc_namecheck_t type,
    zfs_ioc_poolcheck_t check)
{
	spa_t *spa;
	int error;

	ASSERT(type == POOL_NAME || type == DATASET_NAME);

	if (check & POOL_CHECK_NONE)
		return (0);

	error = spa_open(name, &spa, FTAG);
	if (error == 0) {
		/* The suspended check takes precedence over read-only. */
		if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
			error = EAGAIN;
		else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
			error = EROFS;
		spa_close(spa, FTAG);
	}
	return (error);
}

/*
 * Walk zfsdev_state_list for the entry whose zs_minor matches 'minor' and
 * return the piece of state selected by 'which' (the onexit state, the
 * zevent state, or the whole zfsdev_state_t).  Returns NULL when no entry
 * matches.  The caller must hold zfsdev_state_lock.
 */
static void *
zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which)
{
	zfsdev_state_t *zs;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));

	for (zs = list_head(&zfsdev_state_list); zs != NULL;
	    zs = list_next(&zfsdev_state_list, zs)) {
		if (zs->zs_minor == minor) {
			switch (which) {
				case ZST_ONEXIT:  return (zs->zs_onexit);
				case ZST_ZEVENT:  return (zs->zs_zevent);
				case ZST_ALL:     return (zs);
			}
		}
	}

	return NULL;
}

/*
 * Locked wrapper around zfsdev_get_state_impl(): takes zfsdev_state_lock
 * for the duration of the lookup.
 */
void *
zfsdev_get_state(minor_t minor, enum zfsdev_state_type which)
{
	void *ptr;

	mutex_enter(&zfsdev_state_lock);
	ptr = zfsdev_get_state_impl(minor, which);
	mutex_exit(&zfsdev_state_lock);

	return ptr;
}

/*
 * Return the minor number stored in the zfsdev_state_t attached to
 * 'filp' as private_data.
 */
minor_t
zfsdev_getminor(struct file *filp)
{
	ASSERT(filp != NULL);
	ASSERT(filp->private_data != NULL);

	return (((zfsdev_state_t *)filp->private_data)->zs_minor);
}

/*
 * Find a free minor number.  The zfsdev_state_list is expected to
 * be short since it is only a list of currently open file handles.
*/ minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (zfsdev_get_state_impl(m, ZST_ALL) == NULL) { last_minor = m; return (m); } } return (0); } static int zfsdev_state_init(struct file *filp) { zfsdev_state_t *zs; minor_t minor; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (ENXIO); zs = kmem_zalloc( sizeof(zfsdev_state_t), KM_SLEEP); if (zs == NULL) return (ENOMEM); zs->zs_file = filp; zs->zs_minor = minor; filp->private_data = zs; zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); list_insert_tail(&zfsdev_state_list, zs); return (0); } static int zfsdev_state_destroy(struct file *filp) { zfsdev_state_t *zs; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); ASSERT(filp->private_data != NULL); zs = filp->private_data; zfs_onexit_destroy(zs->zs_onexit); zfs_zevent_destroy(zs->zs_zevent); list_remove(&zfsdev_state_list, zs); kmem_free(zs, sizeof(zfsdev_state_t)); return 0; } static int zfsdev_open(struct inode *ino, struct file *filp) { int error; mutex_enter(&zfsdev_state_lock); error = zfsdev_state_init(filp); mutex_exit(&zfsdev_state_lock); return (-error); } static int zfsdev_release(struct inode *ino, struct file *filp) { int error; mutex_enter(&zfsdev_state_lock); error = zfsdev_state_destroy(filp); mutex_exit(&zfsdev_state_lock); return (-error); } static long zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) { zfs_cmd_t *zc; uint_t vec; int error, rc, flag = 0; vec = cmd - ZFS_IOC; if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (-EINVAL); - zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP | KM_NODEBUG); error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); if (error != 0) error = EFAULT; if ((error == 0) && !(flag & FKIOCTL)) error = 
zfs_ioc_vec[vec].zvec_secpolicy(zc, NULL); /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ if (error == 0) { zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_iflags = flag & FKIOCTL; switch (zfs_ioc_vec[vec].zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; error = pool_status_check(zc->zc_name, zfs_ioc_vec[vec].zvec_namecheck, zfs_ioc_vec[vec].zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; error = pool_status_check(zc->zc_name, zfs_ioc_vec[vec].zvec_namecheck, zfs_ioc_vec[vec].zvec_pool_check); break; case NO_NAME: break; } } if (error == 0) error = zfs_ioc_vec[vec].zvec_func(zc); rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); if (error == 0) { if (rc != 0) error = EFAULT; if (zfs_ioc_vec[vec].zvec_his_log) zfs_log_history(zc); } kmem_free(zc, sizeof (zfs_cmd_t)); return (-error); } #ifdef CONFIG_COMPAT static long zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg) { return zfsdev_ioctl(filp, cmd, arg); } #else #define zfs_compat_ioctl NULL #endif static const struct file_operations zfsdev_fops = { .open = zfsdev_open, .release = zfsdev_release, .unlocked_ioctl = zfsdev_ioctl, .compat_ioctl = zfsdev_compat_ioctl, .owner = THIS_MODULE, }; static struct miscdevice zfs_misc = { .minor = MISC_DYNAMIC_MINOR, .name = ZFS_DRIVER, .fops = &zfsdev_fops, }; static int zfs_attach(void) { int error; mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsdev_state_list, sizeof (zfsdev_state_t), offsetof(zfsdev_state_t, zs_next)); error = misc_register(&zfs_misc); if (error) { printk(KERN_INFO "ZFS: misc_register() failed %d\n", error); return (error); } return (0); } static void zfs_detach(void) { int error; error = misc_deregister(&zfs_misc); if (error) printk(KERN_INFO "ZFS: misc_deregister() failed %d\n", error); mutex_destroy(&zfsdev_state_lock); 
list_destroy(&zfsdev_state_list); } #ifdef HAVE_ZPL uint_t zfs_fsyncer_key; extern uint_t rrw_tsd_key; #endif #ifdef DEBUG #define ZFS_DEBUG_STR " (DEBUG mode)" #else #define ZFS_DEBUG_STR "" #endif int _init(void) { int error; spa_init(FREAD | FWRITE); zfs_init(); if ((error = zvol_init()) != 0) goto out1; if ((error = zfs_attach()) != 0) goto out2; #ifdef HAVE_ZPL tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, NULL); mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); #endif /* HAVE_ZPL */ printk(KERN_NOTICE "ZFS: Loaded ZFS Filesystem v%s%s\n", ZFS_META_VERSION, ZFS_DEBUG_STR); return (0); out2: (void) zvol_fini(); out1: zfs_fini(); spa_fini(); printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s%s" ", rc = %d\n", ZFS_META_VERSION, ZFS_DEBUG_STR, error); return (error); } int _fini(void) { zfs_detach(); zvol_fini(); zfs_fini(); spa_fini(); #ifdef HAVE_ZPL if (zfs_nfsshare_inited) (void) ddi_modclose(nfs_mod); if (zfs_smbshare_inited) (void) ddi_modclose(smbsrv_mod); if (zfs_nfsshare_inited || zfs_smbshare_inited) (void) ddi_modclose(sharefs_mod); mutex_destroy(&zfs_share_lock); tsd_destroy(&zfs_fsyncer_key); #endif /* HAVE_ZPL */ printk(KERN_NOTICE "ZFS: Unloaded ZFS Filesystem v%s%s\n", ZFS_META_VERSION, ZFS_DEBUG_STR); return (0); } #ifdef HAVE_SPL spl_module_init(_init); spl_module_exit(_fini); MODULE_DESCRIPTION("ZFS"); MODULE_AUTHOR(ZFS_META_AUTHOR); MODULE_LICENSE(ZFS_META_LICENSE); #endif /* HAVE_SPL */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 3c18d43fa5c2..ad11fd6c6357 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1,1996 +1,1996 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The zfs intent log (ZIL) saves transaction records of system calls * that change the file system in memory with enough information * to be able to replay them. These are stored in memory until * either the DMU transaction group (txg) commits them to the stable pool * and they can be discarded, or they are flushed to the stable log * (also in the pool) due to a fsync, O_DSYNC or other synchronous * requirement. In the event of a panic or power fail then those log * records (transactions) are replayed. * * There is one ZIL per file system. Its on-disk (pool) format consists * of 3 parts: * * - ZIL header * - ZIL blocks * - ZIL records * * A log record holds a system call transaction. Log blocks can * hold many log records and the blocks are chained together. * Each ZIL block contains a block pointer (blkptr_t) to the next * ZIL block in the chain. The ZIL header points to the first * block in the chain. Note there is not a fixed place in the pool * to hold blocks. They are dynamically allocated and freed as * needed from the blocks available. Figure X shows the ZIL structure: */ /* * This global ZIL switch affects all pools */ int zil_replay_disable = 0; /* disable intent logging replay */ /* * Tunable parameter for debugging or performance analysis. 
Setting * zfs_nocacheflush will cause corruption on power loss if a volatile * out-of-order write cache is enabled. */ boolean_t zfs_nocacheflush = B_FALSE; static kmem_cache_t *zil_lwb_cache; static void zil_async_to_sync(zilog_t *zilog, uint64_t foid); #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) /* * ziltest is by and large an ugly hack, but very useful in * checking replay without tedious work. * When running ziltest we want to keep all itx's and so maintain * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG * We subtract TXG_CONCURRENT_STATES to allow for common code. */ #define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES) static int zil_bp_compare(const void *x1, const void *x2) { const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) return (-1); if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) return (1); if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) return (-1); if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) return (1); return (0); } static void zil_bp_tree_init(zilog_t *zilog) { avl_create(&zilog->zl_bp_tree, zil_bp_compare, sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void zil_bp_tree_fini(zilog_t *zilog) { avl_tree_t *t = &zilog->zl_bp_tree; zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; const dva_t *dva = BP_IDENTITY(bp); zil_bp_node_t *zn; avl_index_t where; if (avl_find(t, dva, &where) != NULL) return (EEXIST); zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); return (0); } static zil_header_t * zil_header_in_syncing_context(zilog_t *zilog) { return ((zil_header_t *)zilog->zl_header); } static void 
zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, char **end) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; uint32_t aflags = ARC_WAIT; arc_buf_t *abuf = NULL; zbookmark_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = dsl_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; /* * Validate the checksummed log block. * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. * * Also check the log chain linkage and size used. 
*/ cksum.zc_word[ZIL_ZC_SEQ]++; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = abuf->b_data; char *lr = (char *)(zilc + 1); uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { error = ECKSUM; } else { bcopy(lr, dst, len); *end = (char *)dst + len; *nbp = zilc->zc_next_blk; } } else { char *lr = abuf->b_data; uint64_t size = BP_GET_LSIZE(bp); zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = ECKSUM; } else { bcopy(lr, dst, zilc->zc_nused); *end = (char *)dst + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); } return (error); } /* * Read a TX_WRITE log data block. */ static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; uint32_t aflags = ARC_WAIT; arc_buf_t *abuf = NULL; zbookmark_t zb; int error; if (BP_IS_HOLE(bp)) { if (wbuf != NULL) bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { if (wbuf != NULL) bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); (void) arc_buf_remove_ref(abuf, &abuf); } return (error); } /* * Parse the intent log, and call parse_func for each valid record within. 
*/ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; uint64_t max_blk_seq = 0; uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk; char *lrbuf, *lrp; int error = 0; bzero(&next_blk, sizeof(blkptr_t)); /* * Old logs didn't record the maximum zh_claim_lr_seq. */ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. * For each block in the chain we strongly check that block to * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. 
*/ lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; char *end = NULL; if (blk_seq > claim_blk_seq) break; if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; blk_count++; if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); if (error) break; for (lrp = lrbuf; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); if (lr->lrc_seq > claim_lr_seq) goto done; if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) goto done; ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } } done: zilog->zl_parse_error = error; zilog->zl_parse_blk_seq = max_blk_seq; zilog->zl_parse_lr_seq = max_lr_seq; zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); zil_bp_tree_fini(zilog); zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); return (error); } static int zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } static int zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; if (lrc->lrc_txtype != TX_WRITE) return (0); /* * If the block is not readable, don't claim it. 
This can happen
	 * in normal operation when a log block is written to disk before
	 * some of the dmu_sync() blocks it points to.  In this case, the
	 * transaction cannot have been committed to anyone (we would have
	 * waited for all writes to be stable first), so it is semantically
	 * correct to declare this the end of the log.
	 */
	if (lr->lr_blkptr.blk_birth >= first_txg &&
	    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
		return (error);
	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}

/*
 * Block callback handed to zil_parse() by zil_destroy(): free the log
 * block bp in the txg that tx belongs to.  claim_txg is not used.
 * Always returns 0.
 */
/* ARGSUSED */
static int
zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
	zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

/*
 * Record callback handed to zil_parse() by zil_destroy().  Only TX_WRITE
 * records are acted on: the block referenced by the record is freed when
 * a claim happened (claim_txg != 0), the block was born at or after
 * claim_txg, and zil_bp_tree_add() returns 0 for it.
 * Always returns 0.
 */
static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
	/* The cast is only meaningful for TX_WRITE, which is checked below. */
	lr_write_t *lr = (lr_write_t *)lrc;
	blkptr_t *bp = &lr->lr_blkptr;

	/*
	 * If we previously claimed it, we need to free it.
	 * NOTE(review): zil_bp_tree_add() == 0 presumably means "first time
	 * this block is seen", which would prevent a double free -- confirm.
	 */
	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

/*
 * Allocate and initialise a log write buffer (lwb) describing block *bp,
 * then append it to zilog->zl_lwb_list under zl_lock.
 *
 *	zilog	log the lwb belongs to
 *	bp	block pointer copied into the lwb
 *	txg	initial value of lwb_max_txg
 *
 * Returns the new lwb.
 */
static lwb_t *
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
{
	lwb_t *lwb;

	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
	lwb->lwb_zilog = zilog;
	lwb->lwb_blk = *bp;
	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
	lwb->lwb_max_txg = txg;
	lwb->lwb_zio = NULL;
	lwb->lwb_tx = NULL;
	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
		/* The zil_chain_t sits at the start of the buffer. */
		lwb->lwb_nused = sizeof (zil_chain_t);
		lwb->lwb_sz = BP_GET_LSIZE(bp);
	} else {
		/*
		 * The zil_chain_t sits at the end of the buffer, so it is
		 * excluded from the usable size.
		 * NOTE(review): lwb_buf was allocated with BP_GET_LSIZE(bp)
		 * bytes, but callers free it with lwb_sz, which is smaller
		 * in this branch -- confirm zio_buf_free() tolerates that.
		 */
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
	}

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, lwb);
	mutex_exit(&zilog->zl_lock);

	return (lwb);
}

/*
 * Create an on-disk intent log.
 *
 * Returns the lwb for the first log block, or NULL if a block had to be
 * allocated and zio_alloc_zil() failed.
 */
static lwb_t *
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb = NULL;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * Allocate an initial log block if:
	 *    - there isn't one already
	 *    - the existing block is the wrong endianness
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		/* A wrong-endian block is released before a new one is made. */
		if (!BP_IS_HOLE(&blk)) {
			zio_free_zil(zilog->zl_spa, txg, &blk);
			BP_ZERO(&blk);
		}

		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write buffer (lwb) for the first log block.
	 */
	if (error == 0)
		lwb = zil_alloc_lwb(zilog, &blk, txg);

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

	return (lwb);
}

/*
 * In one tx, free all log blocks and clear the log header.
 * If keep_first is set, then we're replaying a log with no content.
 * We want to keep the first block, however, so that the first
 * synchronous transaction doesn't require a txg_wait_synced()
 * in zil_create().  We don't need to txg_wait_synced() here either
 * when keep_first is set, because both zil_create() and zil_destroy()
 * will wait for any in-progress destroys to complete.
 */
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	dmu_tx_t *tx;
	uint64_t txg;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	zilog->zl_old_header = *zh;		/* debugging aid */

	/* Nothing on disk to destroy. */
	if (BP_IS_HOLE(&zh->zh_log))
		return;

	tx = dmu_tx_create(zilog->zl_os);
	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	mutex_enter(&zilog->zl_lock);

	/* Recorded here; zil_sync() compares zl_destroy_txg with its txg. */
	ASSERT3U(zilog->zl_destroy_txg, <, txg);
	zilog->zl_destroy_txg = txg;
	zilog->zl_keep_first = keep_first;

	if (!list_is_empty(&zilog->zl_lwb_list)) {
		/* In-memory lwbs exist: free each buffer and its block. */
		ASSERT(zh->zh_claim_txg == 0);
		ASSERT(!keep_first);
		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
			list_remove(&zilog->zl_lwb_list, lwb);
			if (lwb->lwb_buf != NULL)
				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
			zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
			kmem_cache_free(zil_lwb_cache, lwb);
		}
	} else if (!keep_first) {
		/* No lwbs in memory: walk the on-disk chain and free it. */
		(void) zil_parse(zilog, zil_free_log_block,
		    zil_free_log_record, tx, zh->zh_claim_txg);
	}
	mutex_exit(&zilog->zl_lock);

	dmu_tx_commit(tx);
}

/*
 * Claim the intent log of the objset named osname in the txg of txarg
 * (a dmu_tx_t *).  Always returns 0, including when the objset cannot be
 * held (a warning is logged in that case).
 */
int
zil_claim(const char *osname, void *txarg)
{
	dmu_tx_t *tx = txarg;
	uint64_t first_txg = dmu_tx_get_txg(tx);
	zilog_t *zilog;
	zil_header_t *zh;
	objset_t *os;
	int error;

	error = dmu_objset_hold(osname, FTAG, &os);
	if (error) {
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);

	/* Log state SPA_LOG_CLEAR: drop the log instead of claiming it. */
	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
		if (!BP_IS_HOLE(&zh->zh_log))
			zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
		BP_ZERO(&zh->zh_log);
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
		dmu_objset_rele(os, FTAG);
		return (0);
	}

	/*
	 * Claim all log blocks if we haven't already done so, and remember
	 * the highest claimed sequence number.  This ensures that if we can
	 * read only part of the log now (e.g. due to a missing device),
	 * but we can read the entire log later, we will not try to replay
	 * or destroy beyond the last block we successfully claimed.
	 */
	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
		(void) zil_parse(zilog, zil_claim_log_block,
		    zil_claim_log_record, tx, first_txg);
		zh->zh_claim_txg = first_txg;
		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
		/* Replay is needed if any record, or more than one block, was parsed. */
		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
			zh->zh_flags |= ZIL_REPLAY_NEEDED;
		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
		dsl_dataset_dirty(dmu_objset_ds(os), tx);
	}

	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
	dmu_objset_rele(os, FTAG);
	return (0);
}

/*
 * Check the log by walking the log chain.
 * Checksum errors are ok as they indicate the end of the chain.
 * Any other error (no device or read failure) returns an error.
 *
 * tx must be NULL.  Returns 0 if the objset cannot be held, if the first
 * block lives on a dead log vdev that vdev_log_state_valid() rejects, or
 * if zil_parse() returns 0, ECKSUM or ENOENT; otherwise the zil_parse()
 * error.
 */
int
zil_check_log_chain(const char *osname, void *tx)
{
	zilog_t *zilog;
	objset_t *os;
	blkptr_t *bp;
	int error;

	ASSERT(tx == NULL);

	error = dmu_objset_hold(osname, FTAG, &os);
	if (error) {
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	bp = (blkptr_t *)&zilog->zl_header->zh_log;

	/*
	 * Check the first block and determine if it's on a log device
	 * which may have been removed or faulted prior to loading this
	 * pool.  If so, there's no point in checking the rest of the log
	 * as its content should have already been synced to the pool.
	 */
	if (!BP_IS_HOLE(bp)) {
		vdev_t *vd;
		boolean_t valid = B_TRUE;

		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
		/*
		 * NOTE(review): vd is dereferenced without a NULL check here,
		 * whereas zil_flush_vdevs() checks the result of
		 * vdev_lookup_top() -- confirm it cannot be NULL on this path.
		 */
		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
		if (vd->vdev_islog && vdev_is_dead(vd))
			valid = vdev_log_state_valid(vd);
		spa_config_exit(os->os_spa, SCL_STATE, FTAG);

		if (!valid) {
			dmu_objset_rele(os, FTAG);
			return (0);
		}
	}

	/*
	 * Because tx == NULL, zil_claim_log_block() will not actually claim
	 * any blocks, but just determine whether it is possible to do so.
	 * In addition to checking the log chain, zil_claim_log_block()
	 * will invoke zio_claim() with a done func of spa_claim_notify(),
	 * which will update spa_max_claim_txg.  See spa_load() for details.
	 */
	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));

	dmu_objset_rele(os, FTAG);

	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}

/*
 * AVL comparator for zl_vdev_tree: orders zil_vdev_node_t entries by
 * zv_vdev.  Returns -1, 0 or 1.
 */
static int
zil_vdev_compare(const void *x1, const void *x2)
{
	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);

	return (0);
}

/*
 * Record every vdev referenced by the DVAs of bp in zl_vdev_tree (one node
 * per distinct vdev id) so zil_flush_vdevs() can flush them later.
 * Does nothing when zfs_nocacheflush is set.
 */
void
zil_add_block(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zfs_nocacheflush)
		return;

	ASSERT(zilog->zl_writer);

	/*
	 * Even though we're zl_writer, we still need a lock because the
	 * zl_get_data() callbacks may have dmu_sync() done callbacks
	 * that will run concurrently.
	 */
	mutex_enter(&zilog->zl_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		/* Insert only if this vdev id is not already in the tree. */
		if (avl_find(t, &zvsearch, &where) == NULL) {
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&zilog->zl_vdev_lock);
}

/*
 * Issue a flush to every vdev recorded in zl_vdev_tree, emptying the tree
 * as it goes, and wait for the flushes to finish.  Flush errors are
 * ignored.
 */
static void
zil_flush_vdevs(zilog_t *zilog)
{
	spa_t *spa = zilog->zl_spa;
	avl_tree_t *t = &zilog->zl_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;
	zio_t *zio;

	ASSERT(zilog->zl_writer);

	/*
	 * We don't need zl_vdev_lock here because we're the zl_writer,
	 * and all zl_get_data() callbacks are done.
	 */
	if (avl_numnodes(t) == 0)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	/* Parent zio that collects all the per-vdev flushes. */
	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL)
			zio_flush(zio, vd);
		kmem_free(zv, sizeof (*zv));
	}

	/*
	 * Wait for all the flushes to complete.  Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	(void) zio_wait(zio);

	spa_config_exit(spa, SCL_STATE, FTAG);
}

/*
 * Function called when a log block write completes.
 *
 * Frees the lwb's data buffer, clears lwb_buf and lwb_tx under zl_lock,
 * and commits the tx that zil_lwb_write_start() stored in lwb_tx.
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	/* Saved now because lwb_tx is cleared before the commit below. */
	dmu_tx_t *tx = lwb->lwb_tx;

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(zio->io_bp->blk_fill == 0);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing
	 * the txg. If we have had an allocation failure and
	 * the txg is waiting to sync then we want zil_sync()
	 * to remove the lwb so that it's not picked up as the next new
	 * one in zil_commit_writer(). zil_sync() will only remove
	 * the lwb if lwb_buf is null.
	 */
	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;
	lwb->lwb_tx = NULL;
	mutex_exit(&zilog->zl_lock);

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	dmu_tx_commit(tx);
}

/*
 * Initialize the io for a log block.
*/ static void zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) { zbookmark_t zb; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); if (zilog->zl_root_zio == NULL) { zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, ZIO_FLAG_CANFAIL); } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); } } /* * Define a limited set of intent log block sizes. * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted. */ uint64_t zil_block_buckets[] = { 4096, /* non TX_WRITE */ 8192+4096, /* data base */ 32*1024 + 4096, /* NFS writes */ UINT64_MAX }; /* * Use the slog as long as the logbias is 'latency' and the current commit size * is less than the limit or the total list size is less than 2X the limit. * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX. */ uint64_t zil_slog_limit = 1024 * 1024; #define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \ (((zilog)->zl_cur_used < zil_slog_limit) || \ ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))) /* * Start a log block write and advance to the next log block. * Calls are serialized. 
*/
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb = NULL;
	zil_chain_t *zilc;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp;
	dmu_tx_t *tx;
	uint64_t txg;
	uint64_t zil_blksz, wsz;
	int i, error;

	/*
	 * Locate the chain header inside the buffer: at the start for
	 * ZILOG2 blocks, right after the lwb_sz usable bytes otherwise.
	 * bp points at the "next block" pointer stored in that header.
	 */
	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		zilc = (zil_chain_t *)lwb->lwb_buf;
		bp = &zilc->zc_next_blk;
	} else {
		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
		bp = &zilc->zc_next_blk;
	}

	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
	 * We dirty the dataset to ensure that zil_sync() will be called
	 * to clean up in the event of allocation failure or I/O failure.
	 */
	tx = dmu_tx_create(zilog->zl_os);
	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
	txg = dmu_tx_get_txg(tx);

	lwb->lwb_tx = tx;

	/*
	 * Log blocks are pre-allocated. Here we select the size of the next
	 * block, based on size used in the last block.
	 * - first find the smallest bucket that will fit the block from a
	 *   limited set of block sizes. This is because it's faster to write
	 *   blocks allocated from the same metaslab as they are adjacent or
	 *   close.
	 * - next find the maximum from the new suggested size and an array of
	 *   previous sizes. This lessens a picket fence effect of wrongly
	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
	 *   requests.
	 *
	 * Note we only write what is used, but we can't just allocate
	 * the maximum block size because we can exhaust the available
	 * pool log space.
	 */
	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
	/* Terminates at the UINT64_MAX sentinel that ends the bucket table. */
	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
		continue;
	zil_blksz = zil_block_buckets[i];
	if (zil_blksz == UINT64_MAX)
		zil_blksz = SPA_MAXBLOCKSIZE;
	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
	for (i = 0; i < ZIL_PREV_BLKS; i++)
		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
	/* NOTE(review): the mask assumes ZIL_PREV_BLKS is a power of two. */
	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
	    USE_SLOG(zilog));
	if (!error) {
		ASSERT3U(bp->blk_birth, ==, txg);
		/* The next block reuses this checksum with the sequence word bumped. */
		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

		/*
		 * Allocate a new log write buffer (lwb).
		 */
		nlwb = zil_alloc_lwb(zilog, bp, txg);

		/* Record the block for later vdev flushing */
		zil_add_block(zilog, &lwb->lwb_blk);
	}

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		/* For Slim ZIL only write what is used. */
		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
		ASSERT3U(wsz, <=, lwb->lwb_sz);
		zio_shrink(lwb->lwb_zio, wsz);

	} else {
		wsz = lwb->lwb_sz;
	}

	zilc->zc_pad = 0;
	zilc->zc_nused = lwb->lwb_nused;
	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;

	/*
	 * clear unused data for security
	 */
	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);

	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */

	/*
	 * If there was an allocation failure then nlwb will be null which
	 * forces a txg_wait_synced().
	 */
	return (nlwb);
}

/*
 * Copy the log record of itx (and, for WR_NEED_COPY writes, its data) into
 * lwb, starting a new log block first if the record does not fit.
 *
 * Returns the lwb that later records should be added to.  Returns NULL if
 * lwb was NULL on entry or if zil_lwb_write_start() could not allocate the
 * next block.
 */
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
	lr_t *lrc = &itx->itx_lr; /* common log record */
	lr_write_t *lrw = (lr_write_t *)lrc;
	char *lr_buf;
	uint64_t txg = lrc->lrc_txg;
	uint64_t reclen = lrc->lrc_reclen;
	uint64_t dlen = 0;

	if (lwb == NULL)
		return (NULL);

	ASSERT(lwb->lwb_buf != NULL);

	/* dlen: write data copied after the record, rounded up to 8 bytes. */
	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
		dlen = P2ROUNDUP_TYPED(
		    lrw->lr_length, sizeof (uint64_t), uint64_t);

	zilog->zl_cur_used += (reclen + dlen);

	zil_lwb_write_init(zilog, lwb);

	/*
	 * If this record won't fit in the current log block, start a new one.
	 */
	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
		lwb = zil_lwb_write_start(zilog, lwb);
		if (lwb == NULL)
			return (NULL);
		zil_lwb_write_init(zilog, lwb);
		ASSERT(LWB_EMPTY(lwb));
		/* Too big even for an empty block: wait for the txg to sync instead. */
		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
			txg_wait_synced(zilog->zl_dmu_pool, txg);
			return (lwb);
		}
	}

	/* From here on lrc/lrw refer to the copy inside the log block. */
	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
	bcopy(lrc, lr_buf, reclen);
	lrc = (lr_t *)lr_buf;
	lrw = (lr_write_t *)lrc;

	/*
	 * If it's a write, fetch the data or get its blkptr as appropriate.
	 */
	if (lrc->lrc_txtype == TX_WRITE) {
		if (txg > spa_freeze_txg(zilog->zl_spa))
			txg_wait_synced(zilog->zl_dmu_pool, txg);
		if (itx->itx_wr_state != WR_COPIED) {
			char *dbuf;
			int error;

			if (dlen) {
				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
				dbuf = lr_buf + reclen;
				lrw->lr_common.lrc_reclen += dlen;
			} else {
				ASSERT(itx->itx_wr_state == WR_INDIRECT);
				dbuf = NULL;
			}
			error = zilog->zl_get_data(
			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
			/*
			 * On any error the record is not added: lwb_nused is
			 * left unchanged.  EIO additionally waits for the txg
			 * to sync.
			 */
			if (error == EIO) {
				txg_wait_synced(zilog->zl_dmu_pool, txg);
				return (lwb);
			}
			if (error) {
				ASSERT(error == ENOENT || error == EEXIST ||
				    error == EALREADY);
				return (lwb);
			}
		}
	}

	/*
	 * We're actually making an entry, so update lrc_seq to be the
	 * log record sequence number.  Note that this is generally not
	 * equal to the itx sequence number because not all transactions
	 * are synchronous, and sometimes spa_sync() gets there first.
	 */
	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */

	lwb->lwb_nused += reclen + dlen;
	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);

	return (lwb);
}

/*
 * Allocate an in-memory intent log transaction (itx) whose embedded log
 * record has room for lrsize bytes (rounded up to a multiple of 8).
 * The itx is marked synchronous by default.  Free it with
 * zil_itx_destroy().
 */
itx_t *
zil_itx_create(uint64_t txtype, size_t lrsize)
{
	itx_t *itx;

	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);

	/*
	 * NOTE(review): the next two lines still carry unified-diff markers:
	 * '-' is the pre-patch allocation, '+' the post-patch one that adds
	 * KM_NODEBUG.  They are kept exactly as found.
	 */
-	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
+	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP|KM_NODEBUG);
	itx->itx_lr.lrc_txtype = txtype;
	itx->itx_lr.lrc_reclen = lrsize;
	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
	itx->itx_lr.lrc_seq = 0;	/* defensive */
	itx->itx_sync = B_TRUE;		/* default is synchronous */

	return (itx);
}

/*
 * Free an itx allocated by zil_itx_create().  The size is recomputed from
 * lrc_reclen, so that field must still hold the value set at creation.
 */
void
zil_itx_destroy(itx_t *itx)
{
	kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
}

/*
 * Free up the sync and async itxs.  The itxs_t has already been detached
 * so no locks are needed.
 */
static void
zil_itxg_clean(itxs_t *itxs)
{
	itx_t *itx;
	list_t *list;
	avl_tree_t *t;
	void *cookie;
	itx_async_node_t *ian;

	/* Synchronous itxs first. */
	list = &itxs->i_sync_list;
	while ((itx = list_head(list)) != NULL) {
		list_remove(list, itx);
		kmem_free(itx, offsetof(itx_t, itx_lr) +
		    itx->itx_lr.lrc_reclen);
	}

	/* Then every per-object async list, along with its tree node. */
	cookie = NULL;
	t = &itxs->i_async_tree;
	while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
		list = &ian->ia_list;
		while ((itx = list_head(list)) != NULL) {
			list_remove(list, itx);
			kmem_free(itx, offsetof(itx_t, itx_lr) +
			    itx->itx_lr.lrc_reclen);
		}
		list_destroy(list);
		kmem_free(ian, sizeof (itx_async_node_t));
	}
	avl_destroy(t);

	kmem_free(itxs, sizeof (itxs_t));
}

/*
 * AVL comparator for the async itx tree: orders itx_async_node_t entries
 * by ia_foid.  Returns -1, 0 or 1.
 */
static int
zil_aitx_compare(const void *x1, const void *x2)
{
	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;

	if (o1 < o2)
		return (-1);
	if (o1 > o2)
		return (1);

	return (0);
}

/*
 * Remove all async itx with the given oid.
*/ static void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; list_t clean_list; itx_t *itx; ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; ian = avl_find(t, &oid, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } while ((itx = list_head(&clean_list)) != NULL) { list_remove(&clean_list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } list_destroy(&clean_list); } void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t txg; itxg_t *itxg; itxs_t *itxs, *clean = NULL; /* * Object ids can be re-instantiated in the next txg so * remove any async transactions to avoid future leaks. * This can happen if a fsync occurs on the re-instantiated * object for a WR_INDIRECT or WR_NEED_COPY write, which gets * the new file data and flushes a write record for the old object. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) zil_remove_async(zilog, itx->itx_oid); /* * Ensure the data of a renamed file is committed before the rename. 
*/ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { /* * The zil_clean callback hasn't got around to cleaning * this itxg. Save the itxs for release below. * This should be rare. */ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); itxg->itxg_sod = 0; clean = itxg->itxg_itxs; } ASSERT(itxg->itxg_sod == 0); itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); list_create(&itxs->i_sync_list, sizeof (itx_t), offsetof(itx_t, itx_node)); avl_create(&itxs->i_async_tree, zil_aitx_compare, sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod); itxg->itxg_sod += itx->itx_sod; } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; itx_async_node_t *ian; avl_index_t where; ian = avl_find(t, &foid, &where); if (ian == NULL) { ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; avl_insert(t, ian, where); } list_insert_tail(&ian->ia_list, itx); } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ if (clean != NULL) zil_itxg_clean(clean); } /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. 
*/
void
zil_clean(zilog_t *zilog, uint64_t synced_txg)
{
	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
	itxs_t *stale;

	mutex_enter(&itxg->itxg_lock);
	/* Nothing attached, or the slot is reserved for ziltest: leave it. */
	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
		mutex_exit(&itxg->itxg_lock);
		return;
	}
	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
	ASSERT(itxg->itxg_txg != 0);
	ASSERT(zilog->zl_clean_taskq != NULL);

	/* Take this group's bytes off the global total, then detach it. */
	atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
	itxg->itxg_sod = 0;
	stale = itxg->itxg_itxs;
	itxg->itxg_itxs = NULL;
	itxg->itxg_txg = 0;
	mutex_exit(&itxg->itxg_lock);

	/*
	 * Preferably start a task queue to free up the old itxs but
	 * if taskq_dispatch can't allocate resources to do that then
	 * free it in-line. This should be rare. Note, using TQ_SLEEP
	 * created a bad performance problem.
	 */
	if (taskq_dispatch(zilog->zl_clean_taskq,
	    (void (*)(void *))zil_itxg_clean, stale, TQ_NOSLEEP) != 0)
		return;

	zil_itxg_clean(stale);
}

/*
 * Get the list of itxs to commit into zl_itx_commit_list.
 *
 * The sync list of every in-memory txg group is moved to the tail of the
 * commit list, and the bytes moved are subtracted from zl_itx_list_sz.
 */
static void
zil_get_commit_list(zilog_t *zilog)
{
	list_t *commit_list = &zilog->zl_itx_commit_list;
	uint64_t otxg, txg;
	uint64_t moved_sod = 0;

	/* ziltest support: a frozen pool keeps everything in ZILTEST_TXG */
	otxg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
	    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;

	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

		mutex_enter(&itxg->itxg_lock);
		/* Only harvest the slot if it really holds this txg. */
		if (itxg->itxg_txg == txg) {
			list_move_tail(commit_list,
			    &itxg->itxg_itxs->i_sync_list);
			moved_sod += itxg->itxg_sod;
			itxg->itxg_sod = 0;
		}
		mutex_exit(&itxg->itxg_lock);
	}
	atomic_add_64(&zilog->zl_itx_list_sz, -moved_sod);
}

/*
 * Move the async itxs for a specified object to commit into sync lists.
*/
static void
zil_async_to_sync(zilog_t *zilog, uint64_t foid)
{
	uint64_t otxg, txg;
	itx_async_node_t *ian;
	avl_tree_t *t;
	avl_index_t where;

	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
		otxg = ZILTEST_TXG;
	else
		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;

	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

		mutex_enter(&itxg->itxg_lock);
		/* The slot holds a different txg: skip it. */
		if (itxg->itxg_txg != txg) {
			mutex_exit(&itxg->itxg_lock);
			continue;
		}

		/*
		 * If a foid is specified then find that node and append its
		 * list. Otherwise walk the tree appending all the lists
		 * to the sync list. We add to the end rather than the
		 * beginning to ensure the create has happened.
		 */
		t = &itxg->itxg_itxs->i_async_tree;
		if (foid != 0) {
			/* The (now empty) node is left in the tree for reuse. */
			ian = avl_find(t, &foid, &where);
			if (ian != NULL) {
				list_move_tail(&itxg->itxg_itxs->i_sync_list,
				    &ian->ia_list);
			}
		} else {
			void *cookie = NULL;

			/* Moving everything: the tree nodes are freed too. */
			while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
				list_move_tail(&itxg->itxg_itxs->i_sync_list,
				    &ian->ia_list);
				list_destroy(&ian->ia_list);
				kmem_free(ian, sizeof (itx_async_node_t));
			}
		}
		mutex_exit(&itxg->itxg_lock);
	}
}

/*
 * Write out every itx on the commit list and wait for the log blocks to
 * reach stable storage.
 *
 * Called with zl_lock held; the lock is dropped for the duration of the
 * work and re-acquired before every return.
 */
static void
zil_commit_writer(zilog_t *zilog)
{
	uint64_t txg;
	itx_t *itx;
	lwb_t *lwb;
	spa_t *spa = zilog->zl_spa;
	int error = 0;

	ASSERT(zilog->zl_root_zio == NULL);

	mutex_exit(&zilog->zl_lock);

	zil_get_commit_list(zilog);

	/*
	 * Return if there's nothing to commit before we dirty the fs by
	 * calling zil_create().
	 */
	if (list_head(&zilog->zl_itx_commit_list) == NULL) {
		mutex_enter(&zilog->zl_lock);
		return;
	}

	/*
	 * While suspended no log blocks are written: lwb stays NULL, which
	 * makes the code below fall back to txg_wait_synced().
	 */
	if (zilog->zl_suspend) {
		lwb = NULL;
	} else {
		lwb = list_tail(&zilog->zl_lwb_list);
		if (lwb == NULL)
			lwb = zil_create(zilog);
	}

	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
	while ((itx = list_head(&zilog->zl_itx_commit_list))) {
		txg = itx->itx_lr.lrc_txg;
		ASSERT(txg);

		/* Itxs whose txg has already synced are not logged. */
		if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
			lwb = zil_lwb_commit(zilog, itx, lwb);
		list_remove(&zilog->zl_itx_commit_list, itx);
		/*
		 * zil_lwb_commit() only alters lrc_reclen in the copy held in
		 * the log block, so the itx still records its allocated size
		 * and can be freed through zil_itx_destroy().
		 */
		zil_itx_destroy(itx);
	}
	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);

	/* write the last block out */
	if (lwb != NULL && lwb->lwb_zio != NULL)
		lwb = zil_lwb_write_start(zilog, lwb);

	zilog->zl_cur_used = 0;

	/*
	 * Wait if necessary for the log blocks to be on stable storage.
	 */
	if (zilog->zl_root_zio) {
		error = zio_wait(zilog->zl_root_zio);
		zilog->zl_root_zio = NULL;
		zil_flush_vdevs(zilog);
	}

	/* A failed write or missing lwb means the log cannot be relied on. */
	if (error || lwb == NULL)
		txg_wait_synced(zilog->zl_dmu_pool, 0);

	mutex_enter(&zilog->zl_lock);

	/*
	 * Remember the highest committed log sequence number for ztest.
	 * We only update this value when all the log writes succeeded,
	 * because ztest wants to ASSERT that it got the whole log chain.
	 */
	if (error == 0 && lwb != NULL)
		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
}

/*
 * Commit zfs transactions to stable storage.
 * If foid is 0 push out all transactions, otherwise push only those
 * for that object or might reference that object.
 *
 * itxs are committed in batches. In a heavily stressed zil there will be
 * a commit writer thread who is writing out a bunch of itxs to the log
 * for a set of committing threads (cthreads) in the same batch as the writer.
 * Those cthreads are all waiting on the same cv for that batch.
 *
 * There will also be a different and growing batch of threads that are
 * waiting to commit (qthreads).  When the committing batch completes
 * a transition occurs such that the cthreads exit and the qthreads become
 * cthreads.
One of the new cthreads becomes the writer thread for the * batch. Any new threads arriving become new qthreads. * * Only 2 condition variables are needed and there's no transition * between the two cvs needed. They just flip-flop between qthreads * and cthreads. * * Using this scheme we can efficiently wakeup up only those threads * that have been committed. */ void zil_commit(zilog_t *zilog, uint64_t foid) { uint64_t mybatch; if (zilog->zl_sync == ZFS_SYNC_DISABLED) return; /* move the async itxs for the foid to the sync queues */ zil_async_to_sync(zilog, foid); mutex_enter(&zilog->zl_lock); mybatch = zilog->zl_next_batch; while (zilog->zl_writer) { cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock); if (mybatch <= zilog->zl_com_batch) { mutex_exit(&zilog->zl_lock); return; } } zilog->zl_next_batch++; zilog->zl_writer = B_TRUE; zil_commit_writer(zilog); zilog->zl_com_batch = mybatch; zilog->zl_writer = B_FALSE; mutex_exit(&zilog->zl_lock); /* wake up one thread to become the next writer */ cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]); /* wake up all threads waiting for this batch to be committed */ cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]); } /* * Called in syncing context to free committed log blocks and update log header. */ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; /* * We don't zero out zl_destroy_txg, so make sure we don't try * to destroy it twice. 
*/ if (spa_sync_pass(spa) != 1) return; mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); zh->zh_replay_seq = *replayed_seq; *replayed_seq = 0; } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; ASSERT(list_head(&zilog->zl_lwb_list) == NULL); bzero(zh, sizeof (zil_header_t)); bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* * If this block was part of log chain that couldn't * be claimed because a device was missing during * zil_claim(), but that device later returns, * then this block could erroneously appear valid. * To guard against this, assign a new GUID to the new * log chain so it doesn't matter what blk points to. */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); zio_free_zil(spa, txg, &lwb->lwb_blk); kmem_cache_free(zil_lwb_cache, lwb); /* * If we don't have anything left in the lwb list then * we've had an allocation failure and we need to zero * out the zil_header blkptr so that we don't end * up freeing the same block twice. 
*/ if (list_head(&zilog->zl_lwb_list) == NULL) BP_ZERO(&zh->zh_log); } mutex_exit(&zilog->zl_lock); } void zil_init(void) { zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zil_fini(void) { kmem_cache_destroy(zil_lwb_cache); } void zil_set_sync(zilog_t *zilog, uint64_t sync) { zilog->zl_sync = sync; } void zil_set_logbias(zilog_t *zilog, uint64_t logbias) { zilog->zl_logbias = logbias; } zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { zilog_t *zilog; int i; zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); zilog->zl_header = zh_phys; zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); zilog->zl_next_batch = 1; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); for (i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL); return (zilog); } void zil_free(zilog_t *zilog) { lwb_t *head_lwb; int i; zilog->zl_stop_sync = 1; /* * After zil_close() there should only be one lwb with a buffer. 
*/ head_lwb = list_head(&zilog->zl_lwb_list); if (head_lwb) { ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list)); list_remove(&zilog->zl_lwb_list, head_lwb); zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz); kmem_cache_free(zil_lwb_cache, head_lwb); } list_destroy(&zilog->zl_lwb_list); avl_destroy(&zilog->zl_vdev_tree); mutex_destroy(&zilog->zl_vdev_lock); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); for (i = 0; i < TXG_SIZE; i++) { /* * It's possible for an itx to be generated that doesn't dirty * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() * callback to remove the entry. We remove those here. * * Also free up the ziltest itxs. */ if (zilog->zl_itxg[i].itxg_itxs) zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } mutex_destroy(&zilog->zl_lock); cv_destroy(&zilog->zl_cv_writer); cv_destroy(&zilog->zl_cv_suspend); cv_destroy(&zilog->zl_cv_batch[0]); cv_destroy(&zilog->zl_cv_batch[1]); kmem_free(zilog, sizeof (zilog_t)); } /* * Open an intent log. */ zilog_t * zil_open(objset_t *os, zil_get_data_t *get_data) { zilog_t *zilog = dmu_objset_zil(os); zilog->zl_get_data = get_data; zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, 2, 2, TASKQ_PREPOPULATE); return (zilog); } /* * Close an intent log. */ void zil_close(zilog_t *zilog) { lwb_t *tail_lwb; uint64_t txg = 0; zil_commit(zilog, 0); /* commit all itx */ /* * The lwb_max_txg for the stubby lwb will reflect the last activity * for the zil. After a txg_wait_synced() on the txg we know all the * callbacks have occurred that may clean the zil. Only then can we * destroy the zl_clean_taskq. 
*/
	/* Read the tail lwb's txg under zl_lock; wait for it after unlocking. */
	mutex_enter(&zilog->zl_lock);
	tail_lwb = list_tail(&zilog->zl_lwb_list);
	if (tail_lwb != NULL)
		txg = tail_lwb->lwb_max_txg;
	mutex_exit(&zilog->zl_lock);
	if (txg)
		txg_wait_synced(zilog->zl_dmu_pool, txg);

	taskq_destroy(zilog->zl_clean_taskq);
	zilog->zl_clean_taskq = NULL;
	zilog->zl_get_data = NULL;
}

/*
 * Suspend an intent log. While in suspended mode, we still honor
 * synchronous semantics, but we rely on txg_wait_synced() to do it.
 * We suspend the log briefly when taking a snapshot so that the snapshot
 * contains all the data it's supposed to, and has an empty intent log.
 *
 * Returns EBUSY if the log header has ZIL_REPLAY_NEEDED set, otherwise 0.
 * Every successful call increments zl_suspend; zil_resume() decrements it.
 */
int
zil_suspend(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;

	mutex_enter(&zilog->zl_lock);
	if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
		mutex_exit(&zilog->zl_lock);
		return (EBUSY);
	}
	if (zilog->zl_suspend++ != 0) {
		/*
		 * Someone else already began a suspend.
		 * Just wait for them to finish.
		 */
		while (zilog->zl_suspending)
			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
		mutex_exit(&zilog->zl_lock);
		return (0);
	}
	/*
	 * First suspender: flag the suspend as in progress so that later
	 * callers block on zl_cv_suspend, then drop zl_lock around the
	 * commit and destroy below.
	 */
	zilog->zl_suspending = B_TRUE;
	mutex_exit(&zilog->zl_lock);

	zil_commit(zilog, 0);

	zil_destroy(zilog, B_FALSE);

	/* Suspend finished: clear the flag and wake every waiter. */
	mutex_enter(&zilog->zl_lock);
	zilog->zl_suspending = B_FALSE;
	cv_broadcast(&zilog->zl_cv_suspend);
	mutex_exit(&zilog->zl_lock);

	return (0);
}

/*
 * Drop one suspend reference: decrement zl_suspend under zl_lock.
 * The count is asserted to be non-zero on entry.
 */
void
zil_resume(zilog_t *zilog)
{
	mutex_enter(&zilog->zl_lock);
	ASSERT(zilog->zl_suspend != 0);
	zilog->zl_suspend--;
	mutex_exit(&zilog->zl_lock);
}

/*
 * Per-replay state handed to zil_replay_log_record() through its
 * void *zra argument.
 */
typedef struct zil_replay_arg {
	zil_replay_func_t **zr_replay;	/* replay functions, indexed by txtype */
	void *zr_arg;			/* first argument of each replay call */
	boolean_t zr_byteswap;		/* record must be swapped back first */
	char *zr_lr;			/* buffer receiving the record copy */
} zil_replay_arg_t;

/*
 * Report a failed replay of one log record.
 *
 * Steps zl_replaying_seq back by one, emits a CE_WARN message naming the
 * dataset, the record's sequence number and its transaction type (with
 * "CI" appended when the TX_CI bit is set), and returns `error` unchanged
 * so callers can write `return (zil_replay_error(...))`.
 */
static int
zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
{
	char name[MAXNAMELEN];

	zilog->zl_replaying_seq--; /* didn't actually replay this one */

	dmu_objset_name(zilog->zl_os, name);

	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
	    (u_longlong_t)lr->lrc_seq,
	    (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
	    (lr->lrc_txtype & TX_CI) ? "CI" : "");

	return (error);
}

/*
 * Replay a single log record.
 *
 *	zilog		log being replayed; zl_replaying_seq is set to the
 *			record's sequence number
 *	lr		the log record
 *	zra		a zil_replay_arg_t *, passed as void *
 *	claim_txg	records with lrc_txg below this are skipped
 *
 * Returns 0 when the record is replayed or skipped, otherwise the error
 * reported through zil_replay_error().
 */
static int
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
	zil_replay_arg_t *zr = zra;
	const zil_header_t *zh = zilog->zl_header;
	uint64_t reclen = lr->lrc_reclen;
	uint64_t txtype = lr->lrc_txtype;
	int error = 0;

	zilog->zl_replaying_seq = lr->lrc_seq;

	if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
		return (0);

	if (lr->lrc_txg < claim_txg) /* already committed */
		return (0);

	/* Strip case-insensitive bit, still present in log record */
	txtype &= ~TX_CI;

	/* txtype indexes zr_replay[] below, so reject out-of-range values. */
	if (txtype == 0 || txtype >= TX_MAX_TYPE)
		return (zil_replay_error(zilog, lr, EINVAL));

	/*
	 * If this record type can be logged out of order, the object
	 * (lr_foid) may no longer exist. That's legitimate, not an error.
	 */
	if (TX_OOO(txtype)) {
		error = dmu_object_info(zilog->zl_os,
		    ((lr_ooo_t *)lr)->lr_foid, NULL);
		if (error == ENOENT || error == EEXIST)
			return (0);
	}

	/*
	 * Make a copy of the data so we can revise and extend it.
	 */
	bcopy(lr, zr->zr_lr, reclen);

	/*
	 * If this is a TX_WRITE with a blkptr, suck in the data.
	 * The data is read into the buffer directly after the record copy.
	 */
	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
		error = zil_read_log_data(zilog, (lr_write_t *)lr,
		    zr->zr_lr + reclen);
		if (error)
			return (zil_replay_error(zilog, lr, error));
	}

	/*
	 * The log block containing this lr may have been byteswapped
	 * so that we can easily examine common fields like lrc_txtype.
	 * However, the log is a mix of different record types, and only the
	 * replay vectors know how to byteswap their records. Therefore, if
	 * the lr was byteswapped, undo it before invoking the replay vector.
	 */
	if (zr->zr_byteswap)
		byteswap_uint64_array(zr->zr_lr, reclen);

	/*
	 * We must now do two things atomically: replay this log record,
	 * and update the log header sequence number to reflect the fact that
	 * we did so. At the end of each replay function the sequence number
	 * is updated if we are in replay mode.
*/
	/* First attempt: dispatch to the replay function for this txtype. */
	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);

	if (error) {
		/*
		 * The DMU's dnode layer doesn't see removes until the txg
		 * commits, so a subsequent claim can spuriously fail with
		 * EEXIST. So if we receive any error we try syncing out
		 * any removes then retry the transaction. Note that we
		 * specify B_FALSE for byteswap now, so we don't do it twice.
		 */
		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
		/* A second failure is reported and returned to the caller. */
		if (error)
			return (zil_replay_error(zilog, lr, error));
	}
	return (0);
}

/*
 * Callback passed to zil_parse() by zil_replay(): increments
 * zl_replay_blks once per invocation and always returns 0.
 * The bp, arg and claim_txg parameters are not used.
 */
/* ARGSUSED */
static int
zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	zilog->zl_replay_blks++;

	return (0);
}

/*
 * If this dataset has a non-empty intent log, replay it and destroy it.
 *
 *	os		objset whose zilog is examined
 *	arg		passed through to every replay function
 *	replay_func	replay functions, indexed by transaction type
 *
 * When the header does not have ZIL_REPLAY_NEEDED set, the log is simply
 * destroyed with zil_destroy(zilog, B_TRUE) and nothing is replayed.
 */
void
zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
{
	zilog_t *zilog = dmu_objset_zil(os);
	const zil_header_t *zh = zilog->zl_header;
	zil_replay_arg_t zr;

	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
		zil_destroy(zilog, B_TRUE);
		return;
	}

	zr.zr_replay = replay_func;
	zr.zr_arg = arg;
	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
	/*
	 * NOTE(review): the next two lines carry '-' and '+' patch markers
	 * (old kmem_alloc() call, new vmem_alloc() call); presumably only
	 * the '+' form is meant to remain once the patch is applied.
	 * Either way the buffer is 2 * SPA_MAXBLOCKSIZE bytes.
	 */
-	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+	zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);

	/*
	 * Wait for in-progress removes to sync before starting replay.
*/
	txg_wait_synced(zilog->zl_dmu_pool, 0);

	/* zl_replay stays B_TRUE until the log has been destroyed and synced. */
	zilog->zl_replay = B_TRUE;
	zilog->zl_replay_time = ddi_get_lbolt();
	ASSERT(zilog->zl_replay_blks == 0);
	/* The return value of zil_parse() is deliberately discarded. */
	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
	    zh->zh_claim_txg);
	/*
	 * NOTE(review): the next two lines carry '-' and '+' patch markers
	 * (old kmem_free() call, new vmem_free() call); presumably only the
	 * '+' form is meant to remain once the patch is applied. The size
	 * matches the 2 * SPA_MAXBLOCKSIZE used for zr.zr_lr.
	 */
-	kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
+	vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);

	zil_destroy(zilog, B_FALSE);
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
	zilog->zl_replay = B_FALSE;
}

/*
 * Tell the caller whether logging should be skipped for this transaction.
 *
 * Returns B_TRUE when zl_sync is ZFS_SYNC_DISABLED, or when a replay is
 * in progress (zl_replay set). In the replay case the dataset is also
 * dirtied in tx and zl_replaying_seq is stored in the zl_replayed_seq
 * slot selected by tx's txg. Returns B_FALSE otherwise.
 */
boolean_t
zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
{
	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
		return (B_TRUE);

	if (zilog->zl_replay) {
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
		    zilog->zl_replaying_seq;
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Run a suspend/resume cycle on the zilog of the objset named osname.
 *
 * Returns the dmu_objset_hold() error if the objset cannot be held,
 * EEXIST if zil_suspend() fails, and 0 otherwise. The hold is released
 * before returning. The arg parameter is not used.
 */
/* ARGSUSED */
int
zil_vdev_offline(const char *osname, void *arg)
{
	objset_t *os;
	zilog_t *zilog;
	int error;

	error = dmu_objset_hold(osname, FTAG, &os);
	if (error)
		return (error);

	zilog = dmu_objset_zil(os);
	/* error is 0 here; it only changes if the suspend is refused. */
	if (zil_suspend(zilog) != 0)
		error = EEXIST;
	else
		zil_resume(zilog);
	dmu_objset_rele(os, FTAG);
	return (error);
}