diff --git a/include/sys/zio.h b/include/sys/zio.h
index d4350badc100..129e2bcb9b33 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -1,596 +1,594 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 */

#ifndef _ZIO_H
#define _ZIO_H

/*
 * NOTE(review): the header names of the six #include directives below are
 * missing from this copy of the file -- restore them from the original
 * before building.
 */
#include
#include
#include
#include
#include
#include

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Embedded checksum: a 64-bit magic word followed by a 256-bit checksum.
 * zio_gbh_phys_t embeds one of these as its zg_tail member.
 */
#define ZEC_MAGIC 0x210da7ab10c7a11ULL

typedef struct zio_eck {
	uint64_t	zec_magic;	/* for validation, endianness */
	zio_cksum_t	zec_cksum;	/* 256-bit checksum */
} zio_eck_t;

/*
 * Gang block headers are self-checksumming and contain an array
 * of block pointers.
*/
#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
/* Number of whole blkptr_t entries that fit in front of the zio_eck_t tail. */
#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
	sizeof (zio_eck_t)) / sizeof (blkptr_t))
/* Number of uint64_t words left over after the blkptr array and the tail. */
#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
	sizeof (zio_eck_t) - \
	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
	sizeof (uint64_t))

/*
 * Layout of a gang block header: the block pointer array, filler words,
 * and the embedded checksum as the last member.
 */
typedef struct zio_gbh {
	blkptr_t	zg_blkptr[SPA_GBH_NBLKPTRS];
	uint64_t	zg_filler[SPA_GBH_FILLER];
	zio_eck_t	zg_tail;
} zio_gbh_phys_t;

/*
 * Checksum selectors.  ZIO_CHECKSUM_FUNCTIONS is the last enumerator, so
 * its value equals the number of selectors declared before it.
 */
enum zio_checksum {
	ZIO_CHECKSUM_INHERIT = 0,
	ZIO_CHECKSUM_ON,
	ZIO_CHECKSUM_OFF,
	ZIO_CHECKSUM_LABEL,
	ZIO_CHECKSUM_GANG_HEADER,
	ZIO_CHECKSUM_ZILOG,
	ZIO_CHECKSUM_FLETCHER_2,
	ZIO_CHECKSUM_FLETCHER_4,
	ZIO_CHECKSUM_SHA256,
	ZIO_CHECKSUM_ZILOG2,
	ZIO_CHECKSUM_FUNCTIONS
};

#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON

#define ZIO_CHECKSUM_MASK 0xffULL
#define ZIO_CHECKSUM_VERIFY (1 << 8)

#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
#define ZIO_DEDUPDITTO_MIN 100

/*
 * Compression selectors.  ZIO_COMPRESS_FUNCTIONS is the last enumerator, so
 * its value equals the number of selectors declared before it.
 */
enum zio_compress {
	ZIO_COMPRESS_INHERIT = 0,
	ZIO_COMPRESS_ON,
	ZIO_COMPRESS_OFF,
	ZIO_COMPRESS_LZJB,
	ZIO_COMPRESS_EMPTY,
	ZIO_COMPRESS_GZIP_1,
	ZIO_COMPRESS_GZIP_2,
	ZIO_COMPRESS_GZIP_3,
	ZIO_COMPRESS_GZIP_4,
	ZIO_COMPRESS_GZIP_5,
	ZIO_COMPRESS_GZIP_6,
	ZIO_COMPRESS_GZIP_7,
	ZIO_COMPRESS_GZIP_8,
	ZIO_COMPRESS_GZIP_9,
	ZIO_COMPRESS_ZLE,
	ZIO_COMPRESS_LZ4,
	ZIO_COMPRESS_FUNCTIONS
};

#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF

/*
 * True when (compress) is LZJB, LZ4 or OFF, or when it is ON while
 * ZIO_COMPRESS_ON_VALUE is LZJB.  The argument is evaluated several times.
 */
#define BOOTFS_COMPRESS_VALID(compress) \
	((compress) == ZIO_COMPRESS_LZJB || \
	(compress) == ZIO_COMPRESS_LZ4 || \
	((compress) == ZIO_COMPRESS_ON && \
	ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
	(compress) == ZIO_COMPRESS_OFF)

/*
 * Default Linux timeout for a sd device.
*/
#define ZIO_DELAY_MAX (30 * MILLISEC)

#define ZIO_FAILURE_MODE_WAIT 0
#define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2

/*
 * I/O priorities.  ZIO_PRIORITY_NUM_QUEUEABLE follows the queueable
 * priorities, so its value equals their count; ZIO_PRIORITY_NOW comes
 * after it.
 */
typedef enum zio_priority {
	ZIO_PRIORITY_SYNC_READ,
	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
	ZIO_PRIORITY_NUM_QUEUEABLE,

	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
} zio_priority_t;

#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101

/*
 * Per-zio flag bits.  The flags are grouped by which kinds of child zio
 * inherit them; each *_INHERIT mask is "first flag of the next group - 1",
 * i.e. every bit below that flag, which is why the first flag of each
 * group is marked "must be first for INHERIT".
 */
enum zio_flag {
	/*
	 * Flags inherited by gang, ddt, and vdev children,
	 * and that must be equal for two zios to aggregate
	 */
	ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
	ZIO_FLAG_IO_REPAIR = 1 << 1,
	ZIO_FLAG_SELF_HEAL = 1 << 2,
	ZIO_FLAG_RESILVER = 1 << 3,
	ZIO_FLAG_SCRUB = 1 << 4,
	ZIO_FLAG_SCAN_THREAD = 1 << 5,

#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)

	/*
	 * Flags inherited by ddt, gang, and vdev children.
	 */
	ZIO_FLAG_CANFAIL = 1 << 6,	/* must be first for INHERIT */
	ZIO_FLAG_SPECULATIVE = 1 << 7,
	ZIO_FLAG_CONFIG_WRITER = 1 << 8,
	ZIO_FLAG_DONT_RETRY = 1 << 9,
	ZIO_FLAG_DONT_CACHE = 1 << 10,
	ZIO_FLAG_NODATA = 1 << 11,
	ZIO_FLAG_INDUCE_DAMAGE = 1 << 12,

#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)

	/*
	 * Flags inherited by vdev children.
	 */
	ZIO_FLAG_IO_RETRY = 1 << 13,	/* must be first for INHERIT */
	ZIO_FLAG_PROBE = 1 << 14,
	ZIO_FLAG_TRYHARD = 1 << 15,
	ZIO_FLAG_OPTIONAL = 1 << 16,

#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)

	/*
	 * Flags not inherited by any children.
	 */
	ZIO_FLAG_DONT_QUEUE = 1 << 17,	/* must be first for INHERIT */
	ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
	ZIO_FLAG_IO_BYPASS = 1 << 19,
	ZIO_FLAG_IO_REWRITE = 1 << 20,
	ZIO_FLAG_RAW = 1 << 21,
	ZIO_FLAG_GANG_CHILD = 1 << 22,
	ZIO_FLAG_DDT_CHILD = 1 << 23,
	ZIO_FLAG_GODFATHER = 1 << 24,
	ZIO_FLAG_NOPWRITE = 1 << 25,
	ZIO_FLAG_REEXECUTED = 1 << 26,
	ZIO_FLAG_DELEGATED = 1 << 27,
	ZIO_FLAG_FASTWRITE = 1 << 28
};

#define ZIO_FLAG_MUSTSUCCEED 0

/*
 * Flags for a child zio of the given kind: the parent's io_flags masked
 * with the matching *_INHERIT mask, with ZIO_FLAG_CANFAIL (and, for ddt
 * and gang children, the corresponding *_CHILD flag) always set.
 */
#define ZIO_DDT_CHILD_FLAGS(zio) \
	(((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \
	ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)

#define ZIO_GANG_CHILD_FLAGS(zio) \
	(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \
	ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)

#define ZIO_VDEV_CHILD_FLAGS(zio) \
	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
	ZIO_FLAG_CANFAIL)

/* Kinds of child zio; ZIO_CHILD_TYPES is the count (used as array size). */
enum zio_child {
	ZIO_CHILD_VDEV = 0,
	ZIO_CHILD_GANG,
	ZIO_CHILD_DDT,
	ZIO_CHILD_LOGICAL,
	ZIO_CHILD_TYPES
};

/* Wait types; ZIO_WAIT_TYPES is the count (used as array size). */
enum zio_wait_type {
	ZIO_WAIT_READY = 0,
	ZIO_WAIT_DONE,
	ZIO_WAIT_TYPES
};

/*
 * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent
 * graveyard) to indicate checksum errors and fragmentation.
 */
#define ECKSUM EBADE
#define EFRAGS EBADR

/* Signature shared by the zio callback pointers (io_ready, io_done, ...). */
typedef void zio_done_func_t(zio_t *zio);

extern const char *zio_type_name[ZIO_TYPES];

/*
 * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
 * identifies any block in the pool.  By convention, the meta-objset (MOS)
 * is objset 0, and the meta-dnode is object 0.  This covers all blocks
 * except root blocks and ZIL blocks, which are defined as follows:
 *
 * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
 * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
 * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
 *
 * Note: this structure is called a bookmark because its original purpose
 * was to remember where to resume a pool-wide traverse.
 *
 * Note: this structure is passed between userland and the kernel.
 * Therefore it must not change size or alignment between 32/64 bit
 * compilation options.
*/
struct zbookmark {
	uint64_t	zb_objset;
	uint64_t	zb_object;
	int64_t		zb_level;
	uint64_t	zb_blkid;
-	char *		zb_func;
};

/*
 * Fill in all four fields of the bookmark pointed to by (zb).  Expands to
 * a bare brace block, not a do { } while (0) statement.
 */
#define SET_BOOKMARK(zb, objset, object, level, blkid) \
{ \
	(zb)->zb_objset = objset; \
	(zb)->zb_object = object; \
	(zb)->zb_level = level; \
	(zb)->zb_blkid = blkid; \
-	(zb)->zb_func = FTAG; \
}

#define ZB_DESTROYED_OBJSET (-1ULL)

/* Field values that identify a root block (see ZB_IS_ROOT). */
#define ZB_ROOT_OBJECT (0ULL)
#define ZB_ROOT_LEVEL (-1LL)
#define ZB_ROOT_BLKID (0ULL)

#define ZB_ZIL_OBJECT (0ULL)
#define ZB_ZIL_LEVEL (-2LL)

/* True when all four bookmark fields are zero. */
#define ZB_IS_ZERO(zb) \
	((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
/* True when object, level and blkid match the ZB_ROOT_* values. */
#define ZB_IS_ROOT(zb) \
	((zb)->zb_object == ZB_ROOT_OBJECT && \
	(zb)->zb_level == ZB_ROOT_LEVEL && \
	(zb)->zb_blkid == ZB_ROOT_BLKID)

/* Write properties carried in struct zio (io_prop) and passed to zio_write(). */
typedef struct zio_prop {
	enum zio_checksum	zp_checksum;
	enum zio_compress	zp_compress;
	dmu_object_type_t	zp_type;
	uint8_t			zp_level;
	uint8_t			zp_copies;
	boolean_t		zp_dedup;
	boolean_t		zp_dedup_verify;
	boolean_t		zp_nopwrite;
} zio_prop_t;

typedef struct zio_cksum_report zio_cksum_report_t;

/* Callback types stored in zio_cksum_report (zcr_finish / zcr_free). */
typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
    const void *good_data);
typedef void zio_cksum_free_f(void *cbdata, size_t size);

struct zio_bad_cksum;				/* defined in zio_checksum.h */
struct dnode_phys;

/* One checksum error report; reports are chained through zcr_next. */
struct zio_cksum_report {
	struct zio_cksum_report *zcr_next;
	nvlist_t		*zcr_ereport;
	nvlist_t		*zcr_detector;
	void			*zcr_cbdata;
	size_t			zcr_cbinfo;	/* passed to zcr_free() */
	uint64_t		zcr_align;
	uint64_t		zcr_length;
	zio_cksum_finish_f	*zcr_finish;
	zio_cksum_free_f	*zcr_free;

	/* internal use only */
	struct zio_bad_cksum	*zcr_ckinfo;	/* information from failure */
};

typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
    void *arg);

zio_vsd_cksum_report_f zio_vsd_default_cksum_report;

/* Operations table referenced by a zio's io_vsd_ops pointer. */
typedef struct zio_vsd_ops {
	zio_done_func_t		*vsd_free;
	zio_vsd_cksum_report_f	*vsd_cksum_report;
} zio_vsd_ops_t;

/*
 * Node of a gang tree: a gang block header plus one child slot per block
 * pointer in that header.
 */
typedef struct zio_gang_node {
	zio_gbh_phys_t		*gn_gbh;
	struct zio_gang_node	*gn_child[SPA_GBH_NBLKPTRS];
} zio_gang_node_t;
/* Function types used by the gang, transform and pipeline code. */
typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
    zio_gang_node_t *gn, void *data);

typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size);

/* Entry of a zio's transform stack; entries are chained through zt_next. */
typedef struct zio_transform {
	void			*zt_orig_data;
	uint64_t		zt_orig_size;
	uint64_t		zt_bufsize;
	zio_transform_func_t	*zt_transform;
	struct zio_transform	*zt_next;
} zio_transform_t;

typedef int zio_pipe_stage_t(zio_t *zio);

/*
 * The io_reexecute flags are distinct from io_flags because the child must
 * be able to propagate them to the parent.  The normal io_flags are local
 * to the zio, not protected by any lock, and not modifiable by children;
 * the reexecute flags are protected by io_lock, modifiable by children,
 * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
 */
#define ZIO_REEXECUTE_NOW 0x01
#define ZIO_REEXECUTE_SUSPEND 0x02

/*
 * Links one parent zio to one child zio.  It carries two list nodes, so a
 * single link can sit on two lists at once (presumably the parent's
 * io_child_list and the child's io_parent_list -- confirm in zio.c).
 */
typedef struct zio_link {
	zio_t		*zl_parent;
	zio_t		*zl_child;
	list_node_t	zl_parent_node;
	list_node_t	zl_child_node;
} zio_link_t;

/* The zio itself: all state carried by a single I/O request. */
struct zio {
	/* Core information about this I/O */
	zbookmark_t	io_bookmark;
	zio_prop_t	io_prop;
	zio_type_t	io_type;
	enum zio_child	io_child_type;
	int		io_cmd;
	zio_priority_t	io_priority;
	uint8_t		io_reexecute;
	uint8_t		io_state[ZIO_WAIT_TYPES];
	uint64_t	io_txg;
	spa_t		*io_spa;
	blkptr_t	*io_bp;
	blkptr_t	*io_bp_override;
	blkptr_t	io_bp_copy;
	list_t		io_parent_list;
	list_t		io_child_list;
	zio_link_t	*io_walk_link;
	zio_t		*io_logical;
	zio_transform_t *io_transform_stack;

	/* Callback info */
	zio_done_func_t *io_ready;
	zio_done_func_t *io_physdone;
	zio_done_func_t	*io_done;
	void		*io_private;
	int64_t		io_prev_space_delta;	/* DMU private */
	blkptr_t	io_bp_orig;

	/* Data represented by this I/O */
	void		*io_data;
	void		*io_orig_data;
	uint64_t	io_size;
	uint64_t	io_orig_size;

	/* Stuff for the vdev stack */
	vdev_t		*io_vd;
	void		*io_vsd;
	const zio_vsd_ops_t *io_vsd_ops;

	uint64_t	io_offset;
	hrtime_t	io_timestamp;	/* submitted at */
	hrtime_t	io_delta;	/* vdev queue service delta */
	uint64_t	io_delay;	/* vdev disk service delta (ticks) */
	avl_node_t	io_queue_node;

	/* Internal pipeline state */
	enum zio_flag	io_flags;
	enum zio_stage	io_stage;
	enum zio_stage	io_pipeline;
	enum zio_flag	io_orig_flags;
	enum zio_stage	io_orig_stage;
	enum zio_stage	io_orig_pipeline;
	int		io_error;
	int		io_child_error[ZIO_CHILD_TYPES];
	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
	uint64_t	io_child_count;
	uint64_t	io_phys_children;
	uint64_t	io_parent_count;
	uint64_t	*io_stall;
	zio_t		*io_gang_leader;
	zio_gang_node_t	*io_gang_tree;
	void		*io_executor;
	void		*io_waiter;
	kmutex_t	io_lock;
	kcondvar_t	io_cv;

	/* FMA state */
	zio_cksum_report_t *io_cksum_report;
	uint64_t	io_ena;

	/* Taskq dispatching state */
	taskq_ent_t	io_tqent;
};

/*
 * Functions that return a zio_t *.  Note that zio_read() and zio_write()
 * take a const zbookmark_t *, while zio_rewrite() takes a non-const one.
 */
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
    zio_done_func_t *done, void *private, enum zio_flag flags);

extern zio_t *zio_root(spa_t *spa,
    zio_done_func_t *done, void *private, enum zio_flag flags);

extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb);

extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb);

extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb);

extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
    boolean_t nopwrite);

extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);

extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
    const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags);

extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, enum zio_flag flags);

extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
    uint64_t size, void *data, int checksum,
    zio_done_func_t *done, void *private, zio_priority_t priority,
    enum zio_flag flags, boolean_t labels);

extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
    uint64_t size, void *data, int checksum,
    zio_done_func_t *done, void *private, zio_priority_t priority,
    enum zio_flag flags, boolean_t labels);

extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
    const blkptr_t *bp, enum zio_flag flags);

extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
    uint64_t size, boolean_t use_slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_shrink(zio_t *zio, uint64_t size);

extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio);

extern zio_t *zio_walk_parents(zio_t *cio);
extern zio_t *zio_walk_children(zio_t *pio);
extern zio_t *zio_unique_parent(zio_t *cio);
extern void zio_add_child(zio_t *pio, zio_t *cio);

/* Buffer helpers; the free routines take the size again. */
extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
extern void *zio_vdev_alloc(void);
extern void zio_vdev_free(void *buf);

extern void zio_resubmit_stage_async(void *);

extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
    uint64_t offset, void *data, uint64_t size, int type,
    zio_priority_t priority, enum zio_flag flags,
    zio_done_func_t *done, void *private);

extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private);

extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
extern void zio_vdev_io_redone(zio_t *zio);

extern void zio_checksum_verified(zio_t *zio);
extern int zio_worst_error(int e1, int e2);

extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
    enum zio_checksum parent);
extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
    enum zio_checksum child, enum zio_checksum parent);
extern enum zio_compress zio_compress_select(enum zio_compress child,
    enum zio_compress parent);

extern void zio_suspend(spa_t *spa, zio_t *zio);
extern int zio_resume(spa_t *spa);
extern void zio_resume_wait(spa_t *spa);

/*
 * Initial setup and teardown.
 */
extern void zio_init(void);
extern void zio_fini(void);

/*
 * Fault injection
 */
struct zinject_record;
extern uint32_t zio_injection_enabled;
extern int zio_inject_fault(char *name, int flags, int *id,
    struct zinject_record *record);
extern int zio_inject_list_next(int *id, char *name, size_t buflen,
    struct zinject_record *record);
extern int zio_clear_fault(int id);
extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
extern void zio_handle_ignored_writes(zio_t *zio);
extern uint64_t zio_handle_io_delay(zio_t *zio);

/*
 * Checksum ereport functions
 */
extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio,
    uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info);
extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
    const void *good_data, const void *bad_data, boolean_t drop_if_identical);

extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);

/* If we have the good data in hand, this function can be used */
extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
    struct zio *zio, uint64_t offset, uint64_t length,
    const void *good_data, const void *bad_data, struct zio_bad_cksum *info);

/* Called from spa_sync(), but primarily an injection handler */
extern void spa_handle_ignored_writes(spa_t *spa);

/* zbookmark functions */
boolean_t zbookmark_is_before(const struct dnode_phys *dnp,
    const zbookmark_t *zb1, const zbookmark_t *zb2);

#ifdef __cplusplus
}
#endif

#endif /* _ZIO_H */
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
index a94fecfe87f8..a35f5df65609 100644
--- a/module/zfs/spa_stats.c
+++ b/module/zfs/spa_stats.c
@@ -1,692 +1,691 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * NOTE(review): the header names of the two #include directives below are
 * missing from this copy of the file -- restore them from the original
 * before building.
 */
#include
#include

/*
 * Keeps stats on last N reads per spa_t, disabled by default.
 */
int zfs_read_history = 0;

/*
 * Include cache hits in history, disabled by default.
 */
int zfs_read_history_hits = 0;

/*
 * Keeps stats on the last N txgs, disabled by default.
*/ int zfs_txg_history = 0; /* * ========================================================================== * SPA Read History Routines * ========================================================================== */ /* * Read statistics - Information exported regarding each arc_read call */ typedef struct spa_read_history { uint64_t uid; /* unique identifier */ hrtime_t start; /* time read completed */ uint64_t objset; /* read from this objset */ uint64_t object; /* read of this object number */ uint64_t level; /* block's indirection level */ uint64_t blkid; /* read of this block id */ char origin[24]; /* read originated from here */ uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ pid_t pid; /* PID of task doing read */ char comm[16]; /* process name of task doing read */ list_node_t srh_link; } spa_read_history_t; static int spa_read_history_headers(char *buf, size_t size) { size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", "level", "blkid", "aflags", "origin", "pid", "process"); buf[size] = '\0'; return (0); } static int spa_read_history_data(char *buf, size_t size, void *data) { spa_read_history_t *srh = (spa_read_history_t *)data; size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx " "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", (u_longlong_t)srh->uid, srh->start, (longlong_t)srh->objset, (longlong_t)srh->object, (longlong_t)srh->level, (longlong_t)srh->blkid, srh->aflags, srh->origin, srh->pid, srh->comm); buf[size] = '\0'; return (0); } /* * Calculate the address for the next spa_stats_history_t entry. The * ssh->lock will be held until ksp->ks_ndata entries are processed. 
*/ static void * spa_read_history_addr(kstat_t *ksp, loff_t n) { spa_t *spa = ksp->ks_private; spa_stats_history_t *ssh = &spa->spa_stats.read_history; ASSERT(MUTEX_HELD(&ssh->lock)); if (n == 0) ssh->private = list_tail(&ssh->list); else if (ssh->private) ssh->private = list_prev(&ssh->list, ssh->private); return (ssh->private); } /* * When the kstat is written discard all spa_read_history_t entires. The * ssh->lock will be held until ksp->ks_ndata entries are processed. */ static int spa_read_history_update(kstat_t *ksp, int rw) { spa_t *spa = ksp->ks_private; spa_stats_history_t *ssh = &spa->spa_stats.read_history; if (rw == KSTAT_WRITE) { spa_read_history_t *srh; while ((srh = list_remove_head(&ssh->list))) { ssh->size--; kmem_free(srh, sizeof (spa_read_history_t)); } ASSERT3U(ssh->size, ==, 0); } ksp->ks_ndata = ssh->size; ksp->ks_data_size = ssh->size * sizeof (spa_read_history_t); return (0); } static void spa_read_history_init(spa_t *spa) { spa_stats_history_t *ssh = &spa->spa_stats.read_history; char name[KSTAT_STRLEN]; kstat_t *ksp; mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); list_create(&ssh->list, sizeof (spa_read_history_t), offsetof(spa_read_history_t, srh_link)); ssh->count = 0; ssh->size = 0; ssh->private = NULL; (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); name[KSTAT_STRLEN-1] = '\0'; ksp = kstat_create(name, 0, "reads", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); ssh->kstat = ksp; if (ksp) { ksp->ks_lock = &ssh->lock; ksp->ks_data = NULL; ksp->ks_private = spa; ksp->ks_update = spa_read_history_update; kstat_set_raw_ops(ksp, spa_read_history_headers, spa_read_history_data, spa_read_history_addr); kstat_install(ksp); } } static void spa_read_history_destroy(spa_t *spa) { spa_stats_history_t *ssh = &spa->spa_stats.read_history; spa_read_history_t *srh; kstat_t *ksp; ksp = ssh->kstat; if (ksp) kstat_delete(ksp); mutex_enter(&ssh->lock); while ((srh = list_remove_head(&ssh->list))) { ssh->size--; kmem_free(srh, sizeof 
(spa_read_history_t)); } ASSERT3U(ssh->size, ==, 0); list_destroy(&ssh->list); mutex_exit(&ssh->lock); mutex_destroy(&ssh->lock); } void spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) { spa_stats_history_t *ssh = &spa->spa_stats.read_history; spa_read_history_t *srh, *rm; ASSERT3P(spa, !=, NULL); ASSERT3P(zb, !=, NULL); if (zfs_read_history == 0 && ssh->size == 0) return; if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) return; srh = kmem_zalloc(sizeof (spa_read_history_t), KM_PUSHPAGE); - strlcpy(srh->origin, zb->zb_func, sizeof (srh->origin)); strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); srh->start = gethrtime(); srh->objset = zb->zb_objset; srh->object = zb->zb_object; srh->level = zb->zb_level; srh->blkid = zb->zb_blkid; srh->aflags = aflags; srh->pid = getpid(); mutex_enter(&ssh->lock); srh->uid = ssh->count++; list_insert_head(&ssh->list, srh); ssh->size++; while (ssh->size > zfs_read_history) { ssh->size--; rm = list_remove_tail(&ssh->list); kmem_free(rm, sizeof (spa_read_history_t)); } mutex_exit(&ssh->lock); } /* * ========================================================================== * SPA TXG History Routines * ========================================================================== */ /* * Txg statistics - Information exported regarding each txg sync */ typedef struct spa_txg_history { uint64_t txg; /* txg id */ txg_state_t state; /* active txg state */ uint64_t nread; /* number of bytes read */ uint64_t nwritten; /* number of bytes written */ uint64_t reads; /* number of read operations */ uint64_t writes; /* number of write operations */ uint64_t nreserved; /* number of bytes reserved */ hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ list_node_t sth_link; } spa_txg_history_t; static int spa_txg_history_headers(char *buf, size_t size) { size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s " "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", "nreserved", "nread", 
"nwritten", "reads", "writes", "otime", "qtime", "wtime", "stime"); buf[size] = '\0'; return (0); } static int spa_txg_history_data(char *buf, size_t size, void *data) { spa_txg_history_t *sth = (spa_txg_history_t *)data; uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; char state; switch (sth->state) { case TXG_STATE_BIRTH: state = 'B'; break; case TXG_STATE_OPEN: state = 'O'; break; case TXG_STATE_QUIESCED: state = 'Q'; break; case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; case TXG_STATE_SYNCED: state = 'S'; break; case TXG_STATE_COMMITTED: state = 'C'; break; default: state = '?'; break; } if (sth->times[TXG_STATE_OPEN]) open = sth->times[TXG_STATE_OPEN] - sth->times[TXG_STATE_BIRTH]; if (sth->times[TXG_STATE_QUIESCED]) quiesce = sth->times[TXG_STATE_QUIESCED] - sth->times[TXG_STATE_OPEN]; if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - sth->times[TXG_STATE_QUIESCED]; if (sth->times[TXG_STATE_SYNCED]) sync = sth->times[TXG_STATE_SYNCED] - sth->times[TXG_STATE_WAIT_FOR_SYNC]; size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu " "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, (u_longlong_t)sth->nreserved, (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, (u_longlong_t)sync); buf[size] = '\0'; return (0); } /* * Calculate the address for the next spa_stats_history_t entry. The * ssh->lock will be held until ksp->ks_ndata entries are processed. 
*/ static void * spa_txg_history_addr(kstat_t *ksp, loff_t n) { spa_t *spa = ksp->ks_private; spa_stats_history_t *ssh = &spa->spa_stats.txg_history; ASSERT(MUTEX_HELD(&ssh->lock)); if (n == 0) ssh->private = list_tail(&ssh->list); else if (ssh->private) ssh->private = list_prev(&ssh->list, ssh->private); return (ssh->private); } /* * When the kstat is written discard all spa_txg_history_t entires. The * ssh->lock will be held until ksp->ks_ndata entries are processed. */ static int spa_txg_history_update(kstat_t *ksp, int rw) { spa_t *spa = ksp->ks_private; spa_stats_history_t *ssh = &spa->spa_stats.txg_history; ASSERT(MUTEX_HELD(&ssh->lock)); if (rw == KSTAT_WRITE) { spa_txg_history_t *sth; while ((sth = list_remove_head(&ssh->list))) { ssh->size--; kmem_free(sth, sizeof (spa_txg_history_t)); } ASSERT3U(ssh->size, ==, 0); } ksp->ks_ndata = ssh->size; ksp->ks_data_size = ssh->size * sizeof (spa_txg_history_t); return (0); } static void spa_txg_history_init(spa_t *spa) { spa_stats_history_t *ssh = &spa->spa_stats.txg_history; char name[KSTAT_STRLEN]; kstat_t *ksp; mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); list_create(&ssh->list, sizeof (spa_txg_history_t), offsetof(spa_txg_history_t, sth_link)); ssh->count = 0; ssh->size = 0; ssh->private = NULL; (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); name[KSTAT_STRLEN-1] = '\0'; ksp = kstat_create(name, 0, "txgs", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); ssh->kstat = ksp; if (ksp) { ksp->ks_lock = &ssh->lock; ksp->ks_data = NULL; ksp->ks_private = spa; ksp->ks_update = spa_txg_history_update; kstat_set_raw_ops(ksp, spa_txg_history_headers, spa_txg_history_data, spa_txg_history_addr); kstat_install(ksp); } } static void spa_txg_history_destroy(spa_t *spa) { spa_stats_history_t *ssh = &spa->spa_stats.txg_history; spa_txg_history_t *sth; kstat_t *ksp; ksp = ssh->kstat; if (ksp) kstat_delete(ksp); mutex_enter(&ssh->lock); while ((sth = list_remove_head(&ssh->list))) { ssh->size--; kmem_free(sth, 
sizeof (spa_txg_history_t)); } ASSERT3U(ssh->size, ==, 0); list_destroy(&ssh->list); mutex_exit(&ssh->lock); mutex_destroy(&ssh->lock); } /* * Add a new txg to historical record. */ void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) { spa_stats_history_t *ssh = &spa->spa_stats.txg_history; spa_txg_history_t *sth, *rm; if (zfs_txg_history == 0 && ssh->size == 0) return; sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_PUSHPAGE); sth->txg = txg; sth->state = TXG_STATE_OPEN; sth->times[TXG_STATE_BIRTH] = birth_time; mutex_enter(&ssh->lock); list_insert_head(&ssh->list, sth); ssh->size++; while (ssh->size > zfs_txg_history) { ssh->size--; rm = list_remove_tail(&ssh->list); kmem_free(rm, sizeof (spa_txg_history_t)); } mutex_exit(&ssh->lock); } /* * Set txg state completion time and increment current state. */ int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time) { spa_stats_history_t *ssh = &spa->spa_stats.txg_history; spa_txg_history_t *sth; int error = ENOENT; if (zfs_txg_history == 0) return (0); mutex_enter(&ssh->lock); for (sth = list_head(&ssh->list); sth != NULL; sth = list_next(&ssh->list, sth)) { if (sth->txg == txg) { sth->times[completed_state] = completed_time; sth->state++; error = 0; break; } } mutex_exit(&ssh->lock); return (error); } /* * Set txg IO stats. 
*/
/*
 * Looks up the history entry for txg and overwrites its I/O counters with
 * the supplied values.  Returns 0 when the history is disabled or the
 * entry was updated, ENOENT when no entry for txg is queued.
 */
int
spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
    uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved)
{
	spa_stats_history_t *hist = &spa->spa_stats.txg_history;
	spa_txg_history_t *entry;
	int rc = ENOENT;

	if (zfs_txg_history == 0)
		return (0);

	mutex_enter(&hist->lock);

	/* Walk from the head until the matching txg or the end of list. */
	entry = list_head(&hist->list);
	while (entry != NULL && entry->txg != txg)
		entry = list_next(&hist->list, entry);

	if (entry != NULL) {
		entry->nread = nread;
		entry->nwritten = nwritten;
		entry->reads = reads;
		entry->writes = writes;
		entry->nreserved = nreserved;
		rc = 0;
	}

	mutex_exit(&hist->lock);

	return (rc);
}

/*
 * ==========================================================================
 * SPA TX Assign Histogram Routines
 * ==========================================================================
 */

/*
 * Tx statistics - Information exported regarding dmu_tx_assign time.
 */

/*
 * When the kstat is written zero all buckets.  When the kstat is read
 * count the number of trailing buckets set to zero and update ks_ndata
 * such that they are not output.
*/ static int spa_tx_assign_update(kstat_t *ksp, int rw) { spa_t *spa = ksp->ks_private; spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; int i; if (rw == KSTAT_WRITE) { for (i = 0; i < ssh->count; i++) ((kstat_named_t *)ssh->private)[i].value.ui64 = 0; } for (i = ssh->count; i > 0; i--) if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0) break; ksp->ks_ndata = i; ksp->ks_data_size = i * sizeof (kstat_named_t); return (0); } static void spa_tx_assign_init(spa_t *spa) { spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; char name[KSTAT_STRLEN]; kstat_named_t *ks; kstat_t *ksp; int i; mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); ssh->count = 42; /* power of two buckets for 1ns to 2,199s */ ssh->size = ssh->count * sizeof (kstat_named_t); ssh->private = kmem_alloc(ssh->size, KM_SLEEP); (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); name[KSTAT_STRLEN-1] = '\0'; for (i = 0; i < ssh->count; i++) { ks = &((kstat_named_t *)ssh->private)[i]; ks->data_type = KSTAT_DATA_UINT64; ks->value.ui64 = 0; (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", (u_longlong_t)1 << i); } ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); ssh->kstat = ksp; if (ksp) { ksp->ks_lock = &ssh->lock; ksp->ks_data = ssh->private; ksp->ks_ndata = ssh->count; ksp->ks_data_size = ssh->size; ksp->ks_private = spa; ksp->ks_update = spa_tx_assign_update; kstat_install(ksp); } } static void spa_tx_assign_destroy(spa_t *spa) { spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; kstat_t *ksp; ksp = ssh->kstat; if (ksp) kstat_delete(ksp); kmem_free(ssh->private, ssh->size); mutex_destroy(&ssh->lock); } void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) { spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; uint64_t idx = 0; while (((1 << idx) < nsecs) && (idx < ssh->size - 1)) idx++; atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64); } /* * 
==========================================================================
 * SPA IO History Routines
 * ==========================================================================
 */

/*
 * kstat update hook for the "io" kstat: a write clears the whole data
 * area, a read changes nothing.  Always returns 0.
 */
static int
spa_io_history_update(kstat_t *ksp, int rw)
{
	if (rw != KSTAT_WRITE)
		return (0);

	memset(ksp->ks_data, 0, ksp->ks_data_size);

	return (0);
}

/*
 * Create and install the "io" kstat for this pool under
 * "zfs/<pool name>".  A failed kstat_create() leaves ssh->kstat NULL.
 */
static void
spa_io_history_init(spa_t *spa)
{
	spa_stats_history_t *io = &spa->spa_stats.io_history;
	char kname[KSTAT_STRLEN];
	kstat_t *ks;

	mutex_init(&io->lock, NULL, MUTEX_DEFAULT, NULL);

	(void) snprintf(kname, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
	kname[KSTAT_STRLEN-1] = '\0';

	ks = kstat_create(kname, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
	io->kstat = ks;
	if (ks == NULL)
		return;

	ks->ks_lock = &io->lock;
	ks->ks_private = spa;
	ks->ks_update = spa_io_history_update;
	kstat_install(ks);
}

/*
 * Delete the "io" kstat, if one was created, and destroy its lock.
 */
static void
spa_io_history_destroy(spa_t *spa)
{
	spa_stats_history_t *io = &spa->spa_stats.io_history;

	if (io->kstat != NULL)
		kstat_delete(io->kstat);

	mutex_destroy(&io->lock);
}

/*
 * Initialize all four per-pool statistics: read history, txg history,
 * tx assign histogram and io history.
 */
void
spa_stats_init(spa_t *spa)
{
	spa_read_history_init(spa);
	spa_txg_history_init(spa);
	spa_tx_assign_init(spa);
	spa_io_history_init(spa);
}

/*
 * Tear down everything spa_stats_init() set up.
 */
void
spa_stats_destroy(spa_t *spa)
{
	spa_tx_assign_destroy(spa);
	spa_txg_history_destroy(spa);
	spa_read_history_destroy(spa);
	spa_io_history_destroy(spa);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_read_history, int, 0644);
MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads");

module_param(zfs_read_history_hits, int, 0644);
MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history");

module_param(zfs_txg_history, int, 0644);
MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");
#endif