diff --git a/include/os/freebsd/spl/sys/kstat.h b/include/os/freebsd/spl/sys/kstat.h index f5157c7f4fe3..947dfee62393 100644 --- a/include/os/freebsd/spl/sys/kstat.h +++ b/include/os/freebsd/spl/sys/kstat.h @@ -1,234 +1,230 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf <behlendorf1@llnl.gov>. * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef _SPL_KSTAT_H #define _SPL_KSTAT_H #include #ifndef _STANDALONE #include #endif struct list_head {}; #include #include #define KSTAT_STRLEN 255 #define KSTAT_RAW_MAX (128*1024) /* * For reference valid classes are: * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc */ #define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ #define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ #define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ #define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ #define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ #define KSTAT_NUM_TYPES 5 #define KSTAT_DATA_CHAR 0 #define KSTAT_DATA_INT32 1 #define KSTAT_DATA_UINT32 2 #define KSTAT_DATA_INT64 3 #define KSTAT_DATA_UINT64 4 #define KSTAT_DATA_LONG 5 #define KSTAT_DATA_ULONG 6 #define KSTAT_DATA_STRING 7 #define KSTAT_NUM_DATAS 8 #define KSTAT_INTR_HARD 0 #define KSTAT_INTR_SOFT 1 #define KSTAT_INTR_WATCHDOG 2 #define KSTAT_INTR_SPURIOUS 3 #define KSTAT_INTR_MULTSVC 4 #define KSTAT_NUM_INTRS 5 #define KSTAT_FLAG_VIRTUAL 0x01 #define KSTAT_FLAG_VAR_SIZE 0x02 #define KSTAT_FLAG_WRITABLE 0x04 #define KSTAT_FLAG_PERSISTENT 0x08 #define KSTAT_FLAG_DORMANT 0x10 #define KSTAT_FLAG_INVALID 0x20 #define KSTAT_FLAG_LONGSTRINGS 0x40 #define KSTAT_FLAG_NO_HEADERS 0x80 #define KS_MAGIC 0x9d9d9d9d /* Dynamic updates */ #define KSTAT_READ 0 #define KSTAT_WRITE 1 struct kstat_s; typedef struct kstat_s kstat_t; typedef int kid_t; /* unique kstat id */ typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ struct seq_file { char *sf_buf; size_t sf_size; }; void seq_printf(struct seq_file *m, const char *fmt, ...); typedef struct kstat_module { char ksm_name[KSTAT_STRLEN+1]; /* module name */ struct list_head ksm_module_list; /* module linkage */ struct list_head ksm_kstat_list; /* list of kstat entries */ struct proc_dir_entry *ksm_proc; /* proc entry */ } kstat_module_t; typedef struct kstat_raw_ops { int (*headers)(char *buf, size_t 
size); int (*seq_headers)(struct seq_file *); int (*data)(char *buf, size_t size, void *data); void *(*addr)(kstat_t *ksp, loff_t index); } kstat_raw_ops_t; struct kstat_s { int ks_magic; /* magic value */ kid_t ks_kid; /* unique kstat ID */ hrtime_t ks_crtime; /* creation time */ hrtime_t ks_snaptime; /* last access time */ char ks_module[KSTAT_STRLEN+1]; /* provider module name */ int ks_instance; /* provider module instance */ char ks_name[KSTAT_STRLEN+1]; /* kstat name */ char ks_class[KSTAT_STRLEN+1]; /* kstat class */ uchar_t ks_type; /* kstat data type */ uchar_t ks_flags; /* kstat flags */ void *ks_data; /* kstat type-specific data */ uint_t ks_ndata; /* # of data records */ size_t ks_data_size; /* size of kstat data section */ kstat_update_t *ks_update; /* dynamic updates */ void *ks_private; /* private data */ void *ks_private1; /* private data */ kmutex_t ks_private_lock; /* kstat private data lock */ kmutex_t *ks_lock; /* kstat data lock */ struct list_head ks_list; /* kstat linkage */ kstat_module_t *ks_owner; /* kstat module linkage */ kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ char *ks_raw_buf; /* buf used for raw ops */ size_t ks_raw_bufsize; /* size of raw ops buffer */ #ifndef _STANDALONE struct sysctl_ctx_list ks_sysctl_ctx; struct sysctl_oid *ks_sysctl_root; #endif /* _STANDALONE */ }; typedef struct kstat_named_s { char name[KSTAT_STRLEN]; /* name of counter */ uchar_t data_type; /* data type */ union { char c[16]; /* 128-bit int */ int32_t i32; /* 32-bit signed int */ uint32_t ui32; /* 32-bit unsigned int */ int64_t i64; /* 64-bit signed int */ uint64_t ui64; /* 64-bit unsigned int */ long l; /* native signed long */ ulong_t ul; /* native unsigned long */ struct { union { char *ptr; /* NULL-term string */ char __pad[8]; /* 64-bit padding */ } addr; uint32_t len; /* # bytes for strlen + '\0' */ } string; } value; } kstat_named_t; #define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) #define 
KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) typedef struct kstat_intr { uint_t intrs[KSTAT_NUM_INTRS]; } kstat_intr_t; typedef struct kstat_io { u_longlong_t nread; /* number of bytes read */ u_longlong_t nwritten; /* number of bytes written */ uint_t reads; /* number of read operations */ uint_t writes; /* number of write operations */ hrtime_t wtime; /* cumulative wait (pre-service) time */ hrtime_t wlentime; /* cumulative wait len*time product */ hrtime_t wlastupdate; /* last time wait queue changed */ hrtime_t rtime; /* cumulative run (service) time */ hrtime_t rlentime; /* cumulative run length*time product */ hrtime_t rlastupdate; /* last time run queue changed */ uint_t wcnt; /* count of elements in wait state */ uint_t rcnt; /* count of elements in run state */ } kstat_io_t; typedef struct kstat_timer { char name[KSTAT_STRLEN+1]; /* event name */ u_longlong_t num_events; /* number of events */ hrtime_t elapsed_time; /* cumulative elapsed time */ hrtime_t min_time; /* shortest event duration */ hrtime_t max_time; /* longest event duration */ hrtime_t start_time; /* previous event start time */ hrtime_t stop_time; /* previous event stop time */ } kstat_timer_t; int spl_kstat_init(void); void spl_kstat_fini(void); extern void __kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void* (*addr)(kstat_t *ksp, loff_t index)); extern void __kstat_set_seq_raw_ops(kstat_t *ksp, int (*headers)(struct seq_file *), int (*data)(char *buf, size_t size, void *data), void* (*addr)(kstat_t *ksp, loff_t index)); extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags); extern void __kstat_install(kstat_t *ksp); extern void __kstat_delete(kstat_t *ksp); -extern void kstat_waitq_enter(kstat_io_t *); -extern void kstat_waitq_exit(kstat_io_t *); -extern void kstat_runq_enter(kstat_io_t *); 
-extern void kstat_runq_exit(kstat_io_t *); #define kstat_set_seq_raw_ops(k, h, d, a) \ __kstat_set_seq_raw_ops(k, h, d, a) #define kstat_set_raw_ops(k, h, d, a) \ __kstat_set_raw_ops(k, h, d, a) #ifndef _STANDALONE #define kstat_create(m, i, n, c, t, s, f) \ __kstat_create(m, i, n, c, t, s, f) #define kstat_install(k) __kstat_install(k) #define kstat_delete(k) __kstat_delete(k) #else #define kstat_create(m, i, n, c, t, s, f) ((kstat_t *)0) #define kstat_install(k) #define kstat_delete(k) #endif #endif /* _SPL_KSTAT_H */ diff --git a/include/os/linux/spl/sys/kstat.h b/include/os/linux/spl/sys/kstat.h index 905d8257c8d3..928f70757545 100644 --- a/include/os/linux/spl/sys/kstat.h +++ b/include/os/linux/spl/sys/kstat.h @@ -1,222 +1,218 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf <behlendorf1@llnl.gov>. * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
*/ #ifndef _SPL_KSTAT_H #define _SPL_KSTAT_H #include #include #include #include #include #include #define KSTAT_STRLEN 255 #define KSTAT_RAW_MAX (128*1024) /* * For reference valid classes are: * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc */ #define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ #define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ #define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ #define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ #define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ #define KSTAT_NUM_TYPES 5 #define KSTAT_DATA_CHAR 0 #define KSTAT_DATA_INT32 1 #define KSTAT_DATA_UINT32 2 #define KSTAT_DATA_INT64 3 #define KSTAT_DATA_UINT64 4 #define KSTAT_DATA_LONG 5 #define KSTAT_DATA_ULONG 6 #define KSTAT_DATA_STRING 7 #define KSTAT_NUM_DATAS 8 #define KSTAT_INTR_HARD 0 #define KSTAT_INTR_SOFT 1 #define KSTAT_INTR_WATCHDOG 2 #define KSTAT_INTR_SPURIOUS 3 #define KSTAT_INTR_MULTSVC 4 #define KSTAT_NUM_INTRS 5 #define KSTAT_FLAG_VIRTUAL 0x01 #define KSTAT_FLAG_VAR_SIZE 0x02 #define KSTAT_FLAG_WRITABLE 0x04 #define KSTAT_FLAG_PERSISTENT 0x08 #define KSTAT_FLAG_DORMANT 0x10 #define KSTAT_FLAG_INVALID 0x20 #define KSTAT_FLAG_LONGSTRINGS 0x40 #define KSTAT_FLAG_NO_HEADERS 0x80 #define KS_MAGIC 0x9d9d9d9d /* Dynamic updates */ #define KSTAT_READ 0 #define KSTAT_WRITE 1 struct kstat_s; typedef struct kstat_s kstat_t; typedef int kid_t; /* unique kstat id */ typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ typedef struct kstat_module { char ksm_name[KSTAT_STRLEN+1]; /* module name */ struct list_head ksm_module_list; /* module linkage */ struct list_head ksm_kstat_list; /* list of kstat entries */ struct proc_dir_entry *ksm_proc; /* proc entry */ } kstat_module_t; typedef struct kstat_raw_ops { int (*headers)(char *buf, size_t size); int (*data)(char *buf, size_t size, void *data); void *(*addr)(kstat_t *ksp, loff_t index); } kstat_raw_ops_t; typedef struct 
kstat_proc_entry { char kpe_name[KSTAT_STRLEN+1]; /* kstat name */ char kpe_module[KSTAT_STRLEN+1]; /* provider module name */ kstat_module_t *kpe_owner; /* kstat module linkage */ struct list_head kpe_list; /* kstat linkage */ struct proc_dir_entry *kpe_proc; /* procfs entry */ } kstat_proc_entry_t; struct kstat_s { int ks_magic; /* magic value */ kid_t ks_kid; /* unique kstat ID */ hrtime_t ks_crtime; /* creation time */ hrtime_t ks_snaptime; /* last access time */ int ks_instance; /* provider module instance */ char ks_class[KSTAT_STRLEN+1]; /* kstat class */ uchar_t ks_type; /* kstat data type */ uchar_t ks_flags; /* kstat flags */ void *ks_data; /* kstat type-specific data */ uint_t ks_ndata; /* # of data records */ size_t ks_data_size; /* size of kstat data section */ kstat_update_t *ks_update; /* dynamic updates */ void *ks_private; /* private data */ kmutex_t ks_private_lock; /* kstat private data lock */ kmutex_t *ks_lock; /* kstat data lock */ kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ char *ks_raw_buf; /* buf used for raw ops */ size_t ks_raw_bufsize; /* size of raw ops buffer */ kstat_proc_entry_t ks_proc; /* data for procfs entry */ }; typedef struct kstat_named_s { char name[KSTAT_STRLEN]; /* name of counter */ uchar_t data_type; /* data type */ union { char c[16]; /* 128-bit int */ int32_t i32; /* 32-bit signed int */ uint32_t ui32; /* 32-bit unsigned int */ int64_t i64; /* 64-bit signed int */ uint64_t ui64; /* 64-bit unsigned int */ long l; /* native signed long */ ulong_t ul; /* native unsigned long */ struct { union { char *ptr; /* NULL-term string */ char __pad[8]; /* 64-bit padding */ } addr; uint32_t len; /* # bytes for strlen + '\0' */ } string; } value; } kstat_named_t; #define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) #define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) #ifdef HAVE_PROC_OPS_STRUCT typedef struct proc_ops kstat_proc_op_t; #else typedef struct file_operations kstat_proc_op_t; 
#endif typedef struct kstat_intr { uint_t intrs[KSTAT_NUM_INTRS]; } kstat_intr_t; typedef struct kstat_io { u_longlong_t nread; /* number of bytes read */ u_longlong_t nwritten; /* number of bytes written */ uint_t reads; /* number of read operations */ uint_t writes; /* number of write operations */ hrtime_t wtime; /* cumulative wait (pre-service) time */ hrtime_t wlentime; /* cumulative wait len*time product */ hrtime_t wlastupdate; /* last time wait queue changed */ hrtime_t rtime; /* cumulative run (service) time */ hrtime_t rlentime; /* cumulative run length*time product */ hrtime_t rlastupdate; /* last time run queue changed */ uint_t wcnt; /* count of elements in wait state */ uint_t rcnt; /* count of elements in run state */ } kstat_io_t; typedef struct kstat_timer { char name[KSTAT_STRLEN+1]; /* event name */ u_longlong_t num_events; /* number of events */ hrtime_t elapsed_time; /* cumulative elapsed time */ hrtime_t min_time; /* shortest event duration */ hrtime_t max_time; /* longest event duration */ hrtime_t start_time; /* previous event start time */ hrtime_t stop_time; /* previous event stop time */ } kstat_timer_t; int spl_kstat_init(void); void spl_kstat_fini(void); extern void __kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void* (*addr)(kstat_t *ksp, loff_t index)); extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags); extern void kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, const char *name); extern void kstat_proc_entry_delete(kstat_proc_entry_t *kpep); extern void kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, const kstat_proc_op_t *file_ops, void *data); extern void __kstat_install(kstat_t *ksp); extern void __kstat_delete(kstat_t *ksp); -extern void kstat_waitq_enter(kstat_io_t *); -extern void 
kstat_waitq_exit(kstat_io_t *); -extern void kstat_runq_enter(kstat_io_t *); -extern void kstat_runq_exit(kstat_io_t *); #define kstat_set_raw_ops(k, h, d, a) \ __kstat_set_raw_ops(k, h, d, a) #define kstat_create(m, i, n, c, t, s, f) \ __kstat_create(m, i, n, c, t, s, f) #define kstat_install(k) __kstat_install(k) #define kstat_delete(k) __kstat_delete(k) #endif /* _SPL_KSTAT_H */ diff --git a/include/sys/spa.h b/include/sys/spa.h index 374d36e7327e..d37c6c923d8c 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1,1211 +1,1210 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. 
*/ #ifndef _SYS_SPA_H #define _SYS_SPA_H #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Forward references that lots of things need. */ typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct metaslab_group metaslab_group_t; typedef struct metaslab_class metaslab_class_t; typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; typedef struct zbookmark_phys zbookmark_phys_t; struct bpobj; struct bplist; struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; /* * Alignment Shift (ashift) is an immutable, internal top-level vdev property * which can only be set at vdev creation time. Physical writes are always done * according to it, which makes 2^ashift the smallest possible IO on a vdev. * * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB * (2^16 = 65,536). */ #define ASHIFT_MIN 9 #define ASHIFT_MAX 16 /* * Size of block to hold the configuration data (a packed nvlist) */ #define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. * The ASIZE encoding should be at least 64 times larger (6 more bits) * to support up to 4-way RAID-Z mirror mode with worst-case gang block * overhead, three DVAs per bp, plus one more bit in case we do anything * else that expands the ASIZE. */ #define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 #define SPA_VDEVBITS 24 #define SPA_COMPRESSMASK ((1U << SPA_COMPRESSBITS) - 1) /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. 
*/ typedef struct dva { uint64_t dva_word[2]; } dva_t; /* * Some checksums/hashes need a 256-bit initialization salt. This salt is kept * secret and is suitable for use in MAC algorithms as the key. */ typedef struct zio_cksum_salt { uint8_t zcs_bytes[32]; } zio_cksum_salt_t; /* * Each block is described by its DVAs, time of birth, checksum, etc. * The word-by-word, bit-by-bit layout of the blkptr is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | pad | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | pad | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | pad | vdev3 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | checksum[2] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | checksum[3] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 
* Legend: * * vdev virtual device ID * offset offset into virtual device * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) * GRID RAID-Z layout information (reserved for future use) * cksum checksum function * comp compression function * G gang block indicator * B byteorder (endianness) * D dedup * X encryption * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type * phys birth txg when dva[0] was written; zero if same as logical birth txg * note that typically all the dva's would be written in this * txg, but they could be different if they were moved by * device removal. * log. birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ /* * The blkptr_t's of encrypted blocks also need to store the encryption * parameters so that the block can be decrypted. 
This layout is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | salt | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 | IV1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | IV2 | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | MAC[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | MAC[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * salt Salt for generating encryption keys * IV1 First 64 bits of encryption IV * X Block requires encryption handling (set to 1) * E blkptr_t contains embedded data (set to 0, see below) * fill count number of non-zero blocks under this bp (truncated to 32 bits) * IV2 Last 32 bits of encryption IV * checksum[2] 128-bit checksum of the data this bp describes * MAC[2] 128-bit message authentication code for this data * 
* The X bit being set indicates that this block is one of 3 types. If this is * a level 0 block with an encrypted object type, the block is encrypted * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted * object type, this block is authenticated with an HMAC (see * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC * words to store a checksum-of-MACs from the level below (see * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED() * refers to both encrypted and authenticated blocks and BP_USES_CRYPT() * refers to any of these 3 kinds of blocks. * * The additional encryption parameters are the salt, IV, and MAC which are * explained in greater detail in the block comment at the top of zio_crypt.c. * The MAC occupies half of the checksum space since it serves a very similar * purpose: to prevent data corruption on disk. The only functional difference * is that the checksum is used to detect on-disk corruption whether or not the * encryption key is loaded and the MAC provides additional protection against * malicious disk tampering. We use the 3rd DVA to store the salt and first * 64 bits of the IV. As a result encrypted blocks can only have 2 copies * maximum instead of the normal 3. The last 32 bits of the IV are stored in * the upper bits of what is usually the fill count. Note that only blocks at * level 0 or -2 are ever encrypted, which allows us to guarantee that these * 32 bits are not trampled over by other code (see zio_crypt.c for details). * The salt and IV are not used for authenticated bps or bps with an indirect * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits * for the fill count. */ /* * "Embedded" blkptr_t's don't actually point to a block, instead they * have a data payload embedded in the blkptr_t itself. See the comment * in blkptr.c for more details. 
* * The blkptr_t is laid out as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | payload | * 1 | payload | * 2 | payload | * 3 | payload | * 4 | payload | * 5 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | payload | * 8 | payload | * 9 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | payload | * c | payload | * d | payload | * e | payload | * f | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * payload contains the embedded data * B (byteorder) byteorder (endianness) * D (dedup) padding (set to zero) * X encryption (set to zero) * E (embedded) set to one * lvl indirection level * type DMU object type * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) * comp compression function of payload * PSIZE size of payload after compression, in bytes * LSIZE logical size of payload, in bytes * note that 25 bits is enough to store the largest * "normal" BP's LSIZE (2^16 * 2^9) in bytes * log. birth transaction group in which the block was logically born * * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded * bp's they are stored in units of SPA_MINBLOCKSHIFT. * Generally, the generic BP_GET_*() macros can be used on embedded BP's. * The B, D, X, lvl, type, and comp fields are stored the same as with normal * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before * other macros, as they assert that they are only used on BP's of the correct * "embedded-ness". 
Encrypted blkptr_t's cannot be embedded because they use * the payload space for encryption parameters (see the comment above on * how encryption parameters are stored). */ #define BPE_GET_ETYPE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET((bp)->blk_prop, 40, 8)) #define BPE_SET_ETYPE(bp, t) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, t); \ _NOTE(CONSTCOND) } while (0) #define BPE_GET_LSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) #define BPE_SET_LSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BPE_GET_PSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) #define BPE_SET_PSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) typedef enum bp_embedded_type { BP_EMBEDDED_TYPE_DATA, BP_EMBEDDED_TYPE_RESERVED, /* Reserved for Delphix byteswap feature. */ BP_EMBEDDED_TYPE_REDACTED, NUM_BP_EMBEDDED_TYPES } bp_embedded_type_t; #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ #define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ /* * A block is a hole when it has either 1) never been written to, or * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads * without physically allocating disk space. Holes are represented in the * blkptr_t structure by zeroed blk_dva. Correct checking for holes is * done through the BP_IS_HOLE macro. For holes, the logical size, level, * DMU object type, and birth times are all also stored for holes that * were written to at some point (i.e. were punched after having been filled). 
*/
/*
 * On-disk block pointer.  Every field below is read and written through
 * the DVA_* / BP_* accessor macros that follow rather than directly.
 */
typedef struct blkptr {
	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
	uint64_t	blk_prop;	/* size, compression, type, etc	    */
	uint64_t	blk_pad[2];	/* Extra space for the future	    */
	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
	uint64_t	blk_birth;	/* transaction group at birth	    */
	uint64_t	blk_fill;	/* fill count			    */
	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
} blkptr_t;

/*
 * Macros to get and set fields in a bp or DVA.
 */

/*
 * DVA field positions, as read off the BF64_* arguments below (this assumes
 * the BF64_* argument order is (word, low bit, length, ...) -- confirm
 * against the BF64_* definitions):
 *
 *	dva_word[0]: ASIZE from bit 0 (SPA_ASIZEBITS wide), GRID in bits
 *		     24..31, VDEV from bit 32 (SPA_VDEVBITS wide)
 *	dva_word[1]: OFFSET in bits 0..62, GANG flag in bit 63
 *
 * ASIZE and OFFSET go through the _SB variants with SPA_MINBLOCKSHIFT.
 */

/*
 * Note, for gang blocks, DVA_GET_ASIZE() is the total space allocated for
 * this gang DVA including its children BP's.  The space allocated at this
 * DVA's vdev/offset is vdev_gang_header_asize(vdev).
 */
#define	DVA_GET_ASIZE(dva)	\
	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
#define	DVA_SET_ASIZE(dva, x)	\
	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
	SPA_MINBLOCKSHIFT, 0, x)

#define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
#define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)

#define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
#define	DVA_SET_VDEV(dva, x)	\
	BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)

#define	DVA_GET_OFFSET(dva)	\
	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
#define	DVA_SET_OFFSET(dva, x)	\
	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)

#define	DVA_GET_GANG(dva)	BF64_GET((dva)->dva_word[1], 63, 1)
#define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)

/*
 * blk_prop accessors.  Positions used below: LSIZE from bit 0, PSIZE from
 * bit 16, COMPRESS from bit 32, EMBEDDED flag at bit 39, CHECKSUM in bits
 * 40..47, TYPE in bits 48..55, LEVEL in bits 56..60, CRYPT at bit 61,
 * DEDUP at bit 62 and BYTEORDER at bit 63.
 *
 * The LSIZE, PSIZE and CHECKSUM getters special-case embedded bps and the
 * matching setters ASSERT that the bp is not embedded.
 */

/*
 * Logical size.  An embedded bp reports BPE_GET_LSIZE() only when its
 * embedded type is BP_EMBEDDED_TYPE_DATA, and 0 for any other embedded type.
 */
#define	BP_GET_LSIZE(bp)	\
	(BP_IS_EMBEDDED(bp) ?	\
	(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define	BP_SET_LSIZE(bp, x)	do { \
	ASSERT(!BP_IS_EMBEDDED(bp)); \
	BF64_SET_SB((bp)->blk_prop, \
	    0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)

/* Physical size; always 0 for an embedded bp. */
#define	BP_GET_PSIZE(bp)	\
	(BP_IS_EMBEDDED(bp) ? 0 : \
	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define	BP_SET_PSIZE(bp, x)	do { \
	ASSERT(!BP_IS_EMBEDDED(bp)); \
	BF64_SET_SB((bp)->blk_prop, \
	    16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
_NOTE(CONSTCOND) } while (0)

#define	BP_GET_COMPRESS(bp)		\
	BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
#define	BP_SET_COMPRESS(bp, x)		\
	BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)

#define	BP_IS_EMBEDDED(bp)		BF64_GET((bp)->blk_prop, 39, 1)
#define	BP_SET_EMBEDDED(bp, x)		BF64_SET((bp)->blk_prop, 39, 1, x)

/* Checksum function; an embedded bp always reports ZIO_CHECKSUM_OFF. */
#define	BP_GET_CHECKSUM(bp)		\
	(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
	BF64_GET((bp)->blk_prop, 40, 8))
#define	BP_SET_CHECKSUM(bp, x)		do { \
	ASSERT(!BP_IS_EMBEDDED(bp)); \
	BF64_SET((bp)->blk_prop, 40, 8, x); \
_NOTE(CONSTCOND) } while (0)

#define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
#define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)

#define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
#define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)

/*
 * encrypted, authenticated, and MAC cksum bps use the same bit
 *
 * How the shared bit is interpreted by the predicates below:
 *	level <= 0 and DMU_OT_IS_ENCRYPTED(type)	-> encrypted
 *	level <= 0 and !DMU_OT_IS_ENCRYPTED(type)	-> authenticated
 *	level > 0					-> indirect MAC cksum
 */
#define	BP_USES_CRYPT(bp)		BF64_GET((bp)->blk_prop, 61, 1)
#define	BP_SET_CRYPT(bp, x)		BF64_SET((bp)->blk_prop, 61, 1, x)

#define	BP_IS_ENCRYPTED(bp)			\
	(BP_USES_CRYPT(bp) &&			\
	BP_GET_LEVEL(bp) <= 0 &&		\
	DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp)))

#define	BP_IS_AUTHENTICATED(bp)			\
	(BP_USES_CRYPT(bp) &&			\
	BP_GET_LEVEL(bp) <= 0 &&		\
	!DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp)))

#define	BP_HAS_INDIRECT_MAC_CKSUM(bp)		\
	(BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0)

#define	BP_IS_PROTECTED(bp)			\
	(BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp))

#define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
#define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)

#define	BP_GET_BYTEORDER(bp)		BF64_GET((bp)->blk_prop, 63, 1)
#define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)

/*
 * NOTE(review): the FREE flag is bit 0 of blk_fill, the same word (and bit)
 * that BP_GET_FILL() reads as the fill count -- presumably only used where
 * the fill count is not meaningful; confirm with the callers.
 */
#define	BP_GET_FREE(bp)			BF64_GET((bp)->blk_fill, 0, 1)
#define	BP_SET_FREE(bp, x)		BF64_SET((bp)->blk_fill, 0, 1, x)

/*
 * Physical birth txg: 0 for an embedded bp, otherwise blk_phys_birth when
 * it is non-zero and blk_birth when it is zero.
 */
#define	BP_PHYSICAL_BIRTH(bp)		\
	(BP_IS_EMBEDDED(bp) ? 0 : \
	(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)

/*
 * Stores the logical birth in blk_birth and stores 0 in blk_phys_birth when
 * the two births are equal, which is the case BP_PHYSICAL_BIRTH() resolves
 * back to blk_birth.
 *
 * NOTE(review): this macro, BP_SET_FILL, BP_SET_IV2, BP_SET_REDACTED and
 * BP_ZERO expand to a bare brace block rather than do { } while (0), so
 * "if (c) BP_ZERO(bp); else ..." does not compile.
 */
#define	BP_SET_BIRTH(bp, logical, physical)	\
{						\
	ASSERT(!BP_IS_EMBEDDED(bp));		\
	(bp)->blk_birth = (logical);		\
	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
}

/*
 * Fill count.  For an encrypted bp only the low 32 bits of blk_fill hold the
 * count (the high 32 bits are read by BP_GET_IV2); an embedded bp reports 1;
 * otherwise the whole word is the count.
 */
#define	BP_GET_FILL(bp)				\
	((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \
	((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill))

#define	BP_SET_FILL(bp, fill)				\
{							\
	if (BP_IS_ENCRYPTED(bp))			\
		BF64_SET((bp)->blk_fill, 0, 32, fill);	\
	else						\
		(bp)->blk_fill = fill;			\
}

/* Upper 32 bits of blk_fill; both accessors ASSERT the bp is encrypted. */
#define	BP_GET_IV2(bp)				\
	(ASSERT(BP_IS_ENCRYPTED(bp)),		\
	BF64_GET((bp)->blk_fill, 32, 32))
#define	BP_SET_IV2(bp, iv2)			\
{						\
	ASSERT(BP_IS_ENCRYPTED(bp));		\
	BF64_SET((bp)->blk_fill, 32, 32, iv2);	\
}

#define	BP_IS_METADATA(bp)	\
	(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))

/*
 * BP_GET_ASIZE, BP_GET_NDVAS and BP_COUNT_GANG all return 0 for an embedded
 * bp and leave blk_dva[2] out of the sum for an encrypted bp (its term is
 * multiplied by !BP_IS_ENCRYPTED(bp)).
 */
#define	BP_GET_ASIZE(bp)	\
	(BP_IS_EMBEDDED(bp) ? 0 : \
	DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
	DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
	(DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))

/* Physical size for metadata bps, logical size for everything else. */
#define	BP_GET_UCSIZE(bp)	\
	(BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))

#define	BP_GET_NDVAS(bp)	\
	(BP_IS_EMBEDDED(bp) ? 0 : \
	!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
	(!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))

#define	BP_COUNT_GANG(bp)	\
	(BP_IS_EMBEDDED(bp) ? 0 : \
	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
	(DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))))

#define	DVA_EQUAL(dva1, dva2)	\
	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
	(dva1)->dva_word[0] == (dva2)->dva_word[0])

/*
 * Two bps are equal when their physical births, logical births and all
 * three DVAs match; blk_prop, blk_fill and blk_cksum are not compared.
 */
#define	BP_EQUAL(bp1, bp2)	\
	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
	(bp1)->blk_birth == (bp2)->blk_birth &&			\
	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))

#define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)

/* Address of blk_dva[0]; ASSERTs that the bp is not embedded. */
#define	BP_IDENTITY(bp)		(ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
#define	BP_IS_GANG(bp)		\
	(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
#define	DVA_IS_EMPTY(dva)	((dva)->dva_word[0] == 0ULL &&	\
				(dva)->dva_word[1] == 0ULL)
/* A hole is a non-embedded bp whose first DVA has both words zero. */
#define	BP_IS_HOLE(bp) \
	(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))

/*
 * A redacted bp is an embedded bp whose embedded type is
 * BP_EMBEDDED_TYPE_REDACTED, so BP_IS_REDACTED() implies BP_IS_EMBEDDED().
 */
#define	BP_SET_REDACTED(bp) \
{							\
	BP_SET_EMBEDDED(bp, B_TRUE);			\
	BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED);	\
}
#define	BP_IS_REDACTED(bp) \
	(BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED)

/* BP_IS_RAIDZ(bp) assumes no block compression */
#define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
				BP_GET_PSIZE(bp))

/* Clears every field of the bp, including both pad words and the checksum. */
#define	BP_ZERO(bp)				\
{						\
	(bp)->blk_dva[0].dva_word[0] = 0;	\
	(bp)->blk_dva[0].dva_word[1] = 0;	\
	(bp)->blk_dva[1].dva_word[0] = 0;	\
	(bp)->blk_dva[1].dva_word[1] = 0;	\
	(bp)->blk_dva[2].dva_word[0] = 0;	\
	(bp)->blk_dva[2].dva_word[1] = 0;	\
	(bp)->blk_prop = 0;			\
	(bp)->blk_pad[0] = 0;			\
	(bp)->blk_pad[1] = 0;			\
	(bp)->blk_phys_birth = 0;		\
	(bp)->blk_birth = 0;			\
	(bp)->blk_fill = 0;			\
	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
}

/*
 * Value of the bp byteorder bit that matches this host: 0 when
 * _ZFS_BIG_ENDIAN is defined, 1 otherwise.  A bp whose bit differs from it
 * needs byteswapping.
 */
#ifdef _ZFS_BIG_ENDIAN
#define	ZFS_HOST_BYTEORDER	(0ULL)
#else
#define	ZFS_HOST_BYTEORDER	(1ULL)
#endif

#define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)

/* Size in bytes of the buffer dprintf_bp() allocates to format a bp. */
#define	BP_SPRINTF_LEN	400

/*
 * This macro allows code sharing between zfs, libzpool, and mdb.
* 'func' is either snprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. */ #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *copyname[] = \ { "zero", "single", "double", "triple" }; \ int len = 0; \ int copies = 0; \ const char *crypt_type; \ if (bp != NULL) { \ if (BP_IS_ENCRYPTED(bp)) { \ crypt_type = "encrypted"; \ /* LINTED E_SUSPICIOUS_COMPARISON */ \ } else if (BP_IS_AUTHENTICATED(bp)) { \ crypt_type = "authenticated"; \ } else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { \ crypt_type = "indirect-MAC"; \ } else { \ crypt_type = "unencrypted"; \ } \ } \ if (bp == NULL) { \ len += func(buf + len, size - len, ""); \ } else if (BP_IS_HOLE(bp)) { \ len += func(buf + len, size - len, \ "HOLE [L%llu %s] " \ "size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ "size=%llxL/%llxP birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (int)BPE_GET_ETYPE(bp), \ compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else if (BP_IS_REDACTED(bp)) { \ len += func(buf + len, size - len, \ "REDACTED [L%llu %s] size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ if (DVA_IS_VALID(dva)) \ copies++; \ len += func(buf + len, size - len, \ "DVA[%d]=<%llu:%llx:%llx>%c", d, \ (u_longlong_t)DVA_GET_VDEV(dva), \ (u_longlong_t)DVA_GET_OFFSET(dva), \ (u_longlong_t)DVA_GET_ASIZE(dva), \ ws); \ } \ if (BP_IS_ENCRYPTED(bp)) { \ len += func(buf + len, size - len, \ "salt=%llx iv=%llx:%llx%c", \ (u_longlong_t)bp->blk_dva[2].dva_word[0], \ 
(u_longlong_t)bp->blk_dva[2].dva_word[1], \ (u_longlong_t)BP_GET_IV2(bp), \ ws); \ } \ if (BP_IS_GANG(bp) && \ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ copies--; \ len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ "cksum=%llx:%llx:%llx:%llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \ compress, \ crypt_type, \ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ BP_IS_GANG(bp) ? "gang" : "contiguous", \ BP_GET_DEDUP(bp) ? "dedup" : "unique", \ copyname[copies], \ ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth, \ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ (u_longlong_t)bp->blk_cksum.zc_word[1], \ (u_longlong_t)bp->blk_cksum.zc_word[2], \ (u_longlong_t)bp->blk_cksum.zc_word[3]); \ } \ ASSERT(len < size); \ } #define BP_GET_BUFC_TYPE(bp) \ (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, SPA_IMPORT_ASSEMBLE } spa_import_type_t; typedef enum spa_mode { SPA_MODE_UNINIT = 0, SPA_MODE_READ = 1, SPA_MODE_WRITE = 2, } spa_mode_t; /* * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources. */ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, #ifdef IN_FREEBSD_BASE SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, #else SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, #endif } spa_autotrim_t; /* * Reason TRIM command was issued, used internally for accounting purposes. 
*/ typedef enum trim_type { TRIM_TYPE_MANUAL = 0, TRIM_TYPE_AUTO = 1, TRIM_TYPE_SIMPLE = 2 } trim_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_open_rewind(const char *pool, spa_t **, void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, struct dsl_crypto_params *dcp); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(const char *pool); extern int spa_checkpoint(const char *pool); extern int spa_checkpoint_discard(const char *pool); extern int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce); extern int spa_reset(const char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern int spa_async_tasks(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 #define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 #define SPA_ASYNC_INITIALIZE_RESTART 0x100 #define SPA_ASYNC_TRIM_RESTART 0x200 #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 #define SPA_ASYNC_L2CACHE_REBUILD 0x800 #define SPA_ASYNC_L2CACHE_TRIM 
0x1000 #define SPA_ASYNC_REBUILD_DONE 0x2000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist); extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); extern void spa_spare_remove(vdev_t *vd); extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); /* L2ARC state (which is global across all pools) */ extern void spa_l2cache_add(vdev_t *vd); extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); /* scanning */ extern int spa_scan(spa_t *spa, pool_scan_func_t func); extern int spa_scan_stop(spa_t *spa); extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); extern int zfs_sync_pass_deferred_free; /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; /* * SPA configuration functions in spa_config.c */ #define 
SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype); /* * Miscellaneous SPA routines in spa_misc.c */ /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); extern void spa_async_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ #define SCL_ALLOC 0x08 #define SCL_ZIO 0x10 #define SCL_FREE 0x20 #define SCL_VDEV 0x40 #define SCL_LOCKS 7 #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) /* Historical pool statistics */ typedef struct spa_history_kstat { kmutex_t lock; uint64_t count; uint64_t size; kstat_t *kstat; void *priv; list_t list; } spa_history_kstat_t; typedef struct spa_history_list { uint64_t size; procfs_list_t procfs_list; } spa_history_list_t; typedef struct spa_stats { spa_history_list_t read_history; spa_history_list_t txg_history; spa_history_kstat_t tx_assign_histogram; - spa_history_kstat_t io_history; spa_history_list_t mmp_history; spa_history_kstat_t state; /* pool state */ spa_history_kstat_t iostats; } spa_stats_t; typedef enum txg_state { TXG_STATE_BIRTH = 0, TXG_STATE_OPEN = 1, TXG_STATE_QUIESCED = 2, 
TXG_STATE_WAIT_FOR_SYNC = 3, TXG_STATE_SYNCED = 4, TXG_STATE_COMMITTED = 5, } txg_state_t; typedef struct txg_stat { vdev_stat_t vs1; vdev_stat_t vs2; uint64_t txg; uint64_t ndirty; } txg_stat_t; /* Assorted pool IO kstats */ typedef struct spa_iostats { kstat_named_t trim_extents_written; kstat_named_t trim_bytes_written; kstat_named_t trim_extents_skipped; kstat_named_t trim_bytes_skipped; kstat_named_t trim_extents_failed; kstat_named_t trim_bytes_failed; kstat_named_t autotrim_extents_written; kstat_named_t autotrim_bytes_written; kstat_named_t autotrim_extents_skipped; kstat_named_t autotrim_bytes_skipped; kstat_named_t autotrim_extents_failed; kstat_named_t autotrim_bytes_failed; kstat_named_t simple_trim_extents_written; kstat_named_t simple_trim_bytes_written; kstat_named_t simple_trim_extents_skipped; kstat_named_t simple_trim_bytes_skipped; kstat_named_t simple_trim_extents_failed; kstat_named_t simple_trim_bytes_failed; } spa_iostats_t; extern void spa_stats_init(spa_t *spa); extern void spa_stats_destroy(spa_t *spa); extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags); extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); extern int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time); extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t, struct dsl_pool *); extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *); extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id); extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, hrtime_t duration); extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, int error); extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t 
extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining); extern int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t max_txg); extern int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t spa_load_state); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); /* Log state */ typedef enum spa_log_state { SPA_LOG_UNKNOWN = 0, /* unknown log state */ SPA_LOG_MISSING, /* missing log(s) */ SPA_LOG_CLEAR, /* clear the log(s) */ SPA_LOG_GOOD, /* log(s) are good */ } spa_log_state_t; extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); extern int spa_reset_logs(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); extern void spa_deadman(void *); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern boolean_t spa_is_initializing(spa_t *spa); extern boolean_t 
spa_indirect_vdevs_loaded(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); extern int spa_sync_pass(spa_t *spa); extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_load_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern uint64_t spa_get_failmode(spa_t *spa); extern uint64_t spa_get_deadman_failmode(spa_t *spa); extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode); extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t 
*spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern space_map_t *spa_syncing_log_sm(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_deadman_ziotime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); extern spa_autotrim_t spa_get_autotrim(spa_t *spa); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...); extern void spa_load_note(spa_t *spa, const char *fmt, ...); extern void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx); extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); extern uint64_t spa_generate_guid(spa_t *spa); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern int spa_change_guid(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern boolean_t spa_has_checkpoint(spa_t *spa); extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); extern boolean_t spa_suspend_async_destroy(spa_t *spa); extern uint64_t 
spa_min_claim_txg(spa_t *spa); extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp); typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, void *arg); extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg); extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern uint64_t spa_total_metaslabs(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); extern char *spa_his_ievent_table[]; extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf); extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); extern void spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx); extern void spa_history_log_internal(spa_t *spa, const char *operation, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...) 
__printflike(4, 5); extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, zio_t *zio); extern void zfs_ereport_taskq_fini(void); extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd); extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); /* vdev cache */ extern void vdev_cache_stat_init(void); extern void vdev_cache_stat_fini(void); /* vdev mirror */ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); /* Initialization and termination */ extern void spa_init(spa_mode_t mode); extern void spa_fini(void); extern void spa_boot_init(void); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, const char *name); /* waiting for pool activities to complete */ extern int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited); 
extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity,
    uint64_t tag, boolean_t *waited);
extern void spa_notify_waiters(spa_t *spa);
extern void spa_wake_waiters(spa_t *spa);

/*
 * module param call functions
 *
 * The parameter list is whatever ZFS_MODULE_PARAM_ARGS expands to on the
 * platform being built (the userland stub in zfs_context.h defines it as
 * void).
 */
int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS);
int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS);
int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS);
int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS);

/*
 * dprintf_bp(bp, fmt, ...): debug print of a message followed by a formatted
 * block pointer.
 *
 * With ZFS_DEBUG defined, and only when zfs_flags has ZFS_DEBUG_DPRINTF set,
 * it allocates a BP_SPRINTF_LEN-byte buffer with kmem_alloc(KM_SLEEP),
 * renders 'bp' into it with snprintf_blkptr(), passes fmt with " %s\n"
 * appended to dprintf(), and frees the buffer.  Because __VA_ARGS__ is
 * followed by a comma, at least one variadic argument must be supplied.
 *
 * Without ZFS_DEBUG the macro expands to nothing, so none of its arguments
 * are evaluated.
 */
#ifdef ZFS_DEBUG
#define	dprintf_bp(bp, fmt, ...) do {				\
	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));	\
	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
	} \
_NOTE(CONSTCOND) } while (0)
#else
#define	dprintf_bp(bp, fmt, ...)
#endif

/* Globals declared here; their definitions are outside this header. */
extern spa_mode_t spa_mode_global;
extern int zfs_deadman_enabled;
extern unsigned long zfs_deadman_synctime_ms;
extern unsigned long zfs_deadman_ziotime_ms;
extern unsigned long zfs_deadman_checktime_ms;

#ifdef	__cplusplus
}
#endif

#endif	/* _SYS_SPA_H */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 89afa98253f7..aa4338ed2859 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -1,776 +1,770 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H #ifdef __cplusplus extern "C" { #endif /* * This code compiles in three different contexts. When __KERNEL__ is defined, * the code uses "unix-like" kernel interfaces. When _STANDALONE is defined, the * code is running in a reduced capacity environment of the boot loader which is * generally a subset of both POSIX and kernel interfaces (with a few unique * interfaces too). When neither are defined, it's in a userland POSIX or * similar environment. */ #if defined(__KERNEL__) || defined(_STANDALONE) #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #else /* _KERNEL || _STANDALONE */ #define _SYS_MUTEX_H #define _SYS_RWLOCK_H #define _SYS_CONDVAR_H #define _SYS_VNODE_H #define _SYS_VFS_H #define _SYS_SUNDDI_H #define _SYS_CALLB_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Stack */ #define noinline __attribute__((noinline)) #define likely(x) __builtin_expect((x), 1) #define unlikely(x) 
__builtin_expect((x), 0) /* * Debugging */ /* * Note that we are not using the debugging levels. */ #define CE_CONT 0 /* continuation */ #define CE_NOTE 1 /* notice */ #define CE_WARN 2 /* warning */ #define CE_PANIC 3 /* panic */ #define CE_IGNORE 4 /* print nothing */ /* * ZFS debugging */ extern void dprintf_setup(int *argc, char **argv); extern void cmn_err(int, const char *, ...); extern void vcmn_err(int, const char *, va_list); extern void panic(const char *, ...) __NORETURN; extern void vpanic(const char *, va_list) __NORETURN; #define fm_panic panic /* * DTrace SDT probes have different signatures in userland than they do in * the kernel. If they're being used in kernel code, re-define them out of * existence for their counterparts in libzpool. * * Here's an example of how to use the set-error probes in userland: * zfs$target:::set-error /arg0 == EBUSY/ {stack();} * * Here's an example of how to use DTRACE_PROBE probes in userland: * If there is a probe declared as follows: * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn); * Then you can use it as follows: * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/ * {printf("%u %p\n", arg1, arg2);} */ #ifdef DTRACE_PROBE #undef DTRACE_PROBE #endif /* DTRACE_PROBE */ #define DTRACE_PROBE(a) #ifdef DTRACE_PROBE1 #undef DTRACE_PROBE1 #endif /* DTRACE_PROBE1 */ #define DTRACE_PROBE1(a, b, c) #ifdef DTRACE_PROBE2 #undef DTRACE_PROBE2 #endif /* DTRACE_PROBE2 */ #define DTRACE_PROBE2(a, b, c, d, e) #ifdef DTRACE_PROBE3 #undef DTRACE_PROBE3 #endif /* DTRACE_PROBE3 */ #define DTRACE_PROBE3(a, b, c, d, e, f, g) #ifdef DTRACE_PROBE4 #undef DTRACE_PROBE4 #endif /* DTRACE_PROBE4 */ #define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) /* * Tunables. 
*/ typedef struct zfs_kernel_param { const char *name; /* unused stub */ } zfs_kernel_param_t; #define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) #define ZFS_MODULE_PARAM_ARGS void #define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \ getfunc, perm, desc) /* * Threads. */ typedef pthread_t kthread_t; #define TS_RUN 0x00000002 #define TS_JOINABLE 0x00000004 #define curthread ((void *)(uintptr_t)pthread_self()) #define kpreempt(x) yield() #define getcomm() "unknown" #define thread_create_named(name, stk, stksize, func, arg, len, \ pp, state, pri) \ zk_thread_create(func, arg, stksize, state) #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(func, arg, stksize, state) #define thread_exit() pthread_exit(NULL) #define thread_join(t) pthread_join((pthread_t)(t), NULL) #define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) /* in libzpool, p0 exists only to have its address taken */ typedef struct proc { uintptr_t this_is_never_used_dont_dereference_it; } proc_t; extern struct proc p0; #define curproc (&p0) #define PS_NONE -1 extern kthread_t *zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state); #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) #define kpreempt_disable() ((void)0) #define kpreempt_enable() ((void)0) #define cond_resched() sched_yield() /* * Mutexes */ typedef struct kmutex { pthread_mutex_t m_lock; pthread_t m_owner; } kmutex_t; #define MUTEX_DEFAULT 0 #define MUTEX_NOLOCKDEP MUTEX_DEFAULT #define MUTEX_HELD(mp) pthread_equal((mp)->m_owner, pthread_self()) #define MUTEX_NOT_HELD(mp) !MUTEX_HELD(mp) extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); extern void mutex_destroy(kmutex_t *mp); extern void mutex_enter(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); #define NESTED_SINGLE 1 #define mutex_enter_nested(mp, class) mutex_enter(mp) /* * RW locks */ typedef struct krwlock { 
pthread_rwlock_t rw_lock; pthread_t rw_owner; uint_t rw_readers; } krwlock_t; typedef int krw_t; #define RW_READER 0 #define RW_WRITER 1 #define RW_DEFAULT RW_READER #define RW_NOLOCKDEP RW_READER #define RW_READ_HELD(rw) ((rw)->rw_readers > 0) #define RW_WRITE_HELD(rw) pthread_equal((rw)->rw_owner, pthread_self()) #define RW_LOCK_HELD(rw) (RW_READ_HELD(rw) || RW_WRITE_HELD(rw)) extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); extern void rw_destroy(krwlock_t *rwlp); extern void rw_enter(krwlock_t *rwlp, krw_t rw); extern int rw_tryenter(krwlock_t *rwlp, krw_t rw); extern int rw_tryupgrade(krwlock_t *rwlp); extern void rw_exit(krwlock_t *rwlp); #define rw_downgrade(rwlp) do { } while (0) /* * Credentials */ extern uid_t crgetuid(cred_t *cr); extern uid_t crgetruid(cred_t *cr); extern gid_t crgetgid(cred_t *cr); extern int crgetngroups(cred_t *cr); extern gid_t *crgetgroups(cred_t *cr); /* * Condition variables */ typedef pthread_cond_t kcondvar_t; #define CV_DEFAULT 0 #define CALLOUT_FLAG_ABSOLUTE 0x2 extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); extern void cv_destroy(kcondvar_t *cv); extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp); extern int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); extern int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag); extern void cv_signal(kcondvar_t *cv); extern void cv_broadcast(kcondvar_t *cv); #define cv_timedwait_io(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_timedwait_idle(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_timedwait_sig(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_wait_io(cv, mp) cv_wait(cv, mp) #define cv_wait_idle(cv, mp) cv_wait(cv, mp) #define cv_wait_io_sig(cv, mp) cv_wait_sig(cv, mp) #define cv_timedwait_sig_hires(cv, mp, t, r, f) \ cv_timedwait_hires(cv, mp, t, r, f) #define cv_timedwait_idle_hires(cv, mp, t, r, f) \ cv_timedwait_hires(cv, mp, t, 
r, f) /* * Thread-specific data */ #define tsd_get(k) pthread_getspecific(k) #define tsd_set(k, v) pthread_setspecific(k, v) #define tsd_create(kp, d) pthread_key_create((pthread_key_t *)kp, d) #define tsd_destroy(kp) /* nothing */ #ifdef __FreeBSD__ typedef off_t loff_t; #endif /* * kstat creation, installation and deletion */ extern kstat_t *kstat_create(const char *, int, const char *, const char *, uchar_t, ulong_t, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); -extern void kstat_waitq_enter(kstat_io_t *); -extern void kstat_waitq_exit(kstat_io_t *); -extern void kstat_runq_enter(kstat_io_t *); -extern void kstat_runq_exit(kstat_io_t *); -extern void kstat_waitq_to_runq(kstat_io_t *); -extern void kstat_runq_back_to_waitq(kstat_io_t *); extern void kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)); /* * procfs list manipulation */ typedef struct procfs_list { void *pl_private; kmutex_t pl_lock; list_t pl_list; uint64_t pl_next_id; size_t pl_node_offset; } procfs_list_t; #ifndef __cplusplus struct seq_file { }; void seq_printf(struct seq_file *m, const char *fmt, ...); typedef struct procfs_list_node { list_node_t pln_link; uint64_t pln_id; } procfs_list_node_t; void procfs_list_install(const char *module, const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, int (*show)(struct seq_file *f, void *p), int (*show_header)(struct seq_file *f), int (*clear)(procfs_list_t *procfs_list), size_t procfs_list_node_off); void procfs_list_uninstall(procfs_list_t *procfs_list); void procfs_list_destroy(procfs_list_t *procfs_list); void procfs_list_add(procfs_list_t *procfs_list, void *p); #endif /* * Kernel memory */ #define KM_SLEEP UMEM_NOFAIL #define KM_PUSHPAGE KM_SLEEP #define KM_NOSLEEP UMEM_DEFAULT #define KM_NORMALPRI 0 /* not needed with UMEM_DEFAULT */ #define KMC_NODEBUG UMC_NODEBUG 
#define KMC_KVMEM 0x0 /* The kmem_* and vmem_* allocation macros that follow map onto the umem_* calls (vmem_* via kmem_*). */ #define kmem_alloc(_s, _f) umem_alloc(_s, _f) #define kmem_zalloc(_s, _f) umem_zalloc(_s, _f) #define kmem_free(_b, _s) umem_free(_b, _s) #define vmem_alloc(_s, _f) kmem_alloc(_s, _f) #define vmem_zalloc(_s, _f) kmem_zalloc(_s, _f) #define vmem_free(_b, _s) kmem_free(_b, _s) #define kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \ umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) #define kmem_cache_destroy(_c) umem_cache_destroy(_c) #define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f) #define kmem_cache_free(_c, _b) umem_cache_free(_c, _b) #define kmem_debugging() 0 /* Expression only: the expansion carries no trailing ';', the caller supplies it. */ #define kmem_cache_reap_now(_c) umem_cache_reap_now(_c) #define kmem_cache_set_move(_c, _cb) /* nothing */ #define POINTER_INVALIDATE(_pp) /* nothing */ #define POINTER_IS_VALID(_p) 0 typedef umem_cache_t kmem_cache_t; typedef enum kmem_cbrc { KMEM_CBRC_YES, KMEM_CBRC_NO, KMEM_CBRC_LATER, KMEM_CBRC_DONT_NEED, KMEM_CBRC_DONT_KNOW } kmem_cbrc_t; /* * Task queues */ #define TASKQ_NAMELEN 31 typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); typedef struct taskq_ent { struct taskq_ent *tqent_next; struct taskq_ent *tqent_prev; task_func_t *tqent_func; void *tqent_arg; uintptr_t tqent_flags; } taskq_ent_t; typedef struct taskq { char tq_name[TASKQ_NAMELEN + 1]; kmutex_t tq_lock; krwlock_t tq_threadlock; kcondvar_t tq_dispatch_cv; kcondvar_t tq_wait_cv; kthread_t **tq_threadlist; int tq_flags; int tq_active; int tq_nthreads; int tq_nalloc; int tq_minalloc; int tq_maxalloc; kcondvar_t tq_maxalloc_cv; int tq_maxalloc_wait; taskq_ent_t *tq_freelist; taskq_ent_t tq_task; } taskq_t; #define TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */ #define TASKQ_PREPOPULATE 0x0001 #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ #define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale # threads by # cpus */ #define TASKQ_DC_BATCH 0x0010 /* Mark threads as batch */ #define TQ_SLEEP KM_SLEEP /* Can block 
for memory */ #define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_FRONT 0x08 /* Queue in front */ #define TASKQID_INVALID ((taskqid_t)0) extern taskq_t *system_taskq; extern taskq_t *system_delay_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); #define taskq_create_proc(a, b, c, d, e, p, f) \ (taskq_create(a, b, c, d, e, f)) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ (taskq_create(a, b, maxclsyspri, d, e, f)) extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); extern void taskq_init_ent(taskq_ent_t *); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern int taskq_member(taskq_t *, kthread_t *); extern taskq_t *taskq_of_curthread(void); extern int taskq_cancel_id(taskq_t *, taskqid_t); extern void system_taskq_init(void); extern void system_taskq_fini(void); #define XVA_MAPSIZE 3 #define XVA_MAGIC 0x78766174 extern char *vn_dumpdir; #define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ typedef struct xoptattr { inode_timespec_t xoa_createtime; /* Create time of file */ uint8_t xoa_archive; uint8_t xoa_system; uint8_t xoa_readonly; uint8_t xoa_hidden; uint8_t xoa_nounlink; uint8_t xoa_immutable; uint8_t xoa_appendonly; uint8_t xoa_nodump; uint8_t xoa_settable; uint8_t xoa_opaque; uint8_t xoa_av_quarantined; uint8_t xoa_av_modified; uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; uint8_t xoa_reparse; uint8_t xoa_offline; uint8_t xoa_sparse; } xoptattr_t; typedef struct vattr { uint_t va_mask; /* bit-mask of attributes */ u_offset_t va_size; /* file size in bytes */ } 
vattr_t; typedef struct xvattr { vattr_t xva_vattr; /* Embedded vattr structure */ uint32_t xva_magic; /* Magic Number */ uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ xoptattr_t xva_xoptattrs; /* Optional attributes */ } xvattr_t; typedef struct vsecattr { uint_t vsa_mask; /* See below */ int vsa_aclcnt; /* ACL entry count */ void *vsa_aclentp; /* pointer to ACL entries */ int vsa_dfaclcnt; /* default ACL entry count */ void *vsa_dfaclentp; /* pointer to default ACL entries */ size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ } vsecattr_t; #define AT_MODE 0x00002 #define AT_UID 0x00004 #define AT_GID 0x00008 #define AT_FSID 0x00010 #define AT_NODEID 0x00020 #define AT_NLINK 0x00040 #define AT_SIZE 0x00080 #define AT_ATIME 0x00100 #define AT_MTIME 0x00200 #define AT_CTIME 0x00400 #define AT_RDEV 0x00800 #define AT_BLKSIZE 0x01000 #define AT_NBLOCKS 0x02000 #define AT_SEQ 0x08000 #define AT_XVATTR 0x10000 #define CRCREAT 0 #define F_FREESP 11 #define FIGNORECASE 0x80000 /* request case-insensitive lookups */ /* * Random stuff */ #define ddi_get_lbolt() (gethrtime() >> 23) #define ddi_get_lbolt64() (gethrtime() >> 23) #define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */ /* Arguments are parenthesized so compound expressions are compared as a whole; the after/eq variants are built on the before forms. */ #define ddi_time_before(a, b) ((a) < (b)) #define ddi_time_after(a, b) ddi_time_before(b, a) #define ddi_time_before_eq(a, b) (!ddi_time_after(a, b)) #define ddi_time_after_eq(a, b) ddi_time_before_eq(b, a) #define ddi_time_before64(a, b) ((a) < (b)) #define ddi_time_after64(a, b) ddi_time_before64(b, a) #define ddi_time_before_eq64(a, b) (!ddi_time_after64(a, b)) #define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a) extern void delay(clock_t ticks); #define SEC_TO_TICK(sec) ((sec) * hz) #define MSEC_TO_TICK(msec) (howmany((hrtime_t)(msec) * hz, MILLISEC)) #define USEC_TO_TICK(usec) 
(howmany((hrtime_t)(usec) * hz, MICROSEC)) #define NSEC_TO_TICK(nsec) (howmany((hrtime_t)(nsec) * hz, NANOSEC)) #define max_ncpus 64 #define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN)) /* * Process priorities as defined by setpriority(2) and getpriority(2). */ #define minclsyspri 19 #define maxclsyspri -20 #define defclsyspri 0 #define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1)) #define CPU_SEQID_UNSTABLE CPU_SEQID #define kcred NULL #define CRED() NULL #define ptob(x) ((x) * PAGESIZE) #define NN_DIVISOR_1000 (1U << 0) #define NN_NUMBUF_SZ (6) extern uint64_t physmem; extern const char *random_path; extern const char *urandom_path; extern int highbit64(uint64_t i); extern int lowbit64(uint64_t i); extern int random_get_bytes(uint8_t *ptr, size_t len); extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); extern void kernel_init(int mode); extern void kernel_fini(void); extern void random_init(void); extern void random_fini(void); struct spa; extern void show_pool_stats(struct spa *); extern int set_global_var(char const *arg); typedef struct callb_cpr { kmutex_t *cc_lockp; } callb_cpr_t; #define CALLB_CPR_INIT(cp, lockp, func, name) { \ (cp)->cc_lockp = lockp; \ } #define CALLB_CPR_SAFE_BEGIN(cp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ } #define CALLB_CPR_SAFE_END(cp, lockp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ } #define CALLB_CPR_EXIT(cp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ mutex_exit((cp)->cc_lockp); \ } #define zone_dataset_visible(x, y) (1) #define INGLOBALZONE(z) (1) extern uint32_t zone_get_hostid(void *zonep); extern char *kmem_vasprintf(const char *fmt, va_list adx); extern char *kmem_asprintf(const char *fmt, ...); #define kmem_strfree(str) kmem_free((str), strlen(str) + 1) #define kmem_strdup(s) strdup(s) /* * Hostname information */ extern char hw_serial[]; /* for userland-emulated hostid access */ extern int ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result); extern int ddi_strtoull(const char 
*str, char **nptr, int base, u_longlong_t *result); typedef struct utsname utsname_t; extern utsname_t *utsname(void); /* ZFS Boot Related stuff. */ struct _buf { intptr_t _fd; }; struct bootstat { uint64_t st_size; }; typedef struct ace_object { uid_t a_who; uint32_t a_access_mask; uint16_t a_flags; uint16_t a_type; uint8_t a_obj_type[16]; uint8_t a_inherit_obj_type[16]; } ace_object_t; #define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 #define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 #define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 #define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int secpolicy_zfs(const cred_t *cr); extern int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc); extern zoneid_t getzoneid(void); /* SID stuff */ typedef struct ksiddomain { uint_t kd_ref; uint_t kd_len; char *kd_name; } ksiddomain_t; ksiddomain_t *ksid_lookupdomain(const char *); void ksiddomain_rele(ksiddomain_t *); #define DDI_SLEEP KM_SLEEP #define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \ sysevent_post_event(_c, _d, _b, "libzpool", _e, _f) #define zfs_sleep_until(wakeup) \ do { \ hrtime_t delta = wakeup - gethrtime(); \ struct timespec ts; \ ts.tv_sec = delta / NANOSEC; \ ts.tv_nsec = delta % NANOSEC; \ (void) nanosleep(&ts, NULL); \ } while (0) typedef int fstrans_cookie_t; extern fstrans_cookie_t spl_fstrans_mark(void); extern void spl_fstrans_unmark(fstrans_cookie_t); extern int __spl_pf_fstrans_check(void); extern int kmem_cache_reap_active(void); #define ____cacheline_aligned /* * Kernel modules */ #define __init #define __exit #endif /* _KERNEL || _STANDALONE */ #ifdef __cplusplus }; #endif #endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h index 69fb6d401fc7..f73fb92eb797 100644 --- 
a/lib/libspl/include/sys/kstat.h +++ b/lib/libspl/include/sys/kstat.h @@ -1,822 +1,816 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_KSTAT_H #define _SYS_KSTAT_H /* * Definition of general kernel statistics structures and /dev/kstat ioctls */ /* NOTE(review): the two #include directives below have lost their header names in this copy - restore them from the original file. */ #include #include #ifdef __cplusplus extern "C" { #endif typedef int kid_t; /* unique kstat id */ /* * Kernel statistics driver (/dev/kstat) ioctls * * Each value is fully parenthesized so it can be used inside a larger * expression, e.g. (cmd == KSTAT_IOC_READ). */ #define KSTAT_IOC_BASE ('K' << 8) #define KSTAT_IOC_CHAIN_ID (KSTAT_IOC_BASE | 0x01) #define KSTAT_IOC_READ (KSTAT_IOC_BASE | 0x02) #define KSTAT_IOC_WRITE (KSTAT_IOC_BASE | 0x03) /* * /dev/kstat ioctl usage (kd denotes /dev/kstat descriptor): * * kcid = ioctl(kd, KSTAT_IOC_CHAIN_ID, NULL); * kcid = ioctl(kd, KSTAT_IOC_READ, kstat_t *); * kcid = ioctl(kd, KSTAT_IOC_WRITE, kstat_t *); */ #define KSTAT_STRLEN 255 /* 254 chars + NUL; must be 16 * n - 1 */ /* * The generic kstat header */ typedef struct kstat { /* * Fields relevant to both kernel and user */ hrtime_t ks_crtime; /* creation time (from gethrtime()) */ struct kstat *ks_next; /* kstat chain linkage */ kid_t ks_kid; /* unique kstat ID */ char ks_module[KSTAT_STRLEN]; /* 
provider module name */ uchar_t ks_resv; /* reserved, currently just padding */ int ks_instance; /* provider module's instance */ char ks_name[KSTAT_STRLEN]; /* kstat name */ uchar_t ks_type; /* kstat data type */ char ks_class[KSTAT_STRLEN]; /* kstat class */ uchar_t ks_flags; /* kstat flags */ void *ks_data; /* kstat type-specific data */ uint_t ks_ndata; /* # of type-specific data records */ size_t ks_data_size; /* total size of kstat data section */ hrtime_t ks_snaptime; /* time of last data snapshot */ /* * Fields relevant to kernel only */ int (*ks_update)(struct kstat *, int); /* dynamic update */ void *ks_private; /* arbitrary provider-private data */ int (*ks_snapshot)(struct kstat *, void *, int); void *ks_lock; /* protects this kstat's data */ } kstat_t; #ifdef _SYSCALL32 typedef int32_t kid32_t; typedef struct kstat32 { /* * Fields relevant to both kernel and user */ hrtime_t ks_crtime; caddr32_t ks_next; /* struct kstat pointer */ kid32_t ks_kid; char ks_module[KSTAT_STRLEN]; uint8_t ks_resv; int32_t ks_instance; char ks_name[KSTAT_STRLEN]; uint8_t ks_type; char ks_class[KSTAT_STRLEN]; uint8_t ks_flags; caddr32_t ks_data; /* type-specific data */ uint32_t ks_ndata; size32_t ks_data_size; hrtime_t ks_snaptime; /* * Fields relevant to kernel only (only needed here for padding) */ int32_t _ks_update; caddr32_t _ks_private; int32_t _ks_snapshot; caddr32_t _ks_lock; } kstat32_t; #endif /* _SYSCALL32 */ /* * kstat structure and locking strategy * * Each kstat consists of a header section (a kstat_t) and a data section. * The system maintains a set of kstats, protected by kstat_chain_lock. * kstat_chain_lock protects all additions to/deletions from this set, * as well as all changes to kstat headers. kstat data sections are * *optionally* protected by the per-kstat ks_lock. If ks_lock is non-NULL, * kstat clients (e.g. /dev/kstat) will acquire this lock for all of their * operations on that kstat. 
It is up to the kstat provider to decide whether * guaranteeing consistent data to kstat clients is sufficiently important * to justify the locking cost. Note, however, that most statistic updates * already occur under one of the provider's mutexes, so if the provider sets * ks_lock to point to that mutex, then kstat data locking is free. * * NOTE: variable-size kstats MUST employ kstat data locking, to prevent * data-size races with kstat clients. * * NOTE: ks_lock is really of type (kmutex_t *); it is declared as (void *) * in the kstat header so that users don't have to be exposed to all of the * kernel's lock-related data structures. */ #if defined(_KERNEL) #define KSTAT_ENTER(k) \ { kmutex_t *lp = (k)->ks_lock; if (lp) mutex_enter(lp); } #define KSTAT_EXIT(k) \ { kmutex_t *lp = (k)->ks_lock; if (lp) mutex_exit(lp); } #define KSTAT_UPDATE(k, rw) (*(k)->ks_update)((k), (rw)) #define KSTAT_SNAPSHOT(k, buf, rw) (*(k)->ks_snapshot)((k), (buf), (rw)) #endif /* defined(_KERNEL) */ /* * kstat time * * All times associated with kstats (e.g. creation time, snapshot time, * kstat_timer_t and kstat_io_t timestamps, etc.) are 64-bit nanosecond values, * as returned by gethrtime(). The accuracy of these timestamps is machine * dependent, but the precision (units) is the same across all platforms. */ /* * kstat identity (KID) * * Each kstat is assigned a unique KID (kstat ID) when it is added to the * global kstat chain. The KID is used as a cookie by /dev/kstat to * request information about the corresponding kstat. There is also * an identity associated with the entire kstat chain, kstat_chain_id, * which is bumped each time a kstat is added or deleted. /dev/kstat uses * the chain ID to detect changes in the kstat chain (e.g., a new disk * coming online) between ioctl()s. */ /* * kstat module, kstat instance * * ks_module and ks_instance contain the name and instance of the module * that created the kstat. In cases where there can only be one instance, * ks_instance is 0. 
The kernel proper (/kernel/unix) uses "unix" as its * module name. */ /* * kstat name * * ks_name gives a meaningful name to a kstat. The full kstat namespace * is module.instance.name, so the name only need be unique within a * module. kstat_create() will fail if you try to create a kstat with * an already-used (ks_module, ks_instance, ks_name) triplet. Spaces are * allowed in kstat names, but strongly discouraged, since they hinder * awk-style processing at user level. */ /* * kstat type * * The kstat mechanism provides several flavors of kstat data, defined * below. The "raw" kstat type is just treated as an array of bytes; you * can use this to export any kind of data you want. * * Some kstat types allow multiple data structures per kstat, e.g. * KSTAT_TYPE_NAMED; others do not. This is part of the spec for each * kstat data type. * * User-level tools should *not* rely on the #define KSTAT_NUM_TYPES. To * get this information, read out the standard system kstat "kstat_types". */ #define KSTAT_TYPE_RAW 0 /* can be anything */ /* ks_ndata >= 1 */ #define KSTAT_TYPE_NAMED 1 /* name/value pair */ /* ks_ndata >= 1 */ #define KSTAT_TYPE_INTR 2 /* interrupt statistics */ /* ks_ndata == 1 */ #define KSTAT_TYPE_IO 3 /* I/O statistics */ /* ks_ndata == 1 */ #define KSTAT_TYPE_TIMER 4 /* event timer */ /* ks_ndata >= 1 */ #define KSTAT_NUM_TYPES 5 /* * kstat class * * Each kstat can be characterized as belonging to some broad class * of statistics, e.g. disk, tape, net, vm, streams, etc. This field * can be used as a filter to extract related kstats. The following * values are currently in use: disk, tape, net, controller, vm, kvm, * hat, streams, kstat, and misc. (The kstat class encompasses things * like kstat_types.) */ /* * kstat flags * * Any of the following flags may be passed to kstat_create(). They are * all zero by default. 
* * KSTAT_FLAG_VIRTUAL: * * Tells kstat_create() not to allocate memory for the * kstat data section; instead, you will set the ks_data * field to point to the data you wish to export. This * provides a convenient way to export existing data * structures. * * KSTAT_FLAG_VAR_SIZE: * * The size of the kstat you are creating will vary over time. * For example, you may want to use the kstat mechanism to * export a linked list. NOTE: The kstat framework does not * manage the data section, so all variable-size kstats must be * virtual kstats. Moreover, variable-size kstats MUST employ * kstat data locking to prevent data-size races with kstat * clients. See the section on "kstat snapshot" for details. * * KSTAT_FLAG_WRITABLE: * * Makes the kstat's data section writable by root. * The ks_snapshot routine (see below) does not need to check for * this; permission checking is handled in the kstat driver. * * KSTAT_FLAG_PERSISTENT: * * Indicates that this kstat is to be persistent over time. * For persistent kstats, kstat_delete() simply marks the * kstat as dormant; a subsequent kstat_create() reactivates * the kstat. This feature is provided so that statistics * are not lost across driver close/open (e.g., raw disk I/O * on a disk with no mounted partitions.) * NOTE: Persistent kstats cannot be virtual, since ks_data * points to garbage as soon as the driver goes away. * * The following flags are maintained by the kstat framework: * * KSTAT_FLAG_DORMANT: * * For persistent kstats, indicates that the kstat is in the * dormant state (e.g., the corresponding device is closed). * * KSTAT_FLAG_INVALID: * * This flag is set when a kstat is in a transitional state, * e.g. between kstat_create() and kstat_install(). * kstat clients must not attempt to access the kstat's data * if this flag is set. 
*/ #define KSTAT_FLAG_VIRTUAL 0x01 #define KSTAT_FLAG_VAR_SIZE 0x02 #define KSTAT_FLAG_WRITABLE 0x04 #define KSTAT_FLAG_PERSISTENT 0x08 #define KSTAT_FLAG_DORMANT 0x10 #define KSTAT_FLAG_INVALID 0x20 #define KSTAT_FLAG_LONGSTRINGS 0x40 #define KSTAT_FLAG_NO_HEADERS 0x80 /* * Dynamic update support * * The kstat mechanism allows for an optional ks_update function to update * kstat data. This is useful for drivers where the underlying device * keeps cheap hardware stats, but extraction is expensive. Instead of * constantly keeping the kstat data section up to date, you can supply a * ks_update function which updates the kstat's data section on demand. * To take advantage of this feature, simply set the ks_update field before * calling kstat_install(). * * The ks_update function, if supplied, must have the following structure: * * int * foo_kstat_update(kstat_t *ksp, int rw) * { * if (rw == KSTAT_WRITE) { * ... update the native stats from ksp->ks_data; * return EACCES if you don't support this * } else { * ... update ksp->ks_data from the native stats * } * } * * The ks_update return codes are: 0 for success, EACCES if you don't allow * KSTAT_WRITE, and EIO for any other type of error. * * In general, the ks_update function may need to refer to provider-private * data; for example, it may need a pointer to the provider's raw statistics. * The ks_private field is available for this purpose. Its use is entirely * at the provider's discretion. * * All variable-size kstats MUST supply a ks_update routine, which computes * and sets ks_data_size (and ks_ndata if that is meaningful), since these * are needed to perform kstat snapshots (see below). * * No kstat locking should be done inside the ks_update routine. The caller * will already be holding the kstat's ks_lock (to ensure consistent data). */ #define KSTAT_READ 0 #define KSTAT_WRITE 1 /* * Kstat snapshot * * In order to get a consistent view of a kstat's data, clients must obey * the kstat's locking strategy. 
However, these clients may need to perform * operations on the data which could cause a fault (e.g. copyout()), or * operations which are simply expensive. Doing so could cause deadlock * (e.g. if you're holding a disk's kstat lock which is ultimately required * to resolve a copyout() fault), performance degradation (since the providers' * activity is serialized at the kstat lock), device timing problems, etc. * * To avoid these problems, kstat data is provided via snapshots. Taking * a snapshot is a simple process: allocate a wired-down kernel buffer, * acquire the kstat's data lock, copy the data into the buffer ("take the * snapshot"), and release the lock. This ensures that the kstat's data lock * will be held as briefly as possible, and that no faults will occur while * the lock is held. * * Normally, the snapshot is taken by default_kstat_snapshot(), which * timestamps the data (sets ks_snaptime), copies it, and does a little * massaging to deal with incomplete transactions on i/o kstats. However, * this routine only works for kstats with contiguous data (the typical case). * If you create a kstat whose data is, say, a linked list, you must provide * your own ks_snapshot routine. The routine you supply must have the * following prototype (replace "foo" with something appropriate): * * int foo_kstat_snapshot(kstat_t *ksp, void *buf, int rw); * * The minimal snapshot routine -- one which copies contiguous data that * doesn't need any massaging -- would be this: * * ksp->ks_snaptime = gethrtime(); * if (rw == KSTAT_WRITE) * bcopy(buf, ksp->ks_data, ksp->ks_data_size); * else * bcopy(ksp->ks_data, buf, ksp->ks_data_size); * return (0); * * A more illuminating example is taking a snapshot of a linked list: * * ksp->ks_snaptime = gethrtime(); * if (rw == KSTAT_WRITE) * return (EACCES); ... See below ... 
 * for (foo = first_foo; foo; foo = foo->next) { * bcopy((char *) foo, (char *) buf, sizeof (struct foo)); * buf = ((struct foo *) buf) + 1; * } * return (0); * * In the example above, we have decided that we don't want to allow * KSTAT_WRITE access, so we return EACCES if this is attempted. * * The key points are: * * (1) ks_snaptime must be set (via gethrtime()) to timestamp the data. * (2) Data gets copied from the kstat to the buffer on KSTAT_READ, * and from the buffer to the kstat on KSTAT_WRITE. * (3) ks_snapshot return values are: 0 for success, EACCES if you * don't allow KSTAT_WRITE, and EIO for any other type of error. * * Named kstats (see section on "Named statistics" below) containing long * strings (KSTAT_DATA_STRING) need special handling. The kstat driver * assumes that all strings are copied into the buffer after the array of * named kstats, and the pointers (KSTAT_NAMED_STR_PTR()) are updated to point * into the copy within the buffer. The default snapshot routine does this, * but overriding routines should contain at least the following: * * if (rw == KSTAT_READ) { * kstat_named_t *knp = buf; * char *end = (char *)(knp + ksp->ks_ndata); * uint_t i; * * ... Do the regular copy ... * bcopy(ksp->ks_data, buf, sizeof (kstat_named_t) * ksp->ks_ndata); * * for (i = 0; i < ksp->ks_ndata; i++, knp++) { * if (knp->data_type == KSTAT_DATA_STRING && * KSTAT_NAMED_STR_PTR(knp) != NULL) { * bcopy(KSTAT_NAMED_STR_PTR(knp), end, * KSTAT_NAMED_STR_BUFLEN(knp)); * KSTAT_NAMED_STR_PTR(knp) = end; * end += KSTAT_NAMED_STR_BUFLEN(knp); * } * } * } */ /* * Named statistics. * * List of arbitrary name=value statistics. 
*/ typedef struct kstat_named { char name[KSTAT_STRLEN]; /* name of counter */ uchar_t data_type; /* data type */ union { char c[16]; /* enough for 128-bit ints */ int32_t i32; uint32_t ui32; struct { union { char *ptr; /* NULL-term string */ #if defined(_KERNEL) && defined(_MULTI_DATAMODEL) caddr32_t ptr32; #endif char __pad[8]; /* 64-bit padding */ } addr; uint32_t len; /* # bytes for strlen + '\0' */ } str; /* * The int64_t and uint64_t types are not valid for a maximally conformant * 32-bit compilation environment (cc -Xc) using compilers prior to the * introduction of C99 conforming compiler (reference ISO/IEC 9899:1990). * In these cases, the visibility of i64 and ui64 is only permitted for * 64-bit compilation environments or 32-bit non-maximally conformant * C89 or C90 ANSI C compilation environments (cc -Xt and cc -Xa). In the * C99 ANSI C compilation environment, the long long type is supported. * The _INT64_TYPE is defined by the implementation (see sys/int_types.h). */ #if defined(_INT64_TYPE) int64_t i64; uint64_t ui64; #endif long l; ulong_t ul; /* These structure members are obsolete */ longlong_t ll; u_longlong_t ull; float f; double d; } value; /* value of counter */ } kstat_named_t; #define KSTAT_DATA_CHAR 0 #define KSTAT_DATA_INT32 1 #define KSTAT_DATA_UINT32 2 #define KSTAT_DATA_INT64 3 #define KSTAT_DATA_UINT64 4 #if !defined(_LP64) #define KSTAT_DATA_LONG KSTAT_DATA_INT32 #define KSTAT_DATA_ULONG KSTAT_DATA_UINT32 #else #if !defined(_KERNEL) #define KSTAT_DATA_LONG KSTAT_DATA_INT64 #define KSTAT_DATA_ULONG KSTAT_DATA_UINT64 #else #define KSTAT_DATA_LONG 7 /* only visible to the kernel */ #define KSTAT_DATA_ULONG 8 /* only visible to the kernel */ #endif /* !_KERNEL */ #endif /* !_LP64 */ /* * Statistics exporting named kstats with long strings (KSTAT_DATA_STRING) * may not make the assumption that ks_data_size is equal to (ks_ndata * sizeof * (kstat_named_t)). 
ks_data_size in these cases is equal to the sum of the * amount of space required to store the strings (ie, the sum of * KSTAT_NAMED_STR_BUFLEN() for all KSTAT_DATA_STRING statistics) plus the * space required to store the kstat_named_t's. * * The default update routine will update ks_data_size automatically for * variable-length kstats containing long strings (using the default update * routine only makes sense if the string is the only thing that is changing * in size, and ks_ndata is constant). Fixed-length kstats containing long * strings must explicitly change ks_data_size (after creation but before * initialization) to reflect the correct amount of space required for the * long strings and the kstat_named_t's. */ #define KSTAT_DATA_STRING 9 /* These types are obsolete */ #define KSTAT_DATA_LONGLONG KSTAT_DATA_INT64 #define KSTAT_DATA_ULONGLONG KSTAT_DATA_UINT64 #define KSTAT_DATA_FLOAT 5 #define KSTAT_DATA_DOUBLE 6 #define KSTAT_NAMED_PTR(kptr) ((kstat_named_t *)(kptr)->ks_data) /* * Retrieve the pointer of the string contained in the given named kstat. */ #define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.str.addr.ptr) /* * Retrieve the length of the buffer required to store the string in the given * named kstat. */ #define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.str.len) /* * Interrupt statistics. * * An interrupt is a hard interrupt (sourced from the hardware device * itself), a soft interrupt (induced by the system via the use of * some system interrupt source), a watchdog interrupt (induced by * a periodic timer call), spurious (an interrupt entry point was * entered but there was no interrupt condition to service), * or multiple service (an interrupt condition was detected and * serviced just prior to returning from any of the other types). * * Measurement of the spurious class of interrupts is useful for * autovectored devices in order to pinpoint any interrupt latency * problems in a particular system configuration. 
* * Devices that have more than one interrupt of the same * type should use multiple structures. */ #define KSTAT_INTR_HARD 0 #define KSTAT_INTR_SOFT 1 #define KSTAT_INTR_WATCHDOG 2 #define KSTAT_INTR_SPURIOUS 3 #define KSTAT_INTR_MULTSVC 4 #define KSTAT_NUM_INTRS 5 typedef struct kstat_intr { uint_t intrs[KSTAT_NUM_INTRS]; /* interrupt counters */ } kstat_intr_t; #define KSTAT_INTR_PTR(kptr) ((kstat_intr_t *)(kptr)->ks_data) /* * I/O statistics. */ typedef struct kstat_io { /* * Basic counters. * * The counters should be updated at the end of service * (e.g., just prior to calling biodone()). */ u_longlong_t nread; /* number of bytes read */ u_longlong_t nwritten; /* number of bytes written */ uint_t reads; /* number of read operations */ uint_t writes; /* number of write operations */ /* * Accumulated time and queue length statistics. * * Accumulated time statistics are kept as a running sum * of "active" time. Queue length statistics are kept as a * running sum of the product of queue length and elapsed time * at that length -- i.e., a Riemann sum for queue length * integrated against time. (You can also think of the active time * as a Riemann sum, for the boolean function (queue_length > 0) * integrated against time, or you can think of it as the * Lebesgue measure of the set on which queue_length > 0.) * * ^ * | _________ * 8 | i4 | * | | | * Queue 6 | | * Length | _________ | | * 4 | i2 |_______| | * | | i3 | * 2_______| | * | i1 | * |_______________________________| * Time-> t1 t2 t3 t4 * * At each change of state (entry or exit from the queue), * we add the elapsed time (since the previous state change) * to the active time if the queue length was non-zero during * that interval; and we add the product of the elapsed time * times the queue length to the running length*time sum. * * This method is generalizable to measuring residency * in any defined system: instead of queue lengths, think * of "outstanding RPC calls to server X". 
* * A large number of I/O subsystems have at least two basic * "lists" of transactions they manage: one for transactions * that have been accepted for processing but for which processing * has yet to begin, and one for transactions which are actively * being processed (but not done). For this reason, two cumulative * time statistics are defined here: wait (pre-service) time, * and run (service) time. * * All times are 64-bit nanoseconds (hrtime_t), as returned by * gethrtime(). * * The units of cumulative busy time are accumulated nanoseconds. * The units of cumulative length*time products are elapsed time * times queue length. * * Updates to the fields below are performed implicitly by calls to * these five functions: * * kstat_waitq_enter() * kstat_waitq_exit() * kstat_runq_enter() * kstat_runq_exit() * * kstat_waitq_to_runq() (see below) * kstat_runq_back_to_waitq() (see below) * * Since kstat_waitq_exit() is typically followed immediately * by kstat_runq_enter(), there is a single kstat_waitq_to_runq() * function which performs both operations. This is a performance * win since only one timestamp is required. * * In some instances, it may be necessary to move a request from * the run queue back to the wait queue, e.g. for write throttling. * For these situations, call kstat_runq_back_to_waitq(). * * These fields should never be updated by any other means. */ hrtime_t wtime; /* cumulative wait (pre-service) time */ hrtime_t wlentime; /* cumulative wait length*time product */ hrtime_t wlastupdate; /* last time wait queue changed */ hrtime_t rtime; /* cumulative run (service) time */ hrtime_t rlentime; /* cumulative run length*time product */ hrtime_t rlastupdate; /* last time run queue changed */ uint_t wcnt; /* count of elements in wait state */ uint_t rcnt; /* count of elements in run state */ } kstat_io_t; #define KSTAT_IO_PTR(kptr) ((kstat_io_t *)(kptr)->ks_data) /* * Event timer statistics - cumulative elapsed time and number of events. 
* * Updates to these fields are performed implicitly by calls to * kstat_timer_start() and kstat_timer_stop(). */ typedef struct kstat_timer { char name[KSTAT_STRLEN]; /* event name */ uchar_t resv; /* reserved */ u_longlong_t num_events; /* number of events */ hrtime_t elapsed_time; /* cumulative elapsed time */ hrtime_t min_time; /* shortest event duration */ hrtime_t max_time; /* longest event duration */ hrtime_t start_time; /* previous event start time */ hrtime_t stop_time; /* previous event stop time */ } kstat_timer_t; #define KSTAT_TIMER_PTR(kptr) ((kstat_timer_t *)(kptr)->ks_data) #if defined(_KERNEL) #include extern kid_t kstat_chain_id; /* bumped at each state change */ extern void kstat_init(void); /* initialize kstat framework */ /* * Adding and deleting kstats. * * The typical sequence to add a kstat is: * * ksp = kstat_create(module, instance, name, class, type, ndata, flags); * if (ksp) { * ... provider initialization, if necessary * kstat_install(ksp); * } * * There are three logically distinct steps here: * * Step 1: System Initialization (kstat_create) * * kstat_create() performs system initialization. kstat_create() * allocates memory for the entire kstat (header plus data), initializes * all header fields, initializes the data section to all zeroes, assigns * a unique KID, and puts the kstat onto the system's kstat chain. * The returned kstat is marked invalid (KSTAT_FLAG_INVALID is set), * because the provider (caller) has not yet had a chance to initialize * the data section. * * By default, kstats are exported to all zones on the system. A kstat may be * created via kstat_create_zone() to specify a zone to which the statistics * should be exported. kstat_zone_add() may be used to specify additional * zones to which the statistics are to be exported. * * Step 2: Provider Initialization * * The provider performs any necessary initialization of the data section, * e.g. setting the name fields in a KSTAT_TYPE_NAMED. 
Virtual kstats set * the ks_data field at this time. The provider may also set the ks_update, * ks_snapshot, ks_private, and ks_lock fields if necessary. * * Step 3: Installation (kstat_install) * * Once the kstat is completely initialized, kstat_install() clears the * INVALID flag, thus making the kstat accessible to the outside world. * kstat_install() also clears the DORMANT flag for persistent kstats. * * Removing a kstat from the system * * kstat_delete(ksp) removes ksp from the kstat chain and frees all * associated system resources. NOTE: When you call kstat_delete(), * you must NOT be holding that kstat's ks_lock. Otherwise, you may * deadlock with a kstat reader. * * Persistent kstats * * From the provider's point of view, persistence is transparent. The only * difference between ephemeral (normal) kstats and persistent kstats * is that you pass KSTAT_FLAG_PERSISTENT to kstat_create(). Magically, * this has the effect of making your data visible even when you're * not home. Persistence is important to tools like iostat, which want * to get a meaningful picture of disk activity. Without persistence, * raw disk i/o statistics could never accumulate: they would come and * go with each open/close of the raw device. * * The magic of persistence works by slightly altering the behavior of * kstat_create() and kstat_delete(). The first call to kstat_create() * creates a new kstat, as usual. However, kstat_delete() does not * actually delete the kstat: it performs one final update of the data * (i.e., calls the ks_update routine), marks the kstat as dormant, and * sets the ks_lock, ks_update, ks_private, and ks_snapshot fields back * to their default values (since they might otherwise point to garbage, * e.g. if the provider is going away). kstat clients can still access * the dormant kstat just like a live kstat; they just continue to see * the final data values as long as the kstat remains dormant. 
* All subsequent kstat_create() calls simply find the already-existing, * dormant kstat and return a pointer to it, without altering any fields. * The provider then performs its usual initialization sequence, and * calls kstat_install(). kstat_install() uses the old data values to * initialize the native data (i.e., ks_update is called with KSTAT_WRITE), * thus making it seem like you were never gone. */ extern kstat_t *kstat_create(const char *, int, const char *, const char *, uchar_t, uint_t, uchar_t); extern kstat_t *kstat_create_zone(const char *, int, const char *, const char *, uchar_t, uint_t, uchar_t, zoneid_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); extern void kstat_named_setstr(kstat_named_t *knp, const char *src); extern void kstat_set_string(char *, const char *); extern void kstat_delete_byname(const char *, int, const char *); extern void kstat_delete_byname_zone(const char *, int, const char *, zoneid_t); extern void kstat_named_init(kstat_named_t *, const char *, uchar_t); extern void kstat_timer_init(kstat_timer_t *, const char *); -extern void kstat_waitq_enter(kstat_io_t *); -extern void kstat_waitq_exit(kstat_io_t *); -extern void kstat_runq_enter(kstat_io_t *); -extern void kstat_runq_exit(kstat_io_t *); -extern void kstat_waitq_to_runq(kstat_io_t *); -extern void kstat_runq_back_to_waitq(kstat_io_t *); extern void kstat_timer_start(kstat_timer_t *); extern void kstat_timer_stop(kstat_timer_t *); extern void kstat_zone_add(kstat_t *, zoneid_t); extern void kstat_zone_remove(kstat_t *, zoneid_t); extern int kstat_zone_find(kstat_t *, zoneid_t); extern kstat_t *kstat_hold_bykid(kid_t kid, zoneid_t); extern kstat_t *kstat_hold_byname(const char *, int, const char *, zoneid_t); extern void kstat_rele(kstat_t *); #endif /* defined(_KERNEL) */ #ifdef __cplusplus } #endif #endif /* _SYS_KSTAT_H */ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index e96a1d7521d9..cc8e534e7eb5 100644 --- 
a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -1,1417 +1,1387 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Emulation of kernel services in userland. */ uint64_t physmem; char hw_serial[HW_HOSTID_LEN]; struct utsname hw_utsname; /* If set, all blocks read will be copied to the specified directory. */ char *vn_dumpdir = NULL; /* this only exists to have its address taken */ struct proc p0; /* * ========================================================================= * threads * ========================================================================= * * TS_STACK_MIN is dictated by the minimum allowed pthread stack size. While * TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for * the expected stack depth while small enough to avoid exhausting address * space with high thread counts. 
*/ #define TS_STACK_MIN MAX(PTHREAD_STACK_MIN, 32768) #define TS_STACK_MAX (256 * 1024) /*ARGSUSED*/ kthread_t * zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state) { pthread_attr_t attr; pthread_t tid; char *stkstr; int detachstate = PTHREAD_CREATE_DETACHED; VERIFY0(pthread_attr_init(&attr)); if (state & TS_JOINABLE) detachstate = PTHREAD_CREATE_JOINABLE; VERIFY0(pthread_attr_setdetachstate(&attr, detachstate)); /* * We allow the default stack size in user space to be specified by * setting the ZFS_STACK_SIZE environment variable. This allows us * the convenience of observing and debugging stack overruns in * user space. Explicitly specified stack sizes will be honored. * The usage of ZFS_STACK_SIZE is discussed further in the * ENVIRONMENT VARIABLES sections of the ztest(1) man page. */ if (stksize == 0) { stkstr = getenv("ZFS_STACK_SIZE"); if (stkstr == NULL) stksize = TS_STACK_MAX; else stksize = MAX(atoi(stkstr), TS_STACK_MIN); } VERIFY3S(stksize, >, 0); stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE); /* * If this ever fails, it may be because the stack size is not a * multiple of system page size. 
*/ VERIFY0(pthread_attr_setstacksize(&attr, stksize)); VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE)); VERIFY0(pthread_create(&tid, &attr, (void *(*)(void *))func, arg)); VERIFY0(pthread_attr_destroy(&attr)); return ((void *)(uintptr_t)tid); } /* * ========================================================================= * kstats * ========================================================================= */ /*ARGSUSED*/ kstat_t * kstat_create(const char *module, int instance, const char *name, const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag) { return (NULL); } /*ARGSUSED*/ void kstat_install(kstat_t *ksp) {} /*ARGSUSED*/ void kstat_delete(kstat_t *ksp) {} -/*ARGSUSED*/ -void -kstat_waitq_enter(kstat_io_t *kiop) -{} - -/*ARGSUSED*/ -void -kstat_waitq_exit(kstat_io_t *kiop) -{} - -/*ARGSUSED*/ -void -kstat_runq_enter(kstat_io_t *kiop) -{} - -/*ARGSUSED*/ -void -kstat_runq_exit(kstat_io_t *kiop) -{} - -/*ARGSUSED*/ -void -kstat_waitq_to_runq(kstat_io_t *kiop) -{} - -/*ARGSUSED*/ -void -kstat_runq_back_to_waitq(kstat_io_t *kiop) -{} - void kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) {} /* * ========================================================================= * mutexes * ========================================================================= */ void mutex_init(kmutex_t *mp, char *name, int type, void *cookie) { VERIFY0(pthread_mutex_init(&mp->m_lock, NULL)); memset(&mp->m_owner, 0, sizeof (pthread_t)); } void mutex_destroy(kmutex_t *mp) { VERIFY0(pthread_mutex_destroy(&mp->m_lock)); } void mutex_enter(kmutex_t *mp) { VERIFY0(pthread_mutex_lock(&mp->m_lock)); mp->m_owner = pthread_self(); } int mutex_tryenter(kmutex_t *mp) { int error; error = pthread_mutex_trylock(&mp->m_lock); if (error == 0) { mp->m_owner = pthread_self(); return (1); } else { VERIFY3S(error, ==, EBUSY); return (0); } } void mutex_exit(kmutex_t *mp) { 
memset(&mp->m_owner, 0, sizeof (pthread_t)); VERIFY0(pthread_mutex_unlock(&mp->m_lock)); } /* * ========================================================================= * rwlocks * ========================================================================= */ void rw_init(krwlock_t *rwlp, char *name, int type, void *arg) { VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL)); rwlp->rw_readers = 0; rwlp->rw_owner = 0; } void rw_destroy(krwlock_t *rwlp) { VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock)); } void rw_enter(krwlock_t *rwlp, krw_t rw) { if (rw == RW_READER) { VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock)); atomic_inc_uint(&rwlp->rw_readers); } else { VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock)); rwlp->rw_owner = pthread_self(); } } void rw_exit(krwlock_t *rwlp) { if (RW_READ_HELD(rwlp)) atomic_dec_uint(&rwlp->rw_readers); else rwlp->rw_owner = 0; VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock)); } int rw_tryenter(krwlock_t *rwlp, krw_t rw) { int error; if (rw == RW_READER) error = pthread_rwlock_tryrdlock(&rwlp->rw_lock); else error = pthread_rwlock_trywrlock(&rwlp->rw_lock); if (error == 0) { if (rw == RW_READER) atomic_inc_uint(&rwlp->rw_readers); else rwlp->rw_owner = pthread_self(); return (1); } VERIFY3S(error, ==, EBUSY); return (0); } /* ARGSUSED */ uint32_t zone_get_hostid(void *zonep) { /* * We're emulating the system's hostid in userland. 
*/ return (strtoul(hw_serial, NULL, 10)); } int rw_tryupgrade(krwlock_t *rwlp) { return (0); } /* * ========================================================================= * condition variables * ========================================================================= */ void cv_init(kcondvar_t *cv, char *name, int type, void *arg) { VERIFY0(pthread_cond_init(cv, NULL)); } void cv_destroy(kcondvar_t *cv) { VERIFY0(pthread_cond_destroy(cv)); } void cv_wait(kcondvar_t *cv, kmutex_t *mp) { memset(&mp->m_owner, 0, sizeof (pthread_t)); VERIFY0(pthread_cond_wait(cv, &mp->m_lock)); mp->m_owner = pthread_self(); } int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp) { cv_wait(cv, mp); return (1); } int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) { int error; struct timeval tv; struct timespec ts; clock_t delta; delta = abstime - ddi_get_lbolt(); if (delta <= 0) return (-1); VERIFY(gettimeofday(&tv, NULL) == 0); ts.tv_sec = tv.tv_sec + delta / hz; ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz); if (ts.tv_nsec >= NANOSEC) { ts.tv_sec++; ts.tv_nsec -= NANOSEC; } memset(&mp->m_owner, 0, sizeof (pthread_t)); error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); mp->m_owner = pthread_self(); if (error == ETIMEDOUT) return (-1); VERIFY0(error); return (1); } /*ARGSUSED*/ int cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { int error; struct timeval tv; struct timespec ts; hrtime_t delta; ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE); delta = tim; if (flag & CALLOUT_FLAG_ABSOLUTE) delta -= gethrtime(); if (delta <= 0) return (-1); VERIFY0(gettimeofday(&tv, NULL)); ts.tv_sec = tv.tv_sec + delta / NANOSEC; ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC); if (ts.tv_nsec >= NANOSEC) { ts.tv_sec++; ts.tv_nsec -= NANOSEC; } memset(&mp->m_owner, 0, sizeof (pthread_t)); error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); mp->m_owner = pthread_self(); if (error == ETIMEDOUT) return (-1); 
VERIFY0(error); return (1); } void cv_signal(kcondvar_t *cv) { VERIFY0(pthread_cond_signal(cv)); } void cv_broadcast(kcondvar_t *cv) { VERIFY0(pthread_cond_broadcast(cv)); } /* * ========================================================================= * procfs list * ========================================================================= */ void seq_printf(struct seq_file *m, const char *fmt, ...) {} void procfs_list_install(const char *module, const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, int (*show)(struct seq_file *f, void *p), int (*show_header)(struct seq_file *f), int (*clear)(procfs_list_t *procfs_list), size_t procfs_list_node_off) { mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&procfs_list->pl_list, procfs_list_node_off + sizeof (procfs_list_node_t), procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); procfs_list->pl_next_id = 1; procfs_list->pl_node_offset = procfs_list_node_off; } void procfs_list_uninstall(procfs_list_t *procfs_list) {} void procfs_list_destroy(procfs_list_t *procfs_list) { ASSERT(list_is_empty(&procfs_list->pl_list)); list_destroy(&procfs_list->pl_list); mutex_destroy(&procfs_list->pl_lock); } #define NODE_ID(procfs_list, obj) \ (((procfs_list_node_t *)(((char *)obj) + \ (procfs_list)->pl_node_offset))->pln_id) void procfs_list_add(procfs_list_t *procfs_list, void *p) { ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; list_insert_tail(&procfs_list->pl_list, p); } /* * ========================================================================= * vnode operations * ========================================================================= */ /* * ========================================================================= * Figure out which debugging statements to print * ========================================================================= */ static char *dprintf_string; static int dprintf_print_all; int 
dprintf_find_string(const char *string) { char *tmp_str = dprintf_string; int len = strlen(string); /* * Find out if this is a string we want to print. * String format: file1.c,function_name1,file2.c,file3.c */ while (tmp_str != NULL) { if (strncmp(tmp_str, string, len) == 0 && (tmp_str[len] == ',' || tmp_str[len] == '\0')) return (1); tmp_str = strchr(tmp_str, ','); if (tmp_str != NULL) tmp_str++; /* Get rid of , */ } return (0); } void dprintf_setup(int *argc, char **argv) { int i, j; /* * Debugging can be specified two ways: by setting the * environment variable ZFS_DEBUG, or by including a * "debug=..." argument on the command line. The command * line setting overrides the environment variable. */ for (i = 1; i < *argc; i++) { int len = strlen("debug="); /* First look for a command line argument */ if (strncmp("debug=", argv[i], len) == 0) { dprintf_string = argv[i] + len; /* Remove from args */ for (j = i; j < *argc; j++) argv[j] = argv[j+1]; argv[j] = NULL; (*argc)--; } } if (dprintf_string == NULL) { /* Look for ZFS_DEBUG environment variable */ dprintf_string = getenv("ZFS_DEBUG"); } /* * Are we just turning on all debugging? */ if (dprintf_find_string("on")) dprintf_print_all = 1; if (dprintf_string != NULL) zfs_flags |= ZFS_DEBUG_DPRINTF; } /* * ========================================================================= * debug printfs * ========================================================================= */ void __dprintf(boolean_t dprint, const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; va_list adx; /* * Get rid of annoying "../common/" prefix to filename. 
*/ newfile = strrchr(file, '/'); if (newfile != NULL) { newfile = newfile + 1; /* Get rid of leading / */ } else { newfile = file; } if (dprint) { /* dprintf messages are printed immediately */ if (!dprintf_print_all && !dprintf_find_string(newfile) && !dprintf_find_string(func)) return; /* Print out just the function name if requested */ flockfile(stdout); if (dprintf_find_string("pid")) (void) printf("%d ", getpid()); if (dprintf_find_string("tid")) (void) printf("%ju ", (uintmax_t)(uintptr_t)pthread_self()); if (dprintf_find_string("cpu")) (void) printf("%u ", getcpuid()); if (dprintf_find_string("time")) (void) printf("%llu ", gethrtime()); if (dprintf_find_string("long")) (void) printf("%s, line %d: ", newfile, line); (void) printf("dprintf: %s: ", func); va_start(adx, fmt); (void) vprintf(fmt, adx); va_end(adx); funlockfile(stdout); } else { /* zfs_dbgmsg is logged for dumping later */ size_t size; char *buf; int i; size = 1024; buf = umem_alloc(size, UMEM_NOFAIL); i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func); if (i < size) { va_start(adx, fmt); (void) vsnprintf(buf + i, size - i, fmt, adx); va_end(adx); } __zfs_dbgmsg(buf); umem_free(buf, size); } } /* * ========================================================================= * cmn_err() and panic() * ========================================================================= */ static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; void vpanic(const char *fmt, va_list adx) { (void) fprintf(stderr, "error: "); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "\n"); abort(); /* think of it as a "user-level crash dump" */ } void panic(const char *fmt, ...) 
{ va_list adx; va_start(adx, fmt); vpanic(fmt, adx); va_end(adx); } void vcmn_err(int ce, const char *fmt, va_list adx) { if (ce == CE_PANIC) vpanic(fmt, adx); if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ (void) fprintf(stderr, "%s", ce_prefix[ce]); (void) vfprintf(stderr, fmt, adx); (void) fprintf(stderr, "%s", ce_suffix[ce]); } } /*PRINTFLIKE2*/ void cmn_err(int ce, const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(ce, fmt, adx); va_end(adx); } /* * ========================================================================= * misc routines * ========================================================================= */ void delay(clock_t ticks) { (void) poll(0, 0, ticks * (1000 / hz)); } /* * Find highest one bit set. * Returns bit number + 1 of highest bit that is set, otherwise returns 0. * The __builtin_clzll() function is supported by both GCC and Clang. */ int highbit64(uint64_t i) { if (i == 0) return (0); return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); } /* * Find lowest one bit set. * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. * The __builtin_ffsll() function is supported by both GCC and Clang. 
*/ int lowbit64(uint64_t i) { if (i == 0) return (0); return (__builtin_ffsll(i)); } const char *random_path = "/dev/random"; const char *urandom_path = "/dev/urandom"; static int random_fd = -1, urandom_fd = -1; void random_init(void) { VERIFY((random_fd = open(random_path, O_RDONLY | O_CLOEXEC)) != -1); VERIFY((urandom_fd = open(urandom_path, O_RDONLY | O_CLOEXEC)) != -1); } void random_fini(void) { close(random_fd); close(urandom_fd); random_fd = -1; urandom_fd = -1; } static int random_get_bytes_common(uint8_t *ptr, size_t len, int fd) { size_t resid = len; ssize_t bytes; ASSERT(fd != -1); while (resid != 0) { bytes = read(fd, ptr, resid); ASSERT3S(bytes, >=, 0); ptr += bytes; resid -= bytes; } return (0); } int random_get_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, random_fd)); } int random_get_pseudo_bytes(uint8_t *ptr, size_t len) { return (random_get_bytes_common(ptr, len, urandom_fd)); } int ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result) { char *end; *result = strtoul(hw_serial, &end, base); if (*result == 0) return (errno); return (0); } int ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) { char *end; *result = strtoull(str, &end, base); if (*result == 0) return (errno); return (0); } utsname_t * utsname(void) { return (&hw_utsname); } /* * ========================================================================= * kernel emulation setup & teardown * ========================================================================= */ static int umem_out_of_memory(void) { char errmsg[] = "out of memory -- generating core dump\n"; (void) fprintf(stderr, "%s", errmsg); abort(); return (0); } void kernel_init(int mode) { extern uint_t rrw_tsd_key; umem_nofail_callback(umem_out_of_memory); physmem = sysconf(_SC_PHYS_PAGES); dprintf("physmem = %llu pages (%.2f GB)\n", physmem, (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); (void) snprintf(hw_serial, sizeof 
(hw_serial), "%ld", (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0); random_init(); VERIFY0(uname(&hw_utsname)); system_taskq_init(); icp_init(); zstd_init(); spa_init((spa_mode_t)mode); fletcher_4_init(); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); } void kernel_fini(void) { fletcher_4_fini(); spa_fini(); zstd_fini(); icp_fini(); system_taskq_fini(); random_fini(); } uid_t crgetuid(cred_t *cr) { return (0); } uid_t crgetruid(cred_t *cr) { return (0); } gid_t crgetgid(cred_t *cr) { return (0); } int crgetngroups(cred_t *cr) { return (0); } gid_t * crgetgroups(cred_t *cr) { return (NULL); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (0); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { return (0); } int secpolicy_zfs(const cred_t *cr) { return (0); } int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc) { return (0); } ksiddomain_t * ksid_lookupdomain(const char *dom) { ksiddomain_t *kd; kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); kd->kd_name = spa_strdup(dom); return (kd); } void ksiddomain_rele(ksiddomain_t *ksid) { spa_strfree(ksid->kd_name); umem_free(ksid, sizeof (ksiddomain_t)); } char * kmem_vasprintf(const char *fmt, va_list adx) { char *buf = NULL; va_list adx_copy; va_copy(adx_copy, adx); VERIFY(vasprintf(&buf, fmt, adx_copy) != -1); va_end(adx_copy); return (buf); } char * kmem_asprintf(const char *fmt, ...) 
{ char *buf = NULL; va_list adx; va_start(adx, fmt); VERIFY(vasprintf(&buf, fmt, adx) != -1); va_end(adx); return (buf); } /* ARGSUSED */ int zfs_onexit_fd_hold(int fd, minor_t *minorp) { *minorp = 0; return (0); } /* ARGSUSED */ void zfs_onexit_fd_rele(int fd) { } /* ARGSUSED */ int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, uint64_t *action_handle) { return (0); } fstrans_cookie_t spl_fstrans_mark(void) { return ((fstrans_cookie_t)0); } void spl_fstrans_unmark(fstrans_cookie_t cookie) { } int __spl_pf_fstrans_check(void) { return (0); } int kmem_cache_reap_active(void) { return (0); } void *zvol_tag = "zvol_tag"; void zvol_create_minor(const char *name) { } void zvol_create_minors_recursive(const char *name) { } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { } void zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, boolean_t async) { } /* * Open file * * path - fully qualified path to file * flags - file attributes O_READ / O_WRITE / O_EXCL * fpp - pointer to return file pointer * * Returns 0 on success underlying error on failure. 
*/ int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) { int fd = -1; int dump_fd = -1; int err; int old_umask = 0; zfs_file_t *fp; struct stat64 st; if (!(flags & O_CREAT) && stat64(path, &st) == -1) return (errno); if (!(flags & O_CREAT) && S_ISBLK(st.st_mode)) flags |= O_DIRECT; if (flags & O_CREAT) old_umask = umask(0); fd = open64(path, flags, mode); if (fd == -1) return (errno); if (flags & O_CREAT) (void) umask(old_umask); if (vn_dumpdir != NULL) { char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); char *inpath = basename((char *)(uintptr_t)path); (void) snprintf(dumppath, MAXPATHLEN, "%s/%s", vn_dumpdir, inpath); dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); umem_free(dumppath, MAXPATHLEN); if (dump_fd == -1) { err = errno; close(fd); return (err); } } else { dump_fd = -1; } (void) fcntl(fd, F_SETFD, FD_CLOEXEC); fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL); fp->f_fd = fd; fp->f_dump_fd = dump_fd; *fpp = fp; return (0); } void zfs_file_close(zfs_file_t *fp) { close(fp->f_fd); if (fp->f_dump_fd != -1) close(fp->f_dump_fd); umem_free(fp, sizeof (zfs_file_t)); } /* * Stateful write - use os internal file pointer to determine where to * write and update on successful completion. * * fp - pointer to file (pipe, socket, etc) to write to * buf - buffer to write * count - # of bytes to write * resid - pointer to count of unwritten bytes (if short write) * * Returns 0 on success errno on failure. */ int zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) { ssize_t rc; rc = write(fp->f_fd, buf, count); if (rc < 0) return (errno); if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * Stateless write - os internal file pointer is not updated. 
* * fp - pointer to file (pipe, socket, etc) to write to * buf - buffer to write * count - # of bytes to write * off - file offset to write to (only valid for seekable types) * resid - pointer to count of unwritten bytes * * Returns 0 on success errno on failure. */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t pos, ssize_t *resid) { ssize_t rc, split, done; int sectors; /* * To simulate partial disk writes, we split writes into two * system calls so that the process can be killed in between. * This is used by ztest to simulate realistic failure modes. */ sectors = count >> SPA_MINBLOCKSHIFT; split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT; rc = pwrite64(fp->f_fd, buf, split, pos); if (rc != -1) { done = rc; rc = pwrite64(fp->f_fd, (char *)buf + split, count - split, pos + split); } #ifdef __linux__ if (rc == -1 && errno == EINVAL) { /* * Under Linux, this most likely means an alignment issue * (memory or disk) due to O_DIRECT, so we abort() in order * to catch the offender. */ abort(); } #endif if (rc < 0) return (errno); done += rc; if (resid) { *resid = count - done; } else if (done != count) { return (EIO); } return (0); } /* * Stateful read - use os internal file pointer to determine where to * read and update on successful completion. * * fp - pointer to file (pipe, socket, etc) to read from * buf - buffer to write * count - # of bytes to read * resid - pointer to count of unread bytes (if short read) * * Returns 0 on success errno on failure. */ int zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) { int rc; rc = read(fp->f_fd, buf, count); if (rc < 0) return (errno); if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * Stateless read - os internal file pointer is not updated. 
* * fp - pointer to file (pipe, socket, etc) to read from * buf - buffer to write * count - # of bytes to write * off - file offset to read from (only valid for seekable types) * resid - pointer to count of unwritten bytes (if short write) * * Returns 0 on success errno on failure. */ int zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, ssize_t *resid) { ssize_t rc; rc = pread64(fp->f_fd, buf, count, off); if (rc < 0) { #ifdef __linux__ /* * Under Linux, this most likely means an alignment issue * (memory or disk) due to O_DIRECT, so we abort() in order to * catch the offender. */ if (errno == EINVAL) abort(); #endif return (errno); } if (fp->f_dump_fd != -1) { int status; status = pwrite64(fp->f_dump_fd, buf, rc, off); ASSERT(status != -1); } if (resid) { *resid = count - rc; } else if (rc != count) { return (EIO); } return (0); } /* * lseek - set / get file pointer * * fp - pointer to file (pipe, socket, etc) to read from * offp - value to seek to, returns current value plus passed offset * whence - see man pages for standard lseek whence values * * Returns 0 on success errno on failure (ESPIPE for non seekable types) */ int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) { loff_t rc; rc = lseek(fp->f_fd, *offp, whence); if (rc < 0) return (errno); *offp = rc; return (0); } /* * Get file attributes * * filp - file pointer * zfattr - pointer to file attr structure * * Currently only used for fetching size and file mode * * Returns 0 on success or error code of underlying getattr call on failure. */ int zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) { struct stat64 st; if (fstat64_blk(fp->f_fd, &st) == -1) return (errno); zfattr->zfa_size = st.st_size; zfattr->zfa_mode = st.st_mode; return (0); } /* * Sync file to disk * * filp - file pointer * flags - O_SYNC and or O_DSYNC * * Returns 0 on success or error code of underlying sync call on failure. 
*/ int zfs_file_fsync(zfs_file_t *fp, int flags) { int rc; rc = fsync(fp->f_fd); if (rc < 0) return (errno); return (0); } /* * fallocate - allocate or free space on disk * * fp - file pointer * mode (non-standard options for hole punching etc) * offset - offset to start allocating or freeing from * len - length to free / allocate * * OPTIONAL */ int zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) { #ifdef __linux__ return (fallocate(fp->f_fd, mode, offset, len)); #else return (EOPNOTSUPP); #endif } /* * Request current file pointer offset * * fp - pointer to file * * Returns current file offset. */ loff_t zfs_file_off(zfs_file_t *fp) { return (lseek(fp->f_fd, SEEK_CUR, 0)); } /* * unlink file * * path - fully qualified file path * * Returns 0 on success. * * OPTIONAL */ int zfs_file_unlink(const char *path) { return (remove(path)); } /* * Get reference to file pointer * * fd - input file descriptor * fpp - pointer to file pointer * * Returns 0 on success EBADF on failure. * Unsupported in user space. */ int zfs_file_get(int fd, zfs_file_t **fpp) { abort(); return (EOPNOTSUPP); } /* * Drop reference to file pointer * * fd - input file descriptor * * Unsupported in user space. */ void zfs_file_put(int fd) { abort(); } void zfsvfs_update_fromname(const char *oldname, const char *newname) { } diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c index e591921ace1b..059ada235c4a 100644 --- a/module/os/freebsd/spl/spl_kstat.c +++ b/module/os/freebsd/spl/spl_kstat.c @@ -1,572 +1,510 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Links to Illumos.org for more information on kstat function: * [1] https://illumos.org/man/1M/kstat * [2] https://illumos.org/man/9f/kstat_create */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics"); SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics"); void __kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) { ksp->ks_raw_ops.headers = headers; ksp->ks_raw_ops.data = data; ksp->ks_raw_ops.addr = addr; } void __kstat_set_seq_raw_ops(kstat_t *ksp, int (*headers)(struct seq_file *f), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) { ksp->ks_raw_ops.seq_headers = headers; ksp->ks_raw_ops.data = data; ksp->ks_raw_ops.addr = addr; } static int kstat_default_update(kstat_t *ksp, int rw) { ASSERT3P(ksp, !=, NULL); if (rw == 
KSTAT_WRITE) return (EACCES); return (0); } static int kstat_resize_raw(kstat_t *ksp) { if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) return (ENOMEM); free(ksp->ks_raw_buf, M_TEMP); ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); ksp->ks_raw_buf = malloc(ksp->ks_raw_bufsize, M_TEMP, M_WAITOK); return (0); } static void * kstat_raw_default_addr(kstat_t *ksp, loff_t n) { if (n == 0) return (ksp->ks_data); return (NULL); } static int kstat_sysctl(SYSCTL_HANDLER_ARGS) { kstat_t *ksp = arg1; kstat_named_t *ksent; uint64_t val; ksent = ksp->ks_data; /* Select the correct element */ ksent += arg2; /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); val = ksent->value.ui64; return (sysctl_handle_64(oidp, &val, 0, req)); } static int kstat_sysctl_string(SYSCTL_HANDLER_ARGS) { kstat_t *ksp = arg1; kstat_named_t *ksent = ksp->ks_data; char *val; uint32_t len = 0; /* Select the correct element */ ksent += arg2; /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); val = KSTAT_NAMED_STR_PTR(ksent); len = KSTAT_NAMED_STR_BUFLEN(ksent); val[len-1] = '\0'; return (sysctl_handle_string(oidp, val, len, req)); } static int kstat_sysctl_io(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; kstat_t *ksp = arg1; kstat_io_t *kip = ksp->ks_data; int rc; sb = sbuf_new_auto(); if (sb == NULL) return (ENOMEM); /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); /* though wlentime & friends are signed, they will never be negative */ sbuf_printf(sb, "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", kip->nread, kip->nwritten, kip->reads, kip->writes, kip->wtime, kip->wlentime, kip->wlastupdate, kip->rtime, kip->rlentime, kip->rlastupdate, kip->wcnt, kip->rcnt); rc = sbuf_finish(sb); if (rc == 0) rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); sbuf_delete(sb); return (rc); } static int kstat_sysctl_raw(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; void *data; kstat_t *ksp = arg1; 
void *(*addr_op)(kstat_t *ksp, loff_t index); int n, has_header, rc = 0; sb = sbuf_new_auto(); if (sb == NULL) return (ENOMEM); if (ksp->ks_raw_ops.addr) addr_op = ksp->ks_raw_ops.addr; else addr_op = kstat_raw_default_addr; mutex_enter(ksp->ks_lock); /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); ksp->ks_raw_bufsize = PAGE_SIZE; ksp->ks_raw_buf = malloc(PAGE_SIZE, M_TEMP, M_WAITOK); n = 0; has_header = (ksp->ks_raw_ops.headers || ksp->ks_raw_ops.seq_headers); restart_headers: if (ksp->ks_raw_ops.headers) { rc = ksp->ks_raw_ops.headers( ksp->ks_raw_buf, ksp->ks_raw_bufsize); } else if (ksp->ks_raw_ops.seq_headers) { struct seq_file f; f.sf_buf = ksp->ks_raw_buf; f.sf_size = ksp->ks_raw_bufsize; rc = ksp->ks_raw_ops.seq_headers(&f); } if (has_header) { if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart_headers; if (rc == 0) sbuf_printf(sb, "\n%s", ksp->ks_raw_buf); } while ((data = addr_op(ksp, n)) != NULL) { restart: if (ksp->ks_raw_ops.data) { rc = ksp->ks_raw_ops.data(ksp->ks_raw_buf, ksp->ks_raw_bufsize, data); if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (rc == 0) sbuf_printf(sb, "%s", ksp->ks_raw_buf); } else { ASSERT3U(ksp->ks_ndata, ==, 1); sbuf_hexdump(sb, ksp->ks_data, ksp->ks_data_size, NULL, 0); } n++; } free(ksp->ks_raw_buf, M_TEMP); mutex_exit(ksp->ks_lock); sbuf_trim(sb); rc = sbuf_finish(sb); if (rc == 0) rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); sbuf_delete(sb); return (rc); } kstat_t * __kstat_create(const char *module, int instance, const char *name, const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags) { char buf[KSTAT_STRLEN]; struct sysctl_oid *root; kstat_t *ksp; char *pool; KASSERT(instance == 0, ("instance=%d", instance)); if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) ASSERT3U(ks_ndata, ==, 1); if (class == NULL) class = "misc"; /* * Allocate the main structure. 
We don't need to keep a copy of * module in here, because it is only used for sysctl node creation * done in this function. */ ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO); ksp->ks_crtime = gethrtime(); ksp->ks_snaptime = ksp->ks_crtime; ksp->ks_instance = instance; (void) strlcpy(ksp->ks_name, name, KSTAT_STRLEN); (void) strlcpy(ksp->ks_class, class, KSTAT_STRLEN); ksp->ks_type = ks_type; ksp->ks_flags = flags; ksp->ks_update = kstat_default_update; mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); ksp->ks_lock = &ksp->ks_private_lock; switch (ksp->ks_type) { case KSTAT_TYPE_RAW: ksp->ks_ndata = 1; ksp->ks_data_size = ks_ndata; break; case KSTAT_TYPE_NAMED: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); break; case KSTAT_TYPE_INTR: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); break; case KSTAT_TYPE_IO: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); break; case KSTAT_TYPE_TIMER: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); break; default: panic("Undefined kstat type %d\n", ksp->ks_type); } if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) ksp->ks_data = NULL; else ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); /* * Some kstats use a module name like "zfs/poolname" to distinguish a * set of kstats belonging to a specific pool. Split on '/' to add an * extra node for the pool name if needed. */ (void) strlcpy(buf, module, KSTAT_STRLEN); module = buf; pool = strchr(module, '/'); if (pool != NULL) *pool++ = '\0'; /* * Create sysctl tree for those statistics: * * kstat.[.].. 
*/ sysctl_ctx_init(&ksp->ks_sysctl_ctx); root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0, ""); if (root == NULL) { printf("%s: Cannot create kstat.%s tree!\n", __func__, module); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } if (pool != NULL) { root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, pool, CTLFLAG_RW, 0, ""); if (root == NULL) { printf("%s: Cannot create kstat.%s.%s tree!\n", __func__, module, pool); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } } root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, class, CTLFLAG_RW, 0, ""); if (root == NULL) { if (pool != NULL) printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__, module, pool, class); else printf("%s: Cannot create kstat.%s.%s tree!\n", __func__, module, class); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } if (ksp->ks_type == KSTAT_TYPE_NAMED) { root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), OID_AUTO, name, CTLFLAG_RW, 0, ""); if (root == NULL) { if (pool != NULL) printf("%s: Cannot create kstat.%s.%s.%s.%s " "tree!\n", __func__, module, pool, class, name); else printf("%s: Cannot create kstat.%s.%s.%s " "tree!\n", __func__, module, class, name); sysctl_ctx_free(&ksp->ks_sysctl_ctx); free(ksp, M_KSTAT); return (NULL); } } ksp->ks_sysctl_root = root; return (ksp); } static void kstat_install_named(kstat_t *ksp) { kstat_named_t *ksent; char *namelast; int typelast; ksent = ksp->ks_data; VERIFY((ksp->ks_flags & KSTAT_FLAG_VIRTUAL) || ksent != NULL); typelast = 0; namelast = NULL; for (int i = 0; i < ksp->ks_ndata; i++, ksent++) { if (ksent->data_type != 0) { typelast = ksent->data_type; namelast = ksent->name; } switch (typelast) { case KSTAT_DATA_CHAR: /* Not Implemented */ break; case KSTAT_DATA_INT32: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), 
OID_AUTO, namelast, CTLTYPE_S32 | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "I", namelast); break; case KSTAT_DATA_UINT32: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_U32 | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "IU", namelast); break; case KSTAT_DATA_INT64: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "Q", namelast); break; case KSTAT_DATA_UINT64: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "QU", namelast); break; case KSTAT_DATA_LONG: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "L", namelast); break; case KSTAT_DATA_ULONG: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl, "LU", namelast); break; case KSTAT_DATA_STRING: SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, namelast, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, i, kstat_sysctl_string, "A", namelast); break; default: panic("unsupported type: %d", typelast); } } } void kstat_install(kstat_t *ksp) { struct sysctl_oid *root; if (ksp->ks_ndata == UINT32_MAX) VERIFY3U(ksp->ks_type, ==, KSTAT_TYPE_RAW); switch (ksp->ks_type) { case KSTAT_TYPE_NAMED: return (kstat_install_named(ksp)); case KSTAT_TYPE_RAW: if (ksp->ks_raw_ops.data) { root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, ksp->ks_name, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, ksp, 0, kstat_sysctl_raw, "A", ksp->ks_name); } else { root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, 
ksp->ks_name, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, ksp, 0, kstat_sysctl_raw, "", ksp->ks_name); } break; case KSTAT_TYPE_IO: root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(ksp->ks_sysctl_root), OID_AUTO, ksp->ks_name, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, ksp, 0, kstat_sysctl_io, "A", ksp->ks_name); break; case KSTAT_TYPE_TIMER: case KSTAT_TYPE_INTR: default: panic("unsupported kstat type %d\n", ksp->ks_type); } VERIFY3P(root, !=, NULL); ksp->ks_sysctl_root = root; } void kstat_delete(kstat_t *ksp) { sysctl_ctx_free(&ksp->ks_sysctl_ctx); ksp->ks_lock = NULL; mutex_destroy(&ksp->ks_private_lock); if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) kmem_free(ksp->ks_data, ksp->ks_data_size); free(ksp, M_KSTAT); } - -void -kstat_waitq_enter(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t wcnt; - - new = gethrtime(); - delta = new - kiop->wlastupdate; - kiop->wlastupdate = new; - wcnt = kiop->wcnt++; - if (wcnt != 0) { - kiop->wlentime += delta * wcnt; - kiop->wtime += delta; - } -} - -void -kstat_waitq_exit(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t wcnt; - - new = gethrtime(); - delta = new - kiop->wlastupdate; - kiop->wlastupdate = new; - wcnt = kiop->wcnt--; - ASSERT3S(wcnt, >, 0); - kiop->wlentime += delta * wcnt; - kiop->wtime += delta; -} - -void -kstat_runq_enter(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t rcnt; - - new = gethrtime(); - delta = new - kiop->rlastupdate; - kiop->rlastupdate = new; - rcnt = kiop->rcnt++; - if (rcnt != 0) { - kiop->rlentime += delta * rcnt; - kiop->rtime += delta; - } -} - -void -kstat_runq_exit(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t rcnt; - - new = gethrtime(); - delta = new - kiop->rlastupdate; - kiop->rlastupdate = new; - rcnt = kiop->rcnt--; - ASSERT3S(rcnt, >, 0); - kiop->rlentime += delta * rcnt; - kiop->rtime += delta; -} diff --git a/module/os/linux/spl/spl-kstat.c b/module/os/linux/spl/spl-kstat.c index c7f1aadf784e..0c46708326d8 100644 --- 
a/module/os/linux/spl/spl-kstat.c +++ b/module/os/linux/spl/spl-kstat.c @@ -1,781 +1,715 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Kstat Implementation. 
* * Links to Illumos.org for more information on kstat function: * [1] https://illumos.org/man/1M/kstat * [2] https://illumos.org/man/9f/kstat_create */ #include #include #include #include #include static kmutex_t kstat_module_lock; static struct list_head kstat_module_list; static kid_t kstat_id; static int kstat_resize_raw(kstat_t *ksp) { if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) return (ENOMEM); vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); return (0); } -void -kstat_waitq_enter(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t wcnt; - - new = gethrtime(); - delta = new - kiop->wlastupdate; - kiop->wlastupdate = new; - wcnt = kiop->wcnt++; - if (wcnt != 0) { - kiop->wlentime += delta * wcnt; - kiop->wtime += delta; - } -} -EXPORT_SYMBOL(kstat_waitq_enter); - -void -kstat_waitq_exit(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t wcnt; - - new = gethrtime(); - delta = new - kiop->wlastupdate; - kiop->wlastupdate = new; - wcnt = kiop->wcnt--; - ASSERT((int)wcnt > 0); - kiop->wlentime += delta * wcnt; - kiop->wtime += delta; -} -EXPORT_SYMBOL(kstat_waitq_exit); - -void -kstat_runq_enter(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t rcnt; - - new = gethrtime(); - delta = new - kiop->rlastupdate; - kiop->rlastupdate = new; - rcnt = kiop->rcnt++; - if (rcnt != 0) { - kiop->rlentime += delta * rcnt; - kiop->rtime += delta; - } -} -EXPORT_SYMBOL(kstat_runq_enter); - -void -kstat_runq_exit(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t rcnt; - - new = gethrtime(); - delta = new - kiop->rlastupdate; - kiop->rlastupdate = new; - rcnt = kiop->rcnt--; - ASSERT((int)rcnt > 0); - kiop->rlentime += delta * rcnt; - kiop->rtime += delta; -} -EXPORT_SYMBOL(kstat_runq_exit); - static int kstat_seq_show_headers(struct seq_file *f) { kstat_t *ksp = (kstat_t *)f->private; int rc = 0; ASSERT(ksp->ks_magic == KS_MAGIC); seq_printf(f, 
"%d %d 0x%02x %d %d %lld %lld\n", ksp->ks_kid, ksp->ks_type, ksp->ks_flags, ksp->ks_ndata, (int)ksp->ks_data_size, ksp->ks_crtime, ksp->ks_snaptime); switch (ksp->ks_type) { case KSTAT_TYPE_RAW: restart: if (ksp->ks_raw_ops.headers) { rc = ksp->ks_raw_ops.headers( ksp->ks_raw_buf, ksp->ks_raw_bufsize); if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (!rc) seq_puts(f, ksp->ks_raw_buf); } else { seq_printf(f, "raw data\n"); } break; case KSTAT_TYPE_NAMED: seq_printf(f, "%-31s %-4s %s\n", "name", "type", "data"); break; case KSTAT_TYPE_INTR: seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n", "hard", "soft", "watchdog", "spurious", "multsvc"); break; case KSTAT_TYPE_IO: seq_printf(f, "%-8s %-8s %-8s %-8s %-8s %-8s " "%-8s %-8s %-8s %-8s %-8s %-8s\n", "nread", "nwritten", "reads", "writes", "wtime", "wlentime", "wupdate", "rtime", "rlentime", "rupdate", "wcnt", "rcnt"); break; case KSTAT_TYPE_TIMER: seq_printf(f, "%-31s %-8s " "%-8s %-8s %-8s %-8s %-8s\n", "name", "events", "elapsed", "min", "max", "start", "stop"); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } return (-rc); } static int kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l) { int i, j; for (i = 0; ; i++) { seq_printf(f, "%03x:", i); for (j = 0; j < 16; j++) { if (i * 16 + j >= l) { seq_printf(f, "\n"); goto out; } seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]); } seq_printf(f, "\n"); } out: return (0); } static int kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp) { seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type); switch (knp->data_type) { case KSTAT_DATA_CHAR: knp->value.c[15] = '\0'; /* NULL terminate */ seq_printf(f, "%-16s", knp->value.c); break; /* * NOTE - We need to be more careful able what tokens are * used for each arch, for now this is correct for x86_64. 
*/ case KSTAT_DATA_INT32: seq_printf(f, "%d", knp->value.i32); break; case KSTAT_DATA_UINT32: seq_printf(f, "%u", knp->value.ui32); break; case KSTAT_DATA_INT64: seq_printf(f, "%lld", (signed long long)knp->value.i64); break; case KSTAT_DATA_UINT64: seq_printf(f, "%llu", (unsigned long long)knp->value.ui64); break; case KSTAT_DATA_LONG: seq_printf(f, "%ld", knp->value.l); break; case KSTAT_DATA_ULONG: seq_printf(f, "%lu", knp->value.ul); break; case KSTAT_DATA_STRING: KSTAT_NAMED_STR_PTR(knp) [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0'; seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp)); break; default: PANIC("Undefined kstat data type %d\n", knp->data_type); } seq_printf(f, "\n"); return (0); } static int kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip) { seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n", kip->intrs[KSTAT_INTR_HARD], kip->intrs[KSTAT_INTR_SOFT], kip->intrs[KSTAT_INTR_WATCHDOG], kip->intrs[KSTAT_INTR_SPURIOUS], kip->intrs[KSTAT_INTR_MULTSVC]); return (0); } static int kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip) { /* though wlentime & friends are signed, they will never be negative */ seq_printf(f, "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", kip->nread, kip->nwritten, kip->reads, kip->writes, kip->wtime, kip->wlentime, kip->wlastupdate, kip->rtime, kip->rlentime, kip->rlastupdate, kip->wcnt, kip->rcnt); return (0); } static int kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) { seq_printf(f, "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n", ktp->name, ktp->num_events, ktp->elapsed_time, ktp->min_time, ktp->max_time, ktp->start_time, ktp->stop_time); return (0); } static int kstat_seq_show(struct seq_file *f, void *p) { kstat_t *ksp = (kstat_t *)f->private; int rc = 0; ASSERT(ksp->ks_magic == KS_MAGIC); switch (ksp->ks_type) { case KSTAT_TYPE_RAW: restart: if (ksp->ks_raw_ops.data) { rc = ksp->ks_raw_ops.data( ksp->ks_raw_buf, ksp->ks_raw_bufsize, p); if (rc == ENOMEM && 
!kstat_resize_raw(ksp)) goto restart; if (!rc) seq_puts(f, ksp->ks_raw_buf); } else { ASSERT(ksp->ks_ndata == 1); rc = kstat_seq_show_raw(f, ksp->ks_data, ksp->ks_data_size); } break; case KSTAT_TYPE_NAMED: rc = kstat_seq_show_named(f, (kstat_named_t *)p); break; case KSTAT_TYPE_INTR: rc = kstat_seq_show_intr(f, (kstat_intr_t *)p); break; case KSTAT_TYPE_IO: rc = kstat_seq_show_io(f, (kstat_io_t *)p); break; case KSTAT_TYPE_TIMER: rc = kstat_seq_show_timer(f, (kstat_timer_t *)p); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } return (-rc); } static int kstat_default_update(kstat_t *ksp, int rw) { ASSERT(ksp != NULL); if (rw == KSTAT_WRITE) return (EACCES); return (0); } static void * kstat_seq_data_addr(kstat_t *ksp, loff_t n) { void *rc = NULL; switch (ksp->ks_type) { case KSTAT_TYPE_RAW: if (ksp->ks_raw_ops.addr) rc = ksp->ks_raw_ops.addr(ksp, n); else rc = ksp->ks_data; break; case KSTAT_TYPE_NAMED: rc = ksp->ks_data + n * sizeof (kstat_named_t); break; case KSTAT_TYPE_INTR: rc = ksp->ks_data + n * sizeof (kstat_intr_t); break; case KSTAT_TYPE_IO: rc = ksp->ks_data + n * sizeof (kstat_io_t); break; case KSTAT_TYPE_TIMER: rc = ksp->ks_data + n * sizeof (kstat_timer_t); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } return (rc); } static void * kstat_seq_start(struct seq_file *f, loff_t *pos) { loff_t n = *pos; kstat_t *ksp = (kstat_t *)f->private; ASSERT(ksp->ks_magic == KS_MAGIC); mutex_enter(ksp->ks_lock); if (ksp->ks_type == KSTAT_TYPE_RAW) { ksp->ks_raw_bufsize = PAGE_SIZE; ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); } /* Dynamically update kstat, on error existing kstats are used */ (void) ksp->ks_update(ksp, KSTAT_READ); ksp->ks_snaptime = gethrtime(); if (!(ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) && !n && kstat_seq_show_headers(f)) return (NULL); if (n >= ksp->ks_ndata) return (NULL); return (kstat_seq_data_addr(ksp, n)); } static void * kstat_seq_next(struct seq_file *f, void *p, loff_t *pos) { 
kstat_t *ksp = (kstat_t *)f->private; ASSERT(ksp->ks_magic == KS_MAGIC); ++*pos; if (*pos >= ksp->ks_ndata) return (NULL); return (kstat_seq_data_addr(ksp, *pos)); } static void kstat_seq_stop(struct seq_file *f, void *v) { kstat_t *ksp = (kstat_t *)f->private; ASSERT(ksp->ks_magic == KS_MAGIC); if (ksp->ks_type == KSTAT_TYPE_RAW) vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); mutex_exit(ksp->ks_lock); } static struct seq_operations kstat_seq_ops = { .show = kstat_seq_show, .start = kstat_seq_start, .next = kstat_seq_next, .stop = kstat_seq_stop, }; static kstat_module_t * kstat_find_module(char *name) { kstat_module_t *module = NULL; list_for_each_entry(module, &kstat_module_list, ksm_module_list) { if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0) return (module); } return (NULL); } static kstat_module_t * kstat_create_module(char *name) { kstat_module_t *module; struct proc_dir_entry *pde; pde = proc_mkdir(name, proc_spl_kstat); if (pde == NULL) return (NULL); module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); module->ksm_proc = pde; strlcpy(module->ksm_name, name, KSTAT_STRLEN+1); INIT_LIST_HEAD(&module->ksm_kstat_list); list_add_tail(&module->ksm_module_list, &kstat_module_list); return (module); } static void kstat_delete_module(kstat_module_t *module) { ASSERT(list_empty(&module->ksm_kstat_list)); remove_proc_entry(module->ksm_name, proc_spl_kstat); list_del(&module->ksm_module_list); kmem_free(module, sizeof (kstat_module_t)); } static int proc_kstat_open(struct inode *inode, struct file *filp) { struct seq_file *f; int rc; rc = seq_open(filp, &kstat_seq_ops); if (rc) return (rc); f = filp->private_data; f->private = PDE_DATA(inode); return (0); } static ssize_t proc_kstat_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct seq_file *f = filp->private_data; kstat_t *ksp = f->private; int rc; ASSERT(ksp->ks_magic == KS_MAGIC); mutex_enter(ksp->ks_lock); rc = ksp->ks_update(ksp, KSTAT_WRITE); 
mutex_exit(ksp->ks_lock); if (rc) return (-rc); *ppos += len; return (len); } static const kstat_proc_op_t proc_kstat_operations = { #ifdef HAVE_PROC_OPS_STRUCT .proc_open = proc_kstat_open, .proc_write = proc_kstat_write, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, #else .open = proc_kstat_open, .write = proc_kstat_write, .read = seq_read, .llseek = seq_lseek, .release = seq_release, #endif }; void __kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)) { ksp->ks_raw_ops.headers = headers; ksp->ks_raw_ops.data = data; ksp->ks_raw_ops.addr = addr; } EXPORT_SYMBOL(__kstat_set_raw_ops); void kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, const char *name) { kpep->kpe_owner = NULL; kpep->kpe_proc = NULL; INIT_LIST_HEAD(&kpep->kpe_list); strncpy(kpep->kpe_module, module, KSTAT_STRLEN); strncpy(kpep->kpe_name, name, KSTAT_STRLEN); } EXPORT_SYMBOL(kstat_proc_entry_init); kstat_t * __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags) { kstat_t *ksp; ASSERT(ks_module); ASSERT(ks_instance == 0); ASSERT(ks_name); if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) ASSERT(ks_ndata == 1); ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP); if (ksp == NULL) return (ksp); mutex_enter(&kstat_module_lock); ksp->ks_kid = kstat_id; kstat_id++; mutex_exit(&kstat_module_lock); ksp->ks_magic = KS_MAGIC; mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); ksp->ks_lock = &ksp->ks_private_lock; ksp->ks_crtime = gethrtime(); ksp->ks_snaptime = ksp->ks_crtime; ksp->ks_instance = ks_instance; strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); ksp->ks_type = ks_type; ksp->ks_flags = ks_flags; ksp->ks_update = kstat_default_update; ksp->ks_private = NULL; ksp->ks_raw_ops.headers = NULL; ksp->ks_raw_ops.data = NULL; 
ksp->ks_raw_ops.addr = NULL; ksp->ks_raw_buf = NULL; ksp->ks_raw_bufsize = 0; kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name); switch (ksp->ks_type) { case KSTAT_TYPE_RAW: ksp->ks_ndata = 1; ksp->ks_data_size = ks_ndata; break; case KSTAT_TYPE_NAMED: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); break; case KSTAT_TYPE_INTR: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); break; case KSTAT_TYPE_IO: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); break; case KSTAT_TYPE_TIMER: ksp->ks_ndata = ks_ndata; ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); break; default: PANIC("Undefined kstat type %d\n", ksp->ks_type); } if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { ksp->ks_data = NULL; } else { ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); if (ksp->ks_data == NULL) { kmem_free(ksp, sizeof (*ksp)); ksp = NULL; } } return (ksp); } EXPORT_SYMBOL(__kstat_create); static int kstat_detect_collision(kstat_proc_entry_t *kpep) { kstat_module_t *module; kstat_proc_entry_t *tmp = NULL; char *parent; char *cp; parent = kmem_asprintf("%s", kpep->kpe_module); if ((cp = strrchr(parent, '/')) == NULL) { kmem_strfree(parent); return (0); } cp[0] = '\0'; if ((module = kstat_find_module(parent)) != NULL) { list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) { kmem_strfree(parent); return (EEXIST); } } } kmem_strfree(parent); return (0); } /* * Add a file to the proc filesystem under the kstat namespace (i.e. * /proc/spl/kstat/). The file need not necessarily be implemented as a * kstat. 
*/ void kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, const kstat_proc_op_t *proc_ops, void *data) { kstat_module_t *module; kstat_proc_entry_t *tmp = NULL; ASSERT(kpep); mutex_enter(&kstat_module_lock); module = kstat_find_module(kpep->kpe_module); if (module == NULL) { if (kstat_detect_collision(kpep) != 0) { cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \ " collision", kpep->kpe_module, kpep->kpe_name); goto out; } module = kstat_create_module(kpep->kpe_module); if (module == NULL) goto out; } /* * Only one entry by this name per-module, on failure the module * shouldn't be deleted because we know it has at least one entry. */ list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0) goto out; } list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list); kpep->kpe_owner = module; kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode, module->ksm_proc, proc_ops, data); if (kpep->kpe_proc == NULL) { list_del_init(&kpep->kpe_list); if (list_empty(&module->ksm_kstat_list)) kstat_delete_module(module); } out: mutex_exit(&kstat_module_lock); } EXPORT_SYMBOL(kstat_proc_entry_install); void __kstat_install(kstat_t *ksp) { ASSERT(ksp); mode_t mode; /* Specify permission modes for different kstats */ if (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) { mode = 0600; } else { mode = 0644; } kstat_proc_entry_install( &ksp->ks_proc, mode, &proc_kstat_operations, ksp); } EXPORT_SYMBOL(__kstat_install); void kstat_proc_entry_delete(kstat_proc_entry_t *kpep) { kstat_module_t *module = kpep->kpe_owner; if (kpep->kpe_proc) remove_proc_entry(kpep->kpe_name, module->ksm_proc); mutex_enter(&kstat_module_lock); list_del_init(&kpep->kpe_list); /* * Remove top level module directory if it wasn't empty before, but now * is. 
*/
	if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list))
		kstat_delete_module(module);
	mutex_exit(&kstat_module_lock);

}
EXPORT_SYMBOL(kstat_proc_entry_delete);

/*
 * Tear down a kstat: remove its proc entry, free the data buffer unless the
 * kstat is virtual (buffer owned by the caller), then free the kstat itself.
 */
void
__kstat_delete(kstat_t *ksp)
{
	kstat_proc_entry_delete(&ksp->ks_proc);

	if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
		kmem_free(ksp->ks_data, ksp->ks_data_size);

	ksp->ks_lock = NULL;
	mutex_destroy(&ksp->ks_private_lock);
	kmem_free(ksp, sizeof (*ksp));
}
EXPORT_SYMBOL(__kstat_delete);

/*
 * Initialize the global kstat module lock, module list and id counter.
 * Always returns 0.
 */
int
spl_kstat_init(void)
{
	mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL);
	INIT_LIST_HEAD(&kstat_module_list);
	kstat_id = 0;
	return (0);
}

/*
 * Destroy the global kstat module lock; asserts that no module is left
 * registered.
 */
void
spl_kstat_fini(void)
{
	ASSERT(list_empty(&kstat_module_list));
	mutex_destroy(&kstat_module_lock);
}
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
index c3eacc14239e..534ac72fee7b 100644
--- a/module/zfs/spa_stats.c
+++ b/module/zfs/spa_stats.c
@@ -1,1029 +1,979 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

#include
#include
#include
#include
#include

/*
 * Keeps stats on last N reads per spa_t, disabled by default.
 */
int zfs_read_history = 0;

/*
 * Include cache hits in history, disabled by default.
 */
int zfs_read_history_hits = 0;

/*
 * Keeps stats on the last 100 txgs by default.
*/ int zfs_txg_history = 100; /* * Keeps stats on the last N MMP updates, disabled by default. */ int zfs_multihost_history = 0; /* * ========================================================================== * SPA Read History Routines * ========================================================================== */ /* * Read statistics - Information exported regarding each arc_read call */ typedef struct spa_read_history { hrtime_t start; /* time read completed */ uint64_t objset; /* read from this objset */ uint64_t object; /* read of this object number */ uint64_t level; /* block's indirection level */ uint64_t blkid; /* read of this block id */ char origin[24]; /* read originated from here */ uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ pid_t pid; /* PID of task doing read */ char comm[16]; /* process name of task doing read */ procfs_list_node_t srh_node; } spa_read_history_t; static int spa_read_history_show_header(struct seq_file *f) { seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", "level", "blkid", "aflags", "origin", "pid", "process"); return (0); } static int spa_read_history_show(struct seq_file *f, void *data) { spa_read_history_t *srh = (spa_read_history_t *)data; seq_printf(f, "%-8llu %-16llu 0x%-6llx " "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", (u_longlong_t)srh->srh_node.pln_id, srh->start, (longlong_t)srh->objset, (longlong_t)srh->object, (longlong_t)srh->level, (longlong_t)srh->blkid, srh->aflags, srh->origin, srh->pid, srh->comm); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) { spa_read_history_t *srh; while (shl->size > size) { srh = list_remove_head(&shl->procfs_list.pl_list); ASSERT3P(srh, !=, NULL); kmem_free(srh, sizeof (spa_read_history_t)); shl->size--; } if (size == 0) ASSERT(list_is_empty(&shl->procfs_list.pl_list)); } static 
int
spa_read_history_clear(procfs_list_t *procfs_list)
{
	/* Drop every read history record; always returns 0. */
	spa_history_list_t *shl = procfs_list->pl_private;
	mutex_enter(&procfs_list->pl_lock);
	spa_read_history_truncate(shl, 0);
	mutex_exit(&procfs_list->pl_lock);
	return (0);
}

/*
 * Register the per-pool "reads" procfs list (mode 0600) with its show,
 * header and clear callbacks.
 */
static void
spa_read_history_init(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;

	shl->size = 0;

	/* lets spa_read_history_clear() get back to the history list */
	shl->procfs_list.pl_private = shl;
	procfs_list_install("zfs",
	    spa_name(spa),
	    "reads",
	    0600,
	    &shl->procfs_list,
	    spa_read_history_show,
	    spa_read_history_show_header,
	    spa_read_history_clear,
	    offsetof(spa_read_history_t, srh_node));
}

/*
 * Unregister the "reads" procfs list and free all remaining records.
 */
static void
spa_read_history_destroy(spa_t *spa)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;
	procfs_list_uninstall(&shl->procfs_list);
	spa_read_history_truncate(shl, 0);
	procfs_list_destroy(&shl->procfs_list);
}

/*
 * Append a read record for bookmark 'zb' and trim the list down to
 * zfs_read_history entries.  Nothing is recorded when the history is
 * disabled and already empty, or when the read was a cache hit and
 * zfs_read_history_hits is 0.
 */
void
spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
{
	spa_history_list_t *shl = &spa->spa_stats.read_history;
	spa_read_history_t *srh;

	ASSERT3P(spa, !=, NULL);
	ASSERT3P(zb,  !=, NULL);

	/* shl->size != 0 still falls through so stale records get trimmed */
	if (zfs_read_history == 0 && shl->size == 0)
		return;

	if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
		return;

	srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
	strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
	srh->start  = gethrtime();
	srh->objset = zb->zb_objset;
	srh->object = zb->zb_object;
	srh->level  = zb->zb_level;
	srh->blkid  = zb->zb_blkid;
	srh->aflags = aflags;
	srh->pid    = getpid();

	mutex_enter(&shl->procfs_list.pl_lock);

	procfs_list_add(&shl->procfs_list, srh);
	shl->size++;

	spa_read_history_truncate(shl, zfs_read_history);

	mutex_exit(&shl->procfs_list.pl_lock);
}

/*
 * ==========================================================================
 * SPA TXG History Routines
 * ==========================================================================
 */

/*
 * Txg statistics - Information exported regarding each txg sync
 */

typedef struct spa_txg_history {
	uint64_t	txg;		/* txg id */
	txg_state_t	state;		/* active txg state */
	uint64_t	nread;		/*
number of bytes read */ uint64_t nwritten; /* number of bytes written */ uint64_t reads; /* number of read operations */ uint64_t writes; /* number of write operations */ uint64_t ndirty; /* number of dirty bytes */ hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ procfs_list_node_t sth_node; } spa_txg_history_t; static int spa_txg_history_show_header(struct seq_file *f) { seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s " "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", "ndirty", "nread", "nwritten", "reads", "writes", "otime", "qtime", "wtime", "stime"); return (0); } static int spa_txg_history_show(struct seq_file *f, void *data) { spa_txg_history_t *sth = (spa_txg_history_t *)data; uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; char state; switch (sth->state) { case TXG_STATE_BIRTH: state = 'B'; break; case TXG_STATE_OPEN: state = 'O'; break; case TXG_STATE_QUIESCED: state = 'Q'; break; case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; case TXG_STATE_SYNCED: state = 'S'; break; case TXG_STATE_COMMITTED: state = 'C'; break; default: state = '?'; break; } if (sth->times[TXG_STATE_OPEN]) open = sth->times[TXG_STATE_OPEN] - sth->times[TXG_STATE_BIRTH]; if (sth->times[TXG_STATE_QUIESCED]) quiesce = sth->times[TXG_STATE_QUIESCED] - sth->times[TXG_STATE_OPEN]; if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - sth->times[TXG_STATE_QUIESCED]; if (sth->times[TXG_STATE_SYNCED]) sync = sth->times[TXG_STATE_SYNCED] - sth->times[TXG_STATE_WAIT_FOR_SYNC]; seq_printf(f, "%-8llu %-16llu %-5c %-12llu " "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, (u_longlong_t)sth->ndirty, (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, (u_longlong_t)sync); return (0); } /* Remove oldest elements from list until there are no more 
than 'size' left */ static void spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) { spa_txg_history_t *sth; while (shl->size > size) { sth = list_remove_head(&shl->procfs_list.pl_list); ASSERT3P(sth, !=, NULL); kmem_free(sth, sizeof (spa_txg_history_t)); shl->size--; } if (size == 0) ASSERT(list_is_empty(&shl->procfs_list.pl_list)); } static int spa_txg_history_clear(procfs_list_t *procfs_list) { spa_history_list_t *shl = procfs_list->pl_private; mutex_enter(&procfs_list->pl_lock); spa_txg_history_truncate(shl, 0); mutex_exit(&procfs_list->pl_lock); return (0); } static void spa_txg_history_init(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.txg_history; shl->size = 0; shl->procfs_list.pl_private = shl; procfs_list_install("zfs", spa_name(spa), "txgs", 0644, &shl->procfs_list, spa_txg_history_show, spa_txg_history_show_header, spa_txg_history_clear, offsetof(spa_txg_history_t, sth_node)); } static void spa_txg_history_destroy(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.txg_history; procfs_list_uninstall(&shl->procfs_list); spa_txg_history_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); } /* * Add a new txg to historical record. */ void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) { spa_history_list_t *shl = &spa->spa_stats.txg_history; spa_txg_history_t *sth; if (zfs_txg_history == 0 && shl->size == 0) return; sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); sth->txg = txg; sth->state = TXG_STATE_OPEN; sth->times[TXG_STATE_BIRTH] = birth_time; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sth); shl->size++; spa_txg_history_truncate(shl, zfs_txg_history); mutex_exit(&shl->procfs_list.pl_lock); } /* * Set txg state completion time and increment current state. 
*/
int
spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
    hrtime_t completed_time)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	int error = ENOENT;	/* returned when no record matches 'txg' */

	if (zfs_txg_history == 0)
		return (0);

	mutex_enter(&shl->procfs_list.pl_lock);
	/* search from the tail, where new records are added */
	for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
	    sth = list_prev(&shl->procfs_list.pl_list, sth)) {
		if (sth->txg == txg) {
			sth->times[completed_state] = completed_time;
			sth->state++;
			error = 0;
			break;
		}
	}
	mutex_exit(&shl->procfs_list.pl_lock);

	return (error);
}

/*
 * Set txg IO stats.
 */
static int
spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
    uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
{
	spa_history_list_t *shl = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	int error = ENOENT;	/* returned when no record matches 'txg' */

	if (zfs_txg_history == 0)
		return (0);

	mutex_enter(&shl->procfs_list.pl_lock);
	for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
	    sth = list_prev(&shl->procfs_list.pl_list, sth)) {
		if (sth->txg == txg) {
			sth->nread = nread;
			sth->nwritten = nwritten;
			sth->reads = reads;
			sth->writes = writes;
			sth->ndirty = ndirty;
			error = 0;
			break;
		}
	}
	mutex_exit(&shl->procfs_list.pl_lock);

	return (error);
}

/*
 * Snapshot the root vdev stats and dirty byte count for 'txg' and mark the
 * txg's TXG_STATE_WAIT_FOR_SYNC completion time.  Returns NULL when txg
 * history is disabled; otherwise the caller hands the returned txg_stat_t
 * to spa_txg_history_fini_io(), which frees it.
 */
txg_stat_t *
spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
{
	txg_stat_t *ts;

	if (zfs_txg_history == 0)
		return (NULL);

	ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	ts->txg = txg;
	ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];

	spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());

	return (ts);
}

/*
 * Take the second vdev stats snapshot, store the difference to the first
 * one in the txg's history record and free 'ts'.  A NULL 'ts' is ignored.
 */
void
spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
{
	if (ts == NULL)
		return;

	/* history was switched off in the meantime: only release 'ts' */
	if (zfs_txg_history == 0) {
		kmem_free(ts, sizeof (txg_stat_t));
		return;
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
	spa_config_exit(spa, SCL_CONFIG,
FTAG);

	spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
	/* per-txg I/O is the delta between the two vdev stat snapshots */
	spa_txg_history_set_io(spa, ts->txg,
	    ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
	    ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
	    ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
	    ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
	    ts->ndirty);

	kmem_free(ts, sizeof (txg_stat_t));
}

/*
 * ==========================================================================
 * SPA TX Assign Histogram Routines
 * ==========================================================================
 */

/*
 * Tx statistics - Information exported regarding dmu_tx_assign time.
 */

/*
 * When the kstat is written zero all buckets.  When the kstat is read
 * count the number of trailing buckets set to zero and update ks_ndata
 * such that they are not output.
 */
static int
spa_tx_assign_update(kstat_t *ksp, int rw)
{
	spa_t *spa = ksp->ks_private;
	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
	int i;

	if (rw == KSTAT_WRITE) {
		for (i = 0; i < shk->count; i++)
			((kstat_named_t *)shk->priv)[i].value.ui64 = 0;
	}

	/* i ends up as the index one past the last non-zero bucket */
	for (i = shk->count; i > 0; i--)
		if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0)
			break;

	ksp->ks_ndata = i;
	ksp->ks_data_size = i * sizeof (kstat_named_t);

	return (0);
}

/*
 * Allocate the dmu_tx_assign histogram (shk->count named uint64 buckets,
 * bucket i named "<2^i> ns") and publish it as a virtual named kstat.
 */
static void
spa_tx_assign_init(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
	char *name;
	kstat_named_t *ks;
	kstat_t *ksp;
	int i;

	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);

	shk->count = 42; /* power of two buckets for 1ns to 2,199s */
	/* shk->size is the byte size of the bucket array, not its length */
	shk->size = shk->count * sizeof (kstat_named_t);
	shk->priv = kmem_alloc(shk->size, KM_SLEEP);

	name = kmem_asprintf("zfs/%s", spa_name(spa));

	for (i = 0; i < shk->count; i++) {
		ks = &((kstat_named_t *)shk->priv)[i];
		ks->data_type = KSTAT_DATA_UINT64;
		ks->value.ui64 = 0;
		(void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
		    (u_longlong_t)1 << i);
	}

	ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); shk->kstat = ksp; if (ksp) { ksp->ks_lock = &shk->lock; ksp->ks_data = shk->priv; ksp->ks_ndata = shk->count; ksp->ks_data_size = shk->size; ksp->ks_private = spa; ksp->ks_update = spa_tx_assign_update; kstat_install(ksp); } kmem_strfree(name); } static void spa_tx_assign_destroy(spa_t *spa) { spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; kstat_t *ksp; ksp = shk->kstat; if (ksp) kstat_delete(ksp); kmem_free(shk->priv, shk->size); mutex_destroy(&shk->lock); } void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) { spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; uint64_t idx = 0; while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) idx++; atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64); } -/* - * ========================================================================== - * SPA IO History Routines - * ========================================================================== - */ -static int -spa_io_history_update(kstat_t *ksp, int rw) -{ - if (rw == KSTAT_WRITE) - memset(ksp->ks_data, 0, ksp->ks_data_size); - - return (0); -} - -static void -spa_io_history_init(spa_t *spa) -{ - spa_history_kstat_t *shk = &spa->spa_stats.io_history; - char *name; - kstat_t *ksp; - - mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); - - name = kmem_asprintf("zfs/%s", spa_name(spa)); - - ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); - shk->kstat = ksp; - - if (ksp) { - ksp->ks_lock = &shk->lock; - ksp->ks_private = spa; - ksp->ks_update = spa_io_history_update; - kstat_install(ksp); - } - kmem_strfree(name); -} - -static void -spa_io_history_destroy(spa_t *spa) -{ - spa_history_kstat_t *shk = &spa->spa_stats.io_history; - - if (shk->kstat) - kstat_delete(shk->kstat); - - mutex_destroy(&shk->lock); -} - /* * ========================================================================== * SPA MMP History Routines * 
========================================================================== */ /* * MMP statistics - Information exported regarding attempted MMP writes * For MMP writes issued, fields used as per comments below. * For MMP writes skipped, an entry represents a span of time when * writes were skipped for same reason (error from mmp_random_leaf). * Differences are: * timestamp time first write skipped, if >1 skipped in a row * mmp_delay delay value at timestamp * vdev_guid number of writes skipped * io_error one of enum mmp_error * duration time span (ns) of skipped writes */ typedef struct spa_mmp_history { uint64_t mmp_node_id; /* unique # for updates */ uint64_t txg; /* txg of last sync */ uint64_t timestamp; /* UTC time MMP write issued */ uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ uint64_t vdev_guid; /* unique ID of leaf vdev */ char *vdev_path; int vdev_label; /* vdev label */ int io_error; /* error status of MMP write */ hrtime_t error_start; /* hrtime of start of error period */ hrtime_t duration; /* time from submission to completion */ procfs_list_node_t smh_node; } spa_mmp_history_t; static int spa_mmp_history_show_header(struct seq_file *f) { seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); return (0); } static int spa_mmp_history_show(struct seq_file *f, void *data) { spa_mmp_history_t *smh = (spa_mmp_history_t *)data; char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " "%-10lld %s\n"; char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " "%-10lld %s\n"; seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, (smh->vdev_path ? 
smh->vdev_path : "-")); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) { spa_mmp_history_t *smh; while (shl->size > size) { smh = list_remove_head(&shl->procfs_list.pl_list); if (smh->vdev_path) kmem_strfree(smh->vdev_path); kmem_free(smh, sizeof (spa_mmp_history_t)); shl->size--; } if (size == 0) ASSERT(list_is_empty(&shl->procfs_list.pl_list)); } static int spa_mmp_history_clear(procfs_list_t *procfs_list) { spa_history_list_t *shl = procfs_list->pl_private; mutex_enter(&procfs_list->pl_lock); spa_mmp_history_truncate(shl, 0); mutex_exit(&procfs_list->pl_lock); return (0); } static void spa_mmp_history_init(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.mmp_history; shl->size = 0; shl->procfs_list.pl_private = shl; procfs_list_install("zfs", spa_name(spa), "multihost", 0644, &shl->procfs_list, spa_mmp_history_show, spa_mmp_history_show_header, spa_mmp_history_clear, offsetof(spa_mmp_history_t, smh_node)); } static void spa_mmp_history_destroy(spa_t *spa) { spa_history_list_t *shl = &spa->spa_stats.mmp_history; procfs_list_uninstall(&shl->procfs_list); spa_mmp_history_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); } /* * Set duration in existing "skip" record to how long we have waited for a leaf * vdev to become available. * * Important that we start search at the tail of the list where new * records are inserted, so this is normally an O(1) operation. 
*/
int
spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
{
	spa_history_list_t *history = &spa->spa_stats.mmp_history;
	spa_mmp_history_t *rec;
	int rc = ENOENT;

	if (zfs_multihost_history == 0 && history->size == 0)
		return (0);

	mutex_enter(&history->procfs_list.pl_lock);

	/* walk newest-to-oldest until the record with this id turns up */
	rec = list_tail(&history->procfs_list.pl_list);
	while (rec != NULL && rec->mmp_node_id != mmp_node_id)
		rec = list_prev(&history->procfs_list.pl_list, rec);

	if (rec != NULL) {
		ASSERT3U(rec->io_error, !=, 0);
		/* extend the skipped span and count one more skipped write */
		rec->duration = gethrtime() - rec->error_start;
		rec->vdev_guid++;
		rc = 0;
	}

	mutex_exit(&history->procfs_list.pl_lock);

	return (rc);
}

/*
 * Set MMP write duration and error status in existing record.
 * See comment re: search order above spa_mmp_history_set_skip().
 */
int
spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
    hrtime_t duration)
{
	spa_history_list_t *history = &spa->spa_stats.mmp_history;
	spa_mmp_history_t *rec;
	int rc = ENOENT;

	if (zfs_multihost_history == 0 && history->size == 0)
		return (0);

	mutex_enter(&history->procfs_list.pl_lock);

	rec = list_tail(&history->procfs_list.pl_list);
	while (rec != NULL && rec->mmp_node_id != mmp_node_id)
		rec = list_prev(&history->procfs_list.pl_list, rec);

	if (rec != NULL) {
		/* the record must not have been completed before */
		ASSERT(rec->io_error == 0);
		rec->io_error = io_error;
		rec->duration = duration;
		rc = 0;
	}

	mutex_exit(&history->procfs_list.pl_lock);

	return (rc);
}

/*
 * Add a new MMP historical record.
 * error == 0 : a write was issued.
 * error != 0 : a write was not issued because no leaves were found.
*/ void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, int error) { spa_history_list_t *shl = &spa->spa_stats.mmp_history; spa_mmp_history_t *smh; if (zfs_multihost_history == 0 && shl->size == 0) return; smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP); smh->txg = txg; smh->timestamp = timestamp; smh->mmp_delay = mmp_delay; if (vd) { smh->vdev_guid = vd->vdev_guid; if (vd->vdev_path) smh->vdev_path = kmem_strdup(vd->vdev_path); } smh->vdev_label = label; smh->mmp_node_id = mmp_node_id; if (error) { smh->io_error = error; smh->error_start = gethrtime(); smh->vdev_guid = 1; } mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, smh); shl->size++; spa_mmp_history_truncate(shl, zfs_multihost_history); mutex_exit(&shl->procfs_list.pl_lock); } static void * spa_state_addr(kstat_t *ksp, loff_t n) { if (n == 0) return (ksp->ks_private); /* return the spa_t */ return (NULL); } static int spa_state_data(char *buf, size_t size, void *data) { spa_t *spa = (spa_t *)data; (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa)); return (0); } /* * Return the state of the pool in /proc/spl/kstat/zfs//state. * * This is a lock-less read of the pool's state (unlike using 'zpool', which * can potentially block for seconds). Because it doesn't block, it can useful * as a pool heartbeat value. 
*/
static void
spa_state_init(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.state;
	char *name;
	kstat_t *ksp;

	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);

	name = kmem_asprintf("zfs/%s", spa_name(spa));
	ksp = kstat_create(name, 0, "state", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);

	shk->kstat = ksp;
	if (ksp) {
		ksp->ks_lock = &shk->lock;
		/* no data buffer: output is produced by the raw ops below */
		ksp->ks_data = NULL;
		ksp->ks_private = spa;
		ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
		kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
		kstat_install(ksp);
	}

	kmem_strfree(name);
}

/*
 * Remove the "state" kstat created by spa_state_init() (if any) and destroy
 * its lock.
 */
static void
spa_health_destroy(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.state;
	kstat_t *ksp = shk->kstat;
	if (ksp)
		kstat_delete(ksp);

	mutex_destroy(&shk->lock);
}

/*
 * Names and types of the per-pool iostats counters.  The template is copied
 * into each pool's kstat data on creation and again on a kstat write.
 */
static spa_iostats_t spa_iostats_template = {
	{ "trim_extents_written",		KSTAT_DATA_UINT64 },
	{ "trim_bytes_written",			KSTAT_DATA_UINT64 },
	{ "trim_extents_skipped",		KSTAT_DATA_UINT64 },
	{ "trim_bytes_skipped",			KSTAT_DATA_UINT64 },
	{ "trim_extents_failed",		KSTAT_DATA_UINT64 },
	{ "trim_bytes_failed",			KSTAT_DATA_UINT64 },
	{ "autotrim_extents_written",		KSTAT_DATA_UINT64 },
	{ "autotrim_bytes_written",		KSTAT_DATA_UINT64 },
	{ "autotrim_extents_skipped",		KSTAT_DATA_UINT64 },
	{ "autotrim_bytes_skipped",		KSTAT_DATA_UINT64 },
	{ "autotrim_extents_failed",		KSTAT_DATA_UINT64 },
	{ "autotrim_bytes_failed",		KSTAT_DATA_UINT64 },
	{ "simple_trim_extents_written",	KSTAT_DATA_UINT64 },
	{ "simple_trim_bytes_written",		KSTAT_DATA_UINT64 },
	{ "simple_trim_extents_skipped",	KSTAT_DATA_UINT64 },
	{ "simple_trim_bytes_skipped",		KSTAT_DATA_UINT64 },
	{ "simple_trim_extents_failed",		KSTAT_DATA_UINT64 },
	{ "simple_trim_bytes_failed",		KSTAT_DATA_UINT64 },
};

/*
 * Atomically add 'val' to counter 'stat'; expects a local 'iostats' pointer.
 * NOTE(review): the expansion already ends in ';', so each use followed by
 * ';' yields an extra empty statement.
 */
#define	SPA_IOSTATS_ADD(stat, val) \
    atomic_add_64(&iostats->stat.value.ui64, (val));

/*
 * Add the given TRIM extent/byte counts to the counter set selected by
 * 'type' (manual, auto, or otherwise "simple").  Does nothing when the
 * iostats kstat was not created.
 */
void
spa_iostats_trim_add(spa_t *spa, trim_type_t type,
    uint64_t extents_written, uint64_t bytes_written,
    uint64_t extents_skipped, uint64_t bytes_skipped,
    uint64_t extents_failed, uint64_t bytes_failed)
{
	spa_history_kstat_t *shk =
&spa->spa_stats.iostats;
	kstat_t *ksp = shk->kstat;
	spa_iostats_t *iostats;

	/* kstat creation may have failed in spa_iostats_init() */
	if (ksp == NULL)
		return;

	iostats = ksp->ks_data;
	if (type == TRIM_TYPE_MANUAL) {
		SPA_IOSTATS_ADD(trim_extents_written, extents_written);
		SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
		SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
		SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
		SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
		SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
	} else if (type == TRIM_TYPE_AUTO) {
		SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
		SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
		SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
		SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
		SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
		SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
	} else {
		/* every remaining trim type is accounted as "simple" */
		SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written);
		SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written);
		SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped);
		SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped);
		SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed);
		SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed);
	}
}

/*
 * kstat update callback: a write resets the data to the template, which
 * carries only names and types.  Always returns 0.
 */
static int
spa_iostats_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE) {
		memcpy(ksp->ks_data, &spa_iostats_template,
		    sizeof (spa_iostats_t));
	}

	return (0);
}

/*
 * Create the per-pool "iostats" named kstat as a virtual kstat whose data
 * buffer is allocated here and initialized from the template.
 */
static void
spa_iostats_init(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.iostats;

	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);

	char *name = kmem_asprintf("zfs/%s", spa_name(spa));
	/* ndata: number of kstat_named_t members in spa_iostats_t */
	kstat_t *ksp = kstat_create(name, 0, "iostats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	shk->kstat = ksp;
	if (ksp) {
		int size = sizeof (spa_iostats_t);
		ksp->ks_lock = &shk->lock;
		ksp->ks_private = spa;
		ksp->ks_update = spa_iostats_update;
		/* freed again in spa_iostats_destroy() */
		ksp->ks_data = kmem_alloc(size, KM_SLEEP);
		memcpy(ksp->ks_data, &spa_iostats_template, size);
kstat_install(ksp);
	}

	kmem_strfree(name);
}

/*
 * Free the iostats data buffer, remove the kstat and destroy its lock.
 */
static void
spa_iostats_destroy(spa_t *spa)
{
	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
	kstat_t *ksp = shk->kstat;
	if (ksp) {
		/*
		 * NOTE(review): ks_data is freed while the kstat is still
		 * installed; confirm no reader can run before kstat_delete().
		 */
		kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
		kstat_delete(ksp);
	}

	mutex_destroy(&shk->lock);
}

/*
 * Set up all per-pool statistics: read, txg and MMP histories, the tx
 * assign histogram, the state kstat and the iostats kstat.
 */
void
spa_stats_init(spa_t *spa)
{
	spa_read_history_init(spa);
	spa_txg_history_init(spa);
	spa_tx_assign_init(spa);
-	spa_io_history_init(spa);
	spa_mmp_history_init(spa);
	spa_state_init(spa);
	spa_iostats_init(spa);
}

/*
 * Tear down everything set up by spa_stats_init().
 */
void
spa_stats_destroy(spa_t *spa)
{
	spa_iostats_destroy(spa);
	spa_health_destroy(spa);
	spa_tx_assign_destroy(spa);
	spa_txg_history_destroy(spa);
	spa_read_history_destroy(spa);
-	spa_io_history_destroy(spa);
	spa_mmp_history_destroy(spa);
}

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW,
    "Historical statistics for the last N reads");

ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW,
    "Include cache hits in read history");

ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW,
    "Historical statistics for the last N txgs");

ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW,
    "Historical statistics for last N multihost writes");
/* END CSTYLED */
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 25a4bc69cc23..198861edb816 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -1,1164 +1,1117 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include -#include -#include #include /* * ZFS I/O Scheduler * --------------- * * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The * I/O scheduler determines when and in what order those operations are * issued. The I/O scheduler divides operations into five I/O classes * prioritized in the following order: sync read, sync write, async read, * async write, and scrub/resilver. Each queue defines the minimum and * maximum number of concurrent operations that may be issued to the device. * In addition, the device has an aggregate maximum. Note that the sum of the * per-queue minimums must not exceed the aggregate maximum. If the * sum of the per-queue maximums exceeds the aggregate maximum, then the * number of active i/os may reach zfs_vdev_max_active, in which case no * further i/os will be issued regardless of whether all per-queue * minimums have been met. * * For many physical devices, throughput increases with the number of * concurrent operations, but latency typically suffers. Further, physical * devices typically have a limit at which more concurrent operations have no * effect on throughput or can actually cause it to decrease. * * The scheduler selects the next operation to issue by first looking for an * I/O class whose minimum has not been satisfied. Once all are satisfied and * the aggregate maximum has not been hit, the scheduler looks for classes * whose maximum has not been satisfied. Iteration through the I/O classes is * done in the order specified above. 
No further operations are issued if the * aggregate maximum number of concurrent operations has been hit or if there * are no operations queued for an I/O class that has not hit its maximum. * Every time an i/o is queued or an operation completes, the I/O scheduler * looks for new operations to issue. * * All I/O classes have a fixed maximum number of outstanding operations * except for the async write class. Asynchronous writes represent the data * that is committed to stable storage during the syncing stage for * transaction groups (see txg.c). Transaction groups enter the syncing state * periodically so the number of queued async writes will quickly burst up and * then bleed down to zero. Rather than servicing them as quickly as possible, * the I/O scheduler changes the maximum number of active async write i/os * according to the amount of dirty data in the pool (see dsl_pool.c). Since * both throughput and latency typically increase with the number of * concurrent operations issued to physical devices, reducing the burstiness * in the number of concurrent operations also stabilizes the response time of * operations from other -- and in particular synchronous -- queues. In broad * strokes, the I/O scheduler will issue more concurrent operations from the * async write queue as there's more dirty data in the pool. * * Async Writes * * The number of concurrent operations issued for the async write I/O class * follows a piece-wise linear function defined by a few adjustable points. 
* * | o---------| <-- zfs_vdev_async_write_max_active * ^ | /^ | * | | / | | * active | / | | * I/O | / | | * count | / | | * | / | | * |------------o | | <-- zfs_vdev_async_write_min_active * 0|____________^______|_________| * 0% | | 100% of zfs_dirty_data_max * | | * | `-- zfs_vdev_async_write_active_max_dirty_percent * `--------- zfs_vdev_async_write_active_min_dirty_percent * * Until the amount of dirty data exceeds a minimum percentage of the dirty * data allowed in the pool, the I/O scheduler will limit the number of * concurrent operations to the minimum. As that threshold is crossed, the * number of concurrent operations issued increases linearly to the maximum at * the specified maximum percentage of the dirty data allowed in the pool. * * Ideally, the amount of dirty data on a busy pool will stay in the sloped * part of the function between zfs_vdev_async_write_active_min_dirty_percent * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the * maximum percentage, this indicates that the rate of incoming data is * greater than the rate that the backend storage can handle. In this case, we * must further throttle incoming writes (see dmu_tx_delay() for details). */ /* * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ uint32_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the * number of active i/os is < zfs_vdev_max_active, then the min_active comes * into play. We will send min_active from each queue round-robin, and then * send from queues in the order defined by zio_priority_t up to max_active. * Some queues have additional mechanisms to limit number of active I/Os in * addition to min_active and max_active, see below. * * In general, smaller max_active's will lead to lower latency of synchronous * operations. Larger max_active's may lead to higher overall throughput, * depending on underlying storage. 
* * The ratio of the queues' max_actives determines the balance of performance * between reads, writes, and scrubs. E.g., increasing * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete * more quickly, but reads and writes to have higher latency and lower * throughput. */ uint32_t zfs_vdev_sync_read_min_active = 10; uint32_t zfs_vdev_sync_read_max_active = 10; uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; uint32_t zfs_vdev_async_write_min_active = 2; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 3; uint32_t zfs_vdev_removal_min_active = 1; uint32_t zfs_vdev_removal_max_active = 2; uint32_t zfs_vdev_initializing_min_active = 1; uint32_t zfs_vdev_initializing_max_active = 1; uint32_t zfs_vdev_trim_min_active = 1; uint32_t zfs_vdev_trim_max_active = 2; uint32_t zfs_vdev_rebuild_min_active = 1; uint32_t zfs_vdev_rebuild_max_active = 3; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent * dirty data, use zfs_vdev_async_write_min_active. When it has more than * zfs_vdev_async_write_active_max_dirty_percent, use * zfs_vdev_async_write_max_active. The value is linearly interpolated * between min and max. Both percentages are taken relative to * zfs_dirty_data_max (see vdev_queue_max_async_writes()). */ int zfs_vdev_async_write_active_min_dirty_percent = 30; int zfs_vdev_async_write_active_max_dirty_percent = 60; /* * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), * the number of concurrently-active I/O's is limited to *_min_active, unless * the vdev is "idle". When there are no interactive I/Os active (sync or * async), and zfs_vdev_nia_delay I/Os have completed since the last * interactive I/O, then the vdev is considered to be "idle", and the number * of concurrently-active non-interactive I/O's is increased to *_max_active.
*/ uint_t zfs_vdev_nia_delay = 5; /* * Some HDDs tend to prioritize sequential I/O so high that concurrent * random I/O latency reaches several seconds. On some HDDs it happens * even if sequential I/Os are submitted one at a time, and so setting * *_max_active to 1 does not help. To prevent non-interactive I/Os, like * scrub, from monopolizing the device no more than zfs_vdev_nia_credit * I/Os can be sent while there are outstanding incomplete interactive * I/Os. This enforced wait ensures the HDD services the interactive I/O * within a reasonable amount of time. */ uint_t zfs_vdev_nia_credit = 5; /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. * For read I/Os, we also aggregate across small adjacency gaps; for writes * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ int zfs_vdev_aggregation_limit = 1 << 20; int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; /* * Define the queue depth percentage for each top-level vdev. This percentage * is used in conjunction with zfs_vdev_async_write_max_active to determine * how many allocations a specific top-level vdev should handle. Once the * queue depth reaches * zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 * then the allocator will stop allocating blocks on that top-level device. * The default kernel setting is 1000% which will yield 100 allocations per * device. For userland testing, the default setting is 300% which equates * to 30 allocations per device. */ #ifdef _KERNEL int zfs_vdev_queue_depth_pct = 1000; #else int zfs_vdev_queue_depth_pct = 300; #endif /* * When performing allocations for a given metaslab, we want to make sure that * there are enough IOs to aggregate together to improve throughput.
We want to * ensure that there are at least 128k worth of IOs that can be aggregated, and * we assume that the average allocation size is 4k, so we need the queue depth * to be 32 per allocator to get good aggregation of sequential writes. */ int zfs_vdev_def_queue_depth = 32; /* * Allow TRIM I/Os to be aggregated. This should normally not be needed since * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted * by the TRIM code in zfs_trim.c. */ int zfs_vdev_aggregate_trim = 0; /* * Tree comparator that orders zios by io_offset. Equal offsets fall back to * TREE_PCMP(z1, z2) as the tie-break (presumably a comparison of the zio * pointers themselves; confirm against the macro definition). */ static int vdev_queue_offset_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; int cmp = TREE_CMP(z1->io_offset, z2->io_offset); if (likely(cmp)) return (cmp); return (TREE_PCMP(z1, z2)); } /* Return the tree of queued zios for priority class p. */ static inline avl_tree_t * vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) { return (&vq->vq_class[p].vqc_queued_tree); } /* * Return the offset-sorted tree for i/o type t. Only read, write and trim * are expected (asserted); any other value ends up at the trim tree. */ static inline avl_tree_t * vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) { ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); if (t == ZIO_TYPE_READ) return (&vq->vq_read_offset_tree); else if (t == ZIO_TYPE_WRITE) return (&vq->vq_write_offset_tree); else return (&vq->vq_trim_offset_tree); } /* * Tree comparator that orders zios by io_timestamp, with the same * TREE_PCMP(z1, z2) tie-break as vdev_queue_offset_compare(). */ static int vdev_queue_timestamp_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); if (likely(cmp)) return (cmp); return (TREE_PCMP(z1, z2)); } /* * Return the min_active limit for priority class p. For scrub, removal, * initializing and rebuild the limit is additionally capped by vq_nia_credit * whenever vq_ia_active is non-zero. An unknown priority panics. */ static int vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_min_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_min_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_min_active); case ZIO_PRIORITY_ASYNC_WRITE: return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: return (vq->vq_ia_active == 0 ?
zfs_vdev_scrub_min_active : MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active)); case ZIO_PRIORITY_REMOVAL: return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active : MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active)); case ZIO_PRIORITY_INITIALIZING: return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active: MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active)); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); case ZIO_PRIORITY_REBUILD: return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active : MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active)); default: panic("invalid priority %u", p); return (0); } } /* * Return the number of async writes that may be active at once, based on the * pool's dirty data: the maximum when the spa has no dsl_pool yet, when dirty * data exceeds the max threshold or when a sync task is pending; the minimum * below the min threshold; and a linear interpolation in between. Both * thresholds are percentages of zfs_dirty_data_max. */ static int vdev_queue_max_async_writes(spa_t *spa) { int writes; uint64_t dirty = 0; dsl_pool_t *dp = spa_get_dsl(spa); uint64_t min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint64_t max_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_max_dirty_percent / 100; /* * Async writes may occur before the assignment of the spa's * dsl_pool_t if a self-healing zio is issued prior to the * completion of dmu_objset_open_impl(). */ if (dp == NULL) return (zfs_vdev_async_write_max_active); /* * Sync tasks correspond to interactive user actions. To reduce the * execution time of those actions we push data out as fast as possible.
*/ dirty = dp->dp_dirty_total; if (dirty > max_bytes || spa_has_pending_synctask(spa)) return (zfs_vdev_async_write_max_active); /* * Use <= rather than <: together with the check above this guarantees * min_bytes < dirty <= max_bytes below, so the interpolation can never * divide by zero when both dirty percent tunables are set to the same * value. At dirty == min_bytes the interpolation yields the minimum * anyway, so every other result is unchanged. */ if (dirty <= min_bytes) return (zfs_vdev_async_write_min_active); /* * linear interpolation: * slope = (max_writes - min_writes) / (max_bytes - min_bytes) * move right by min_bytes * move up by min_writes */ writes = (dirty - min_bytes) * (zfs_vdev_async_write_max_active - zfs_vdev_async_write_min_active) / (max_bytes - min_bytes) + zfs_vdev_async_write_min_active; ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); return (writes); } /* * Return the max_active limit for priority class p. Async writes scale with * the amount of dirty data. Scrub, removal, initializing and rebuild are * held to MIN(vq_nia_credit, *_min_active) while interactive i/os are active, * to MAX(1, *_min_active) until vq_nia_credit reaches zfs_vdev_nia_delay, and * get *_max_active after that. An unknown priority panics. */ static int vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_max_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_max_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_scrub_min_active)); return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_REMOVAL: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_removal_min_active)); return (zfs_vdev_removal_max_active); case ZIO_PRIORITY_INITIALIZING: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_initializing_min_active)); return (zfs_vdev_initializing_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); case ZIO_PRIORITY_REBUILD: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active)); } else if (vq->vq_nia_credit <
zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_rebuild_min_active)); return (zfs_vdev_rebuild_max_active); default: panic("invalid priority %u", p); return (0); } } /* * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if * there is no eligible class. On success vq_last_prio is set to the * returned class. */ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { spa_t *spa = vq->vq_vdev->vdev_spa; zio_priority_t p, n; if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* * Find a queue that has not reached its minimum # outstanding i/os. * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. */ for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_min_active(vq, p)) { vq->vq_last_prio = p; return (p); } } /* * If we haven't found a queue, look for one that hasn't reached its * maximum # outstanding i/os.
*/ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_max_active(spa, vq, p)) { vq->vq_last_prio = p; return (p); } } /* No eligible queued i/os */ return (ZIO_PRIORITY_NUM_QUEUEABLE); } /* * Set up vd's queue: the lock, the tree of active zios, one offset-sorted * tree per i/o type (read, write, trim) and one tree of queued zios per * priority class. The active tree and the per-class trees both link zios * through io_queue_node; the per-type trees use io_offset_node. */ void vdev_queue_init(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_queue_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); /* * The synchronous/trim i/o queues are dispatched in FIFO rather * than LBA order. This provides more consistent latency for * these i/os.
*/ if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || p == ZIO_PRIORITY_TRIM) { compfn = vdev_queue_timestamp_compare; } else { compfn = vdev_queue_offset_compare; } avl_create(vdev_queue_class_tree(vq, p), compfn, sizeof (zio_t), offsetof(struct zio, io_queue_node)); } vq->vq_last_offset = 0; } /* Destroy every tree created by vdev_queue_init(), then the queue lock. */ void vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) avl_destroy(vdev_queue_class_tree(vq, p)); avl_destroy(&vq->vq_active_tree); avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM)); mutex_destroy(&vq->vq_lock); } /* Queue zio by inserting it into its priority class tree and its i/o type tree. */ static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - spa_t *spa = zio->io_spa; - spa_history_kstat_t *shk = &spa->spa_stats.io_history; - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); - - if (shk->kstat != NULL) { - mutex_enter(&shk->lock); - kstat_waitq_enter(shk->kstat->ks_data); - mutex_exit(&shk->lock); - } } /* Take a queued zio back out of its priority class tree and its i/o type tree. */ static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - spa_t *spa = zio->io_spa; - spa_history_kstat_t *shk = &spa->spa_stats.io_history; - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); - - if (shk->kstat != NULL) { - mutex_enter(&shk->lock); - kstat_waitq_exit(shk->kstat->ks_data); - mutex_exit(&shk->lock); - } } /* * Scrub, removal, initializing and rebuild are the non-interactive classes; * every other priority counts as interactive. */ static boolean_t vdev_queue_is_interactive(zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SCRUB: case ZIO_PRIORITY_REMOVAL: case ZIO_PRIORITY_INITIALIZING: case ZIO_PRIORITY_REBUILD: return (B_FALSE); default: return (B_TRUE); } } /* * Account for zio becoming active: bump its class count, update the * interactive and non-interactive credit bookkeeping and add it to the * active tree. Must be called with vq_lock held (asserted). */ static void vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { - spa_t *spa = zio->io_spa; - spa_history_kstat_t *shk =
&spa->spa_stats.io_history; - ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active++; if (vdev_queue_is_interactive(zio->io_priority)) { /* first interactive i/o to become active: restart the credit at 1 */ if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { /* a non-interactive i/o issued while interactive ones are active uses up one credit */ vq->vq_nia_credit--; } avl_add(&vq->vq_active_tree, zio); - - if (shk->kstat != NULL) { - mutex_enter(&shk->lock); - kstat_runq_enter(shk->kstat->ks_data); - mutex_exit(&shk->lock); - } } /* * Account for an active zio completing: drop its class count, update the * credit bookkeeping and remove it from the active tree. Must be called * with vq_lock held (asserted). */ static void vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { - spa_t *spa = zio->io_spa; - spa_history_kstat_t *shk = &spa->spa_stats.io_history; - ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active--; if (vdev_queue_is_interactive(zio->io_priority)) { /* * An interactive completion resets the credit: to 0 once no * interactive i/o remains active, otherwise to zfs_vdev_nia_credit. */ if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; else vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; /* no interactive i/o active: each non-interactive completion earns one credit */ avl_remove(&vq->vq_active_tree, zio); - - if (shk->kstat != NULL) { - kstat_io_t *ksio = shk->kstat->ks_data; - - mutex_enter(&shk->lock); - kstat_runq_exit(ksio); - if (zio->io_type == ZIO_TYPE_READ) { - ksio->reads++; - ksio->nread += zio->io_size; - } else if (zio->io_type == ZIO_TYPE_WRITE) { - ksio->writes++; - ksio->nwritten += zio->io_size; - } - mutex_exit(&shk->lock); - } } /* * Done callback installed on aggregate zios; it frees the aggregate's ABD. * Its address is also how the issue paths recognize an aggregate zio. */ static void vdev_queue_agg_io_done(zio_t *aio) { abd_free(aio->io_abd); } /* * Compute the range spanned by two i/os, which is the endpoint of the last * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. */ #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) /* * Sufficiently adjacent io_offset's in ZIOs will be aggregated.
We do this * by creating a gang ABD from the adjacent ZIOs io_abd's. By using * a gang ABD we avoid doing memory copies to and from the parent, * child ZIOs. The gang ABD also accounts for gaps between adjacent * io_offsets by simply getting the zero ABD for writes or allocating * a new ABD for reads and placing them in the gang ABD as well. * * Returns the new aggregate zio, or NULL if zio was not aggregated with * any neighbour (the caller then handles zio on its own). */ static zio_t * vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) { zio_t *first, *last, *aio, *dio, *mandatory, *nio; zio_link_t *zl = NULL; uint64_t maxgap = 0; uint64_t size; uint64_t limit; int maxblocksize; boolean_t stretch = B_FALSE; avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; uint64_t next_offset; abd_t *abd; maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); /* * Pick the aggregation limit for this kind of media. The tunables are * signed ints while limit is unsigned, so negative settings are clamped * to 0 before the assignment. Clamping afterwards cannot work: a * negative value would already have wrapped to a huge unsigned one and * been cut down to maxblocksize instead of disabling aggregation. */ if (vq->vq_vdev->vdev_nonrot) limit = MAX(zfs_vdev_aggregation_limit_non_rotating, 0); else limit = MAX(zfs_vdev_aggregation_limit, 0); limit = MIN(limit, maxblocksize); /* A limit of 0, like the DONT_AGGREGATE flag, disables aggregation. */ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) return (NULL); /* * While TRIM commands could be aggregated based on offset this * behavior is disabled until it's determined to be beneficial. */ if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) return (NULL); /* * I/Os to distributed spares are directly dispatched to the dRAID * leaf vdevs for aggregation. See the comment at the end of the * zio_vdev_io_start() function. */ ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops); first = last = zio; if (zio->io_type == ZIO_TYPE_READ) maxgap = zfs_vdev_read_gap_limit; /* * We can aggregate I/Os that are sufficiently adjacent and of * the same flavor, as expressed by the AGG_INHERIT flags. * The latter requirement is necessary so that certain * attributes of the I/O, such as whether it's a normal I/O * or a scrub/resilver, can be preserved in the aggregate. * We can include optional I/Os, but don't allow them * to begin a range as they add no benefit in that situation.
*/ /* * We keep track of the last non-optional I/O. */ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; /* * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && IO_GAP(dio, first) <= maxgap && dio->io_type == zio->io_type) { first = dio; if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = first; } /* * Skip any initial optional I/Os. */ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { first = AVL_NEXT(t, first); ASSERT(first != NULL); } /* * Walk forward through sufficiently contiguous I/Os. * The aggregation limit does not apply to optional i/os, so that * we can issue contiguous writes even if they are larger than the * aggregation limit. The span may never exceed maxblocksize, though; * that bound is checked for every i/o, optional or not. */ while ((dio = AVL_NEXT(t, last)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && IO_SPAN(first, dio) <= maxblocksize && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = last; } /* * Now that we've established the range of the I/O aggregation * we must decide what to do with trailing optional I/Os. * For reads, there's nothing to do. While we are unable to * aggregate further, it's possible that a trailing optional * I/O would allow the underlying device to aggregate with * subsequent I/Os. We must therefore determine if the next * non-optional I/O is close enough to make aggregation * worthwhile.
*/ if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { /* NOTE: this block-local nio shadows the function-scope nio declared at the top. */ zio_t *nio = last; while ((dio = AVL_NEXT(t, nio)) != NULL && IO_GAP(nio, dio) == 0 && IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { nio = dio; if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { stretch = B_TRUE; break; } } } if (stretch) { /* * We are going to include an optional io in our aggregated * span, thus closing the write gap. Only mandatory i/os can * start aggregated spans, so make sure that the next i/o * after our span is mandatory. */ dio = AVL_NEXT(t, last); dio->io_flags &= ~ZIO_FLAG_OPTIONAL; } else { /* do not include the optional i/o */ while (last != mandatory && last != first) { ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); last = AVL_PREV(t, last); ASSERT(last != NULL); } } /* a span of a single i/o is not worth an aggregate */ if (first == last) return (NULL); size = IO_SPAN(first, last); ASSERT3U(size, <=, maxblocksize); abd = abd_alloc_gang(); if (abd == NULL) return (NULL); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, abd, size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; next_offset = first->io_offset; do { dio = nio; /* fetch the successor first: dio is about to be removed from tree t */ nio = AVL_NEXT(t, dio); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); if (dio->io_offset != next_offset) { /* allocate a buffer for a read gap */ ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ); ASSERT3U(dio->io_offset, >, next_offset); abd = abd_alloc_for_io( dio->io_offset - next_offset, B_TRUE); abd_gang_add(aio->io_abd, abd, B_TRUE); } if (dio->io_abd && (dio->io_size != abd_get_size(dio->io_abd))) { /* abd size not the same as IO size */ ASSERT3U(abd_get_size(dio->io_abd), >, dio->io_size); abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size); abd_gang_add(aio->io_abd, abd, B_TRUE); } else { if (dio->io_flags & ZIO_FLAG_NODATA) { /* allocate a buffer for a write gap */ ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3P(dio->io_abd, ==, NULL);
abd_gang_add(aio->io_abd, abd_get_zeros(dio->io_size), B_TRUE); } else { /* * We pass B_FALSE to abd_gang_add() * because we did not allocate a new * ABD, so it is assumed the caller * will free this ABD. */ abd_gang_add(aio->io_abd, dio->io_abd, B_FALSE); } } next_offset = dio->io_offset + dio->io_size; } while (dio != last); ASSERT3U(abd_get_size(aio->io_abd), ==, aio->io_size); /* * We need to drop the vdev queue's lock during zio_execute() to * avoid a deadlock that we could encounter due to lock order * reversal between vq_lock and io_lock in zio_change_priority(). */ mutex_exit(&vq->vq_lock); while ((dio = zio_walk_parents(aio, &zl)) != NULL) { ASSERT3U(dio->io_type, ==, aio->io_type); zio_vdev_io_bypass(dio); zio_execute(dio); } mutex_enter(&vq->vq_lock); return (aio); } /* * Pick the next zio to issue, aggregate it with its neighbours when * possible and account for it as active. Returns NULL when no class is * eligible. Must be called with vq_lock held (asserted); the lock is * dropped and retaken while aggregating and while discarding NODATA zios. */ static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq) { zio_t *zio, *aio; zio_priority_t p; avl_index_t idx; avl_tree_t *tree; again: ASSERT(MUTEX_HELD(&vq->vq_lock)); p = vdev_queue_class_to_issue(vq); if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { /* No eligible queued i/os */ return (NULL); } /* * For LBA-ordered queues (async / scrub / initializing), issue the * i/o which follows the most recently issued i/o in LBA (offset) order. * * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. */ tree = vdev_queue_class_tree(vq, p); vq->vq_io_search.io_timestamp = 0; vq->vq_io_search.io_offset = vq->vq_last_offset - 1; VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); zio = avl_nearest(tree, idx, AVL_AFTER); /* nothing follows the search key: wrap around to the first entry */ if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); if (aio != NULL) zio = aio; else vdev_queue_io_remove(vq, zio); /* * If the I/O is or was optional and therefore has no data, we need to * simply discard it. We need to drop the vdev queue's lock to avoid a * deadlock that we could encounter since this I/O will complete * immediately.
*/ if (zio->io_flags & ZIO_FLAG_NODATA) { mutex_exit(&vq->vq_lock); zio_vdev_io_bypass(zio); zio_execute(zio); mutex_enter(&vq->vq_lock); goto again; } vdev_queue_pending_add(vq, zio); vq->vq_last_offset = zio->io_offset + zio->io_size; return (zio); } /* * Queue zio on its vdev and return the zio the caller should issue now. * Returns NULL when there is nothing for the caller to issue: either no * class is eligible, or the chosen zio is an aggregate, which is dispatched * here with zio_nowait(). A zio that already carries ZIO_FLAG_DONT_QUEUE * is returned unchanged. */ zio_t * vdev_queue_io(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) return (zio); /* * Children i/os inherit their parent's priority, which might * not match the child's i/o type. Fix it up here. */ if (zio->io_type == ZIO_TYPE_READ) { ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM); if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && zio->io_priority != ZIO_PRIORITY_REMOVAL && zio->io_priority != ZIO_PRIORITY_INITIALIZING && zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } } else if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM); if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && zio->io_priority != ZIO_PRIORITY_REMOVAL && zio->io_priority != ZIO_PRIORITY_INITIALIZING && zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } } else { ASSERT(zio->io_type == ZIO_TYPE_TRIM); ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); } zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; mutex_enter(&vq->vq_lock); zio->io_timestamp = gethrtime(); vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq); mutex_exit(&vq->vq_lock); if (nio == NULL) return (NULL); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); return (NULL); } return (nio); } /* * Completion hook for a zio that was active on this vdev: take it off the * active set, record its timing, then issue whatever has become eligible. */ void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; mutex_enter(&vq->vq_lock); vdev_queue_pending_remove(vq, zio); zio->io_delta = gethrtime() - zio->io_timestamp; vq->vq_io_complete_ts = gethrtime(); vq->vq_io_delta_ts =
vq->vq_io_complete_ts - zio->io_timestamp; /* keep issuing until no class is eligible; vq_lock is dropped around each dispatch */ while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); } else { zio_vdev_io_reissue(nio); zio_execute(nio); } mutex_enter(&vq->vq_lock); } mutex_exit(&vq->vq_lock); } /* * Change the priority of zio to the requested one. The request is first * coerced to a class that fits the zio's type: reads may become sync read, * async read or scrub (anything else turns into async read); writes may * become sync or async write (anything else turns into async write). */ void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; avl_tree_t *tree; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio * code to issue IOs without adding them to the vdev queue. In this * case, the zio is already going to be issued as quickly as possible * and so it doesn't need any reprioritization to help. */ if (zio->io_priority == ZIO_PRIORITY_NOW) return; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (zio->io_type == ZIO_TYPE_READ) { if (priority != ZIO_PRIORITY_SYNC_READ && priority != ZIO_PRIORITY_ASYNC_READ && priority != ZIO_PRIORITY_SCRUB) priority = ZIO_PRIORITY_ASYNC_READ; } else { ASSERT(zio->io_type == ZIO_TYPE_WRITE); if (priority != ZIO_PRIORITY_SYNC_WRITE && priority != ZIO_PRIORITY_ASYNC_WRITE) priority = ZIO_PRIORITY_ASYNC_WRITE; } mutex_enter(&vq->vq_lock); /* * If the zio is in none of the queues we can simply change * the priority. If the zio is waiting to be submitted we must * remove it from the queue and re-insert it with the new priority. * Otherwise, the zio is currently active and we cannot change its * priority.
*/ tree = vdev_queue_class_tree(vq, zio->io_priority); if (avl_find(tree, zio, NULL) == zio) { avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); zio->io_priority = priority; avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { zio->io_priority = priority; } mutex_exit(&vq->vq_lock); } /* * As these two methods are only used for load calculations we're not * concerned if we get an incorrect value on 32bit platforms due to lack of * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. * * vdev_queue_length() returns the number of zios in the active tree. */ int vdev_queue_length(vdev_t *vd) { return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); } /* * Return vq_last_offset: the end offset (io_offset + io_size) of the zio * most recently handed out by vdev_queue_io_to_issue(), or 0 if none yet. */ uint64_t vdev_queue_last_offset(vdev_t *vd) { return (vd->vdev_queue.vq_last_offset); } /* * NOTE(review): zfs_vdev_max_active and the *_min_active and *_max_active * limits are defined as uint32_t, and zfs_vdev_nia_credit and * zfs_vdev_nia_delay as uint_t, yet all of them are registered as INT in * the list below; confirm that the signedness mismatch is intended. */ /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW, "Max vdev I/O aggregation size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT, ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW, "Allow TRIM I/O to be aggregated"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW, "Aggregate read I/O over gap"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW, "Aggregate write I/O over gap"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW, "Maximum number of active I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT, ZMOD_RW, "Async write concurrency max threshold"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT, ZMOD_RW, "Async write concurrency min threshold"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW, "Max active async read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW, "Min active async read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_,
async_write_max_active, INT, ZMOD_RW, "Max active async write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW, "Min active async write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW, "Max active initializing I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW, "Min active initializing I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW, "Max active removal I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW, "Min active removal I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW, "Max active scrub I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW, "Min active scrub I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW, "Max active sync read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW, "Min active sync read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW, "Max active sync write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW, "Min active sync write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW, "Max active trim/discard I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW, "Min active trim/discard I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, "Max active rebuild I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, "Min active rebuild I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW, "Number of non-interactive I/Os to allow in sequence"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW, "Number of non-interactive I/Os before _max_active");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); /* * NOTE(review): zfs_vdev_def_queue_depth is defined in this file but has no * ZFS_MODULE_PARAM entry in this list; confirm whether it is registered * elsewhere or is intentionally not tunable. */ /* END CSTYLED */