diff --git a/include/os/freebsd/spl/sys/ccompile.h b/include/os/freebsd/spl/sys/ccompile.h index f9b09d4304f7..4dda0756f3c2 100644 --- a/include/os/freebsd/spl/sys/ccompile.h +++ b/include/os/freebsd/spl/sys/ccompile.h @@ -1,374 +1,378 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_CCOMPILE_H #define _SYS_CCOMPILE_H /* * This file contains definitions designed to enable different compilers * to be used harmoniously on Solaris systems. */ #ifdef __cplusplus extern "C" { #endif /* * Allow for version tests for compiler bugs and features. */ #if defined(__GNUC__) #define __GNUC_VERSION \ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #else #define __GNUC_VERSION 0 #endif #if defined(__ATTRIBUTE_IMPLEMENTED) || defined(__GNUC__) /* * analogous to lint's PRINTFLIKEn */ #define __sun_attr___PRINTFLIKE__(__n) \ __attribute__((__format__(printf, __n, (__n)+1))) #define __sun_attr___VPRINTFLIKE__(__n) \ __attribute__((__format__(printf, __n, 0))) /* * Handle the kernel printf routines that can take '%b' too */ #if __GNUC_VERSION < 30402 /* * XX64 at least this doesn't work correctly yet with 3.4.1 anyway! */ #define __sun_attr___KPRINTFLIKE__ __sun_attr___PRINTFLIKE__ #define __sun_attr___KVPRINTFLIKE__ __sun_attr___VPRINTFLIKE__ #else #define __sun_attr___KPRINTFLIKE__(__n) \ __attribute__((__format__(cmn_err, __n, (__n)+1))) #define __sun_attr___KVPRINTFLIKE__(__n) \ __attribute__((__format__(cmn_err, __n, 0))) #endif /* * This one's pretty obvious -- the function never returns */ #define __sun_attr___noreturn__ __attribute__((__noreturn__)) /* * This is an appropriate label for functions that do not * modify their arguments, e.g. strlen() */ #define __sun_attr___pure__ __attribute__((__pure__)) /* * This is a stronger form of __pure__. Can be used for functions * that do not modify their arguments and don't depend on global * memory. 
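The attribute wrappers above are easiest to see in use. A minimal userland sketch, assuming gcc or clang; my_log() and square() are illustrative names, only __PRINTFLIKE and __CONST come from this header:

#include <stdarg.h>
#include <stdio.h>

/* Argument 1 is the format string; the checked varargs start at 2. */
static void my_log(const char *fmt, ...) __PRINTFLIKE(1);

/* Reads nothing but its argument, no side effects: safe to fold/CSE. */
static int square(int x) __CONST;

static void
my_log(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vprintf(fmt, ap);
	va_end(ap);
}

static int
square(int x)
{
	return (x * x);
}

int
main(void)
{
	my_log("%d\n", square(7));	/* my_log("%s", 7) would warn */
	return (0);
}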
*/ #define __sun_attr___const__ __attribute__((__const__)) /* * structure packing like #pragma pack(1) */ #define __sun_attr___packed__ __attribute__((__packed__)) #define ___sun_attr_inner(__a) __sun_attr_##__a #define __sun_attr__(__a) ___sun_attr_inner __a #else /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */ #define __sun_attr__(__a) #endif /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */ /* * Shorthand versions for readability */ #define __PRINTFLIKE(__n) __sun_attr__((__PRINTFLIKE__(__n))) #define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n))) #define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n))) #define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n))) #ifdef _KERNEL #define __NORETURN __sun_attr__((__noreturn__)) #endif #define __CONST __sun_attr__((__const__)) #define __PURE __sun_attr__((__pure__)) #if (defined(ZFS_DEBUG) || !defined(NDEBUG)) && !defined(DEBUG) #define DEBUG #endif #define EXPORT_SYMBOL(x) #define MODULE_AUTHOR(s) #define MODULE_DESCRIPTION(s) #define MODULE_LICENSE(s) #define module_param(a, b, c) #define module_param_call(a, b, c, d, e) #define module_param_named(a, b, c, d) #define MODULE_PARM_DESC(a, b) #define asm __asm #ifdef ZFS_DEBUG #undef NDEBUG #endif #ifndef EINTEGRITY #define EINTEGRITY 97 /* EINTEGRITY is new in 13 */ #endif /* * These are bespoke errnos used in ZFS. We map them to their closest FreeBSD * equivalents. This gives us more useful error messages from strerror(3). */ #define ECKSUM EINTEGRITY #define EFRAGS ENOSPC /* Similar for ENOTACTIVE */ #define ENOTACTIVE ECANCELED #define EREMOTEIO EREMOTE #define ECHRNG ENXIO #define ETIME ETIMEDOUT #define O_LARGEFILE 0 #define O_RSYNC 0 #define O_DSYNC 0 #define KMALLOC_MAX_SIZE MAXPHYS #ifdef _KERNEL +#if !defined(LOCORE) && !defined(_ASM) typedef unsigned long long u_longlong_t; typedef long long longlong_t; -#include + typedef void zfs_kernel_param_t; +#endif #define param_set_charp(a, b) (0) #define ATTR_UID AT_UID #define ATTR_GID AT_GID #define ATTR_MODE AT_MODE #define ATTR_XVATTR AT_XVATTR #define ATTR_CTIME AT_CTIME #define ATTR_MTIME AT_MTIME #define ATTR_ATIME AT_ATIME #define vmem_free zfs_kmem_free #define vmem_zalloc(size, flags) zfs_kmem_alloc(size, flags | M_ZERO) #define vmem_alloc zfs_kmem_alloc #define MUTEX_NOLOCKDEP 0 #define RW_NOLOCKDEP 0 +#if !defined(LOCORE) && !defined(_ASM) +#include +#include #if __FreeBSD_version < 1300051 #define vm_page_valid(m) (m)->valid = VM_PAGE_BITS_ALL #define vm_page_do_sunbusy(m) #define vm_page_none_valid(m) ((m)->valid == 0) #else #define vm_page_do_sunbusy(m) vm_page_sunbusy(m) #endif #if __FreeBSD_version < 1300074 #define VOP_UNLOCK1(x) VOP_UNLOCK(x, 0) #else #define VOP_UNLOCK1(x) VOP_UNLOCK(x) #endif #if __FreeBSD_version < 1300064 #define VN_IS_DOOMED(vp) ((vp)->v_iflag & VI_DOOMED) #endif #if __FreeBSD_version < 1300068 #define VFS_VOP_VECTOR_REGISTER(x) #endif #if __FreeBSD_version >= 1300076 #define getnewvnode_reserve_() getnewvnode_reserve() #else #define getnewvnode_reserve_() getnewvnode_reserve(1) #endif - struct hlist_node { struct hlist_node *next, **pprev; }; struct hlist_head { struct hlist_node *first; }; typedef struct { volatile int counter; } atomic_t; /* BEGIN CSTYLED */ #define hlist_for_each(p, head) \ for (p = (head)->first; p; p = (p)->next) #define hlist_entry(ptr, type, field) container_of(ptr, type, field) #define container_of(ptr, type, member) \ ({ \ const __typeof(((type *)0)->member) *__p = (ptr); \ (type *)((uintptr_t)__p - offsetof(type, member)); \ }) /* END CSTYLED */ static inline void
hlist_add_head(struct hlist_node *n, struct hlist_head *h) { n->next = h->first; if (h->first != NULL) h->first->pprev = &n->next; WRITE_ONCE(h->first, n); n->pprev = &h->first; } static inline void hlist_del(struct hlist_node *n) { WRITE_ONCE(*(n->pprev), n->next); if (n->next != NULL) n->next->pprev = n->pprev; } /* BEGIN CSTYLED */ #define READ_ONCE(x) ({ \ __typeof(x) __var = ({ \ barrier(); \ ACCESS_ONCE(x); \ }); \ barrier(); \ __var; \ }) #define HLIST_HEAD_INIT { } #define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT #define INIT_HLIST_HEAD(head) (head)->first = NULL #define INIT_HLIST_NODE(node) \ do { \ (node)->next = NULL; \ (node)->pprev = NULL; \ } while (0) /* END CSTYLED */ static inline int atomic_read(const atomic_t *v) { return (READ_ONCE(v->counter)); } static inline int atomic_inc(atomic_t *v) { return (atomic_fetchadd_int(&v->counter, 1) + 1); } static inline int atomic_dec(atomic_t *v) { return (atomic_fetchadd_int(&v->counter, -1) - 1); } - +#endif #else typedef long loff_t; typedef long rlim64_t; typedef int bool_t; typedef int enum_t; #ifndef __cplusplus #define __init #endif #define __exit #define FALSE 0 #define TRUE 1 /* * XXX We really need to consolidate on standard * error codes in the common code */ #define ENOSTR ENOTCONN #define ENODATA EINVAL #define __XSI_VISIBLE 1000 #define __BSD_VISIBLE 1 #define __POSIX_VISIBLE 201808 #define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) #define open64 open #define pwrite64 pwrite #define ftruncate64 ftruncate #define lseek64 lseek #define pread64 pread #define stat64 stat #define lstat64 lstat #define statfs64 statfs #define readdir64 readdir #define dirent64 dirent #define P2ALIGN(x, align) ((x) & -(align)) #define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) #define P2PHASE(x, align) ((x) & ((align) - 1)) #define P2NPHASE(x, align) (-(x) & ((align) - 1)) #define ISP2(x) (((x) & ((x) - 1)) == 0) #define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) #define P2BOUNDARY(off, len, align) \ (((off) ^ ((off) + (len) - 1)) > (align) - 1) /* * Typed version of the P2* macros. These macros should be used to ensure * that the result is correctly calculated based on the data type of (x), * which is passed in as the last argument, regardless of the data * type of the alignment. For example, if (x) is of type uint64_t, * and we want to round it up to a page boundary using "PAGESIZE" as * the alignment, we can do either * * P2ROUNDUP(x, (uint64_t)PAGESIZE) * or * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) */ #define P2ALIGN_TYPED(x, align, type) \ ((type)(x) & -(type)(align)) #define P2PHASE_TYPED(x, align, type) \ ((type)(x) & ((type)(align) - 1)) #define P2NPHASE_TYPED(x, align, type) \ (-(type)(x) & ((type)(align) - 1)) #define P2ROUNDUP_TYPED(x, align, type) \ ((((type)(x) - 1) | ((type)(align) - 1)) + 1) #define P2END_TYPED(x, align, type) \ (-(~(type)(x) & -(type)(align))) #define P2PHASEUP_TYPED(x, align, phase, type) \ ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) #define P2CROSS_TYPED(x, y, align, type) \ (((type)(x) ^ (type)(y)) > (type)(align) - 1) #define P2SAMEHIGHBIT_TYPED(x, y, type) \ (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) #define RLIM64_INFINITY RLIM_INFINITY #define ERESTART EAGAIN #define ABS(a) ((a) < 0 ? 
-(a) : (a)) #endif #ifdef __cplusplus } #endif #endif /* _SYS_CCOMPILE_H */ diff --git a/include/os/freebsd/spl/sys/condvar.h b/include/os/freebsd/spl/sys/condvar.h index 4375bd6a3154..2e99a1a34e18 100644 --- a/include/os/freebsd/spl/sys/condvar.h +++ b/include/os/freebsd/spl/sys/condvar.h @@ -1,205 +1,204 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * Copyright (c) 2013 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_CONDVAR_H_ #define _OPENSOLARIS_SYS_CONDVAR_H_ #include #include #include #include -#include /* * cv_timedwait() is similar to cv_wait() except that it additionally expects * a timeout value specified in ticks. When woken by cv_signal() or * cv_broadcast() it returns 1, otherwise when the timeout is reached -1 is * returned. * * cv_timedwait_sig() behaves the same as cv_timedwait() but blocks * interruptibly and can be woken by a signal (EINTR, ERESTART). When * this occurs 0 is returned. * * cv_timedwait_io() and cv_timedwait_sig_io() are variants of cv_timedwait() * and cv_timedwait_sig() which should be used when waiting for outstanding * IO to complete. They are responsible for updating the iowait accounting * when this is supported by the platform. * * cv_timedwait_hires() and cv_timedwait_sig_hires() are high resolution * versions of cv_timedwait() and cv_timedwait_sig(). They expect the timeout * to be specified as a hrtime_t allowing for timeouts of less than a tick. * * N.B. The return values differ slightly from the illumos implementation * which returns the time remaining, instead of 1, when woken. They both * return -1 on timeout. Consumers which need to know the time remaining * are responsible for tracking it themselves. 
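Given the three-way return convention documented above (1 = woken, -1 = timeout, 0 = signal, the last for the _sig variants only), a typical consumer looks like the sketch below; cv, mtx, and pred are hypothetical. Note that cv_timedwait() in this port takes an absolute lbolt-based deadline, since the implementation subtracts ddi_get_lbolt() itself:

static int
wait_for_work(kcondvar_t *cv, kmutex_t *mtx, boolean_t *pred)
{
	int rc;

	mutex_enter(mtx);
	while (!*pred) {
		/* Deadline one second from now, in ticks. */
		rc = cv_timedwait_sig(cv, mtx, ddi_get_lbolt() + hz);
		if (rc == 0) {
			/* Interrupted by a signal. */
			mutex_exit(mtx);
			return (EINTR);
		}
		/* rc == 1 (woken) or -1 (timeout): recheck the predicate. */
	}
	mutex_exit(mtx);
	return (0);
}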
*/ static __inline sbintime_t zfs_nstosbt(int64_t _ns) { sbintime_t sb = 0; #ifdef KASSERT KASSERT(_ns >= 0, ("Negative values illegal for nstosbt: %jd", _ns)); #endif if (_ns >= SBT_1S) { sb = (_ns / 1000000000) * SBT_1S; _ns = _ns % 1000000000; } /* 9223372037 = ceil(2^63 / 1000000000) */ sb += ((_ns * 9223372037ull) + 0x7fffffff) >> 31; return (sb); } typedef struct cv kcondvar_t; #define CALLOUT_FLAG_ABSOLUTE C_ABSOLUTE typedef enum { CV_DEFAULT, CV_DRIVER } kcv_type_t; #define zfs_cv_init(cv, name, type, arg) do { \ const char *_name; \ ASSERT((type) == CV_DEFAULT); \ for (_name = #cv; *_name != '\0'; _name++) { \ if (*_name >= 'a' && *_name <= 'z') \ break; \ } \ if (*_name == '\0') \ _name = #cv; \ cv_init((cv), _name); \ } while (0) #define cv_init(cv, name, type, arg) zfs_cv_init(cv, name, type, arg) static inline int cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) { return (_cv_wait_sig(cvp, &(mp)->lock_object) == 0); } static inline int cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t timo) { int rc; timo -= ddi_get_lbolt(); if (timo <= 0) return (-1); rc = _cv_timedwait_sbt((cvp), &(mp)->lock_object, \ tick_sbt * (timo), 0, C_HARDCLOCK); if (rc == EWOULDBLOCK) return (-1); return (1); } static inline int cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t timo) { int rc; timo -= ddi_get_lbolt(); if (timo <= 0) return (-1); rc = _cv_timedwait_sig_sbt(cvp, &(mp)->lock_object, \ tick_sbt * (timo), 0, C_HARDCLOCK); if (rc == EWOULDBLOCK) return (-1); if (rc == EINTR || rc == ERESTART) return (0); return (1); } #define cv_timedwait_io cv_timedwait #define cv_timedwait_sig_io cv_timedwait_sig static inline int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { hrtime_t hrtime; int rc; ASSERT(tim >= res); hrtime = gethrtime(); if (flag == 0) tim += hrtime; if (hrtime >= tim) return (-1); rc = cv_timedwait_sbt(cvp, mp, zfs_nstosbt(tim), zfs_nstosbt(res), C_ABSOLUTE); if (rc == EWOULDBLOCK) return (-1); KASSERT(rc == 0, ("unexpected rc value %d", rc)); return (1); } static inline int cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag) { sbintime_t sbt; hrtime_t hrtime; int rc; ASSERT(tim >= res); hrtime = gethrtime(); if (flag == 0) tim += hrtime; if (hrtime >= tim) return (-1); sbt = zfs_nstosbt(tim); rc = cv_timedwait_sig_sbt(cvp, mp, sbt, zfs_nstosbt(res), C_ABSOLUTE); switch (rc) { case EWOULDBLOCK: return (-1); case EINTR: case ERESTART: return (0); default: KASSERT(rc == 0, ("unexpected rc value %d", rc)); return (1); } } #endif /* _OPENSOLARIS_SYS_CONDVAR_H_ */ diff --git a/include/os/freebsd/spl/sys/file.h b/include/os/freebsd/spl/sys/file.h index 10a82c204859..0d0be44f8a64 100644 --- a/include/os/freebsd/spl/sys/file.h +++ b/include/os/freebsd/spl/sys/file.h @@ -1,50 +1,52 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
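The magic constant in zfs_nstosbt() above is worth checking: sbintime_t is a 32.32 fixed-point count of seconds, so the sub-second remainder must be scaled by 2^32 / 10^9. Multiplying by 9223372037 (= ceil(2^63 / 10^9)) and shifting right by 31 computes the same value without overflowing 64 bits, rounding up. A standalone userland check, assuming gcc or clang for the __int128 reference path:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ns = 999999999;	/* worst-case sub-second value */
	uint64_t fast = ((ns * 9223372037ull) + 0x7fffffff) >> 31;
	uint64_t ref = (uint64_t)(((unsigned __int128)ns << 32) / 1000000000);

	printf("fast=%ju ref=%ju\n", (uintmax_t)fast, (uintmax_t)ref);
	return (fast < ref);	/* fast path rounds up, never below ref */
}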
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_FILE_H_ #define _OPENSOLARIS_SYS_FILE_H_ +#include +#include_next #include_next #define FKIOCTL 0x80000000 /* ioctl addresses are from kernel */ typedef struct file file_t; #include static __inline file_t * getf_caps(int fd, cap_rights_t *rightsp) { struct file *fp; if (fget(curthread, fd, rightsp, &fp) == 0) return (fp); return (NULL); } #endif /* !_OPENSOLARIS_SYS_FILE_H_ */ diff --git a/include/os/freebsd/spl/sys/mount.h b/include/os/freebsd/spl/sys/mount.h index 4732d283bb9a..42614e4739f1 100644 --- a/include/os/freebsd/spl/sys/mount.h +++ b/include/os/freebsd/spl/sys/mount.h @@ -1,41 +1,42 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_MOUNT_H_ #define _OPENSOLARIS_SYS_MOUNT_H_ #include #include_next +#ifdef BUILDING_ZFS #include - +#endif #define MS_FORCE MNT_FORCE #define MS_REMOUNT MNT_UPDATE typedef struct fid fid_t; #endif /* !_OPENSOLARIS_SYS_MOUNT_H_ */ diff --git a/include/os/freebsd/spl/sys/sunddi.h b/include/os/freebsd/spl/sys/sunddi.h index bb76cd9641c7..41d0f4512977 100644 --- a/include/os/freebsd/spl/sys/sunddi.h +++ b/include/os/freebsd/spl/sys/sunddi.h @@ -1,67 +1,69 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. 
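Returning to getf_caps() in file.h above: it pairs with FreeBSD's capability rights API, so a caller holds a file with only the rights the subsequent operation needs. A hedged sketch; with_readable_fd() is hypothetical, while cap_rights_init(), CAP_READ, and fdrop() are standard FreeBSD KPI:

static int
with_readable_fd(int fd)
{
	cap_rights_t rights;
	file_t *fp;

	fp = getf_caps(fd, cap_rights_init(&rights, CAP_READ));
	if (fp == NULL)
		return (EBADF);
	/* ... operate on fp ... */
	fdrop(fp, curthread);	/* drop the hold fget() took */
	return (0);
}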
* For details, see . * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_SUNDDI_H #define _SPL_SUNDDI_H #include #include #include #include +#ifdef BUILDING_ZFS #include +#endif typedef int ddi_devid_t; #define DDI_DEV_T_NONE ((dev_t)-1) #define DDI_DEV_T_ANY ((dev_t)-2) #define DI_MAJOR_T_UNKNOWN ((major_t)0) #define DDI_PROP_DONTPASS 0x0001 #define DDI_PROP_CANSLEEP 0x0002 #define DDI_SUCCESS 0 #define DDI_FAILURE -1 #define ddi_prop_lookup_string(x1, x2, x3, x4, x5) (*x5 = NULL) #define ddi_prop_free(x) (void)0 #define ddi_root_node() (void)0 extern int ddi_strtoul(const char *, char **, int, unsigned long *); extern int ddi_strtol(const char *, char **, int, long *); extern int ddi_strtoull(const char *, char **, int, unsigned long long *); extern int ddi_strtoll(const char *, char **, int, long long *); extern int ddi_copyin(const void *from, void *to, size_t len, int flags); extern int ddi_copyout(const void *from, void *to, size_t len, int flags); extern void ddi_sysevent_init(void); int ddi_soft_state_init(void **statep, size_t size, size_t nitems); void ddi_soft_state_fini(void **statep); void *ddi_get_soft_state(void *state, int item); int ddi_soft_state_zalloc(void *state, int item); void ddi_soft_state_free(void *state, int item); #endif /* SPL_SUNDDI_H */ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index a46b92051340..7c83113ac86a 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -1,347 +1,348 @@ .if !defined(WITH_CTF) WITH_CTF=1 .endif .include SRCDIR=${.CURDIR} INCDIR=${.CURDIR:H}/include KMOD= openzfs .PATH: ${SRCDIR}/avl \ ${SRCDIR}/lua \ ${SRCDIR}/nvpair \ ${SRCDIR}/os/freebsd/spl \ ${SRCDIR}/os/freebsd/zfs \ ${SRCDIR}/unicode \ ${SRCDIR}/zcommon \ ${SRCDIR}/zfs CFLAGS+= -I${.OBJDIR:H}/include CFLAGS+= -I${INCDIR} CFLAGS+= -I${INCDIR}/spl CFLAGS+= -I${INCDIR}/os/freebsd CFLAGS+= -I${INCDIR}/os/freebsd/spl CFLAGS+= -I${INCDIR}/os/freebsd/zfs CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 CFLAGS+= -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ CFLAGS+= -D_SYS_VMEM_H_ -D_MACHINE_ENDIAN_H_ -DKDTRACE_HOOKS -DSMP .if ${MACHINE_ARCH} == "amd64" CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F .endif .if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true" CFLAGS+= -DINVARIANTS -DWITNESS -g -O0 -DZFS_DEBUG -DOPENSOLARIS_WITNESS .else CFLAGS += -DNDEBUG .endif .if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true" # kernel must also be built with this option for this to work CFLAGS+= -DDEBUG_VFS_LOCKS .endif .if defined(WITH_GCOV) && ${WITH_GCOV} == "true" CFLAGS+= -fprofile-arcs -ftest-coverage .endif DEBUG_FLAGS=-g .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ ${MACHINE_ARCH} == "arm" CFLAGS+= -DBITS_PER_LONG=32 .else CFLAGS+= -DBITS_PER_LONG=64 .endif SRCS= vnode_if.h device_if.h bus_if.h # avl SRCS+= avl.c #lua SRCS+= lapi.c \ lauxlib.c \ lbaselib.c \ lcode.c \ 
lcompat.c \ lcorolib.c \ lctype.c \ ldebug.c \ ldo.c \ lfunc.c \ lgc.c \ llex.c \ lmem.c \ lobject.c \ lopcodes.c \ lparser.c \ lstate.c \ lstring.c \ lstrlib.c \ ltable.c \ ltablib.c \ ltm.c \ lvm.c \ lzio.c #nvpair SRCS+= nvpair.c \ fnvpair.c \ nvpair_alloc_spl.c \ nvpair_alloc_fixed.c #os/freebsd/spl SRCS+= acl_common.c \ btree.c \ callb.c \ list.c \ spl_acl.c \ spl_cmn_err.c \ spl_dtrace.c \ spl_kmem.c \ spl_kstat.c \ spl_misc.c \ spl_policy.c \ spl_string.c \ spl_sunddi.c \ spl_sysevent.c \ spl_taskq.c \ spl_uio.c \ spl_vfs.c \ spl_vm.c \ spl_zone.c \ sha256c.c \ sha512c.c \ spl_procfs_list.c \ spl_zlib.c .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ ${MACHINE_ARCH} == "arm" SRCS+= spl_atomic.c .endif #os/freebsd/zfs SRCS+= abd_os.c \ crypto_os.c \ dmu_os.c \ hkdf.c \ kmod_core.c \ spa_os.c \ sysctl_os.c \ vdev_file.c \ vdev_label_os.c \ vdev_geom.c \ zfs_acl.c \ zfs_ctldir.c \ zfs_dir.c \ zfs_ioctl_os.c \ zfs_log.c \ zfs_replay.c \ zfs_vfsops.c \ zfs_vnops.c \ zfs_znode.c \ zio_crypt.c \ zvol_os.c #unicode SRCS+= uconv.c \ u8_textprep.c #zcommon SRCS+= zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ zfs_fletcher.c \ zfs_fletcher_avx512.c \ zfs_fletcher_intel.c \ zfs_fletcher_sse.c \ zfs_fletcher_superscalar.c \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ zpool_prop.c \ zprop_common.c #zfs SRCS+= abd.c \ aggsum.c \ arc.c \ arc_os.c \ blkptr.c \ bplist.c \ bpobj.c \ cityhash.c \ dbuf.c \ dbuf_stats.c \ bptree.c \ bqueue.c \ dataset_kstats.c \ ddt.c \ ddt_zap.c \ dmu.c \ dmu_diff.c \ dmu_object.c \ dmu_objset.c \ dmu_recv.c \ dmu_redact.c \ dmu_send.c \ dmu_traverse.c \ dmu_tx.c \ dmu_zfetch.c \ dnode.c \ dnode_sync.c \ dsl_dataset.c \ dsl_deadlist.c \ dsl_deleg.c \ dsl_bookmark.c \ dsl_dir.c \ dsl_crypt.c \ dsl_destroy.c \ dsl_pool.c \ dsl_prop.c \ dsl_scan.c \ dsl_synctask.c \ dsl_userhold.c \ fm.c \ gzip.c \ lzjb.c \ lz4.c \ metaslab.c \ mmp.c \ multilist.c \ objlist.c \ pathname.c \ range_tree.c \ refcount.c \ rrwlock.c \ sa.c \ sha256.c \ skein_zfs.c \ spa.c \ spa_boot.c \ spa_checkpoint.c \ spa_config.c \ spa_errlog.c \ spa_history.c \ spa_log_spacemap.c \ spa_misc.c \ spa_stats.c \ space_map.c \ space_reftree.c \ txg.c \ uberblock.c \ unique.c \ vdev.c \ vdev_cache.c \ vdev_indirect.c \ vdev_indirect_births.c \ vdev_indirect_mapping.c \ vdev_initialize.c \ vdev_label.c \ vdev_mirror.c \ vdev_missing.c \ vdev_queue.c \ vdev_raidz.c \ vdev_raidz_math.c \ vdev_raidz_math_scalar.c \ vdev_raidz_math_avx2.c \ vdev_raidz_math_avx512bw.c \ vdev_raidz_math_avx512f.c \ vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ vdev_removal.c \ vdev_root.c \ vdev_trim.c \ zap.c \ zap_leaf.c \ zap_micro.c \ zcp.c \ zcp_get.c \ zcp_global.c \ zcp_iter.c \ zcp_set.c \ zcp_synctask.c \ zfeature.c \ zfs_byteswap.c \ zfs_debug.c \ zfs_file_os.c \ zfs_fm.c \ zfs_fuid.c \ zfs_fuid_os.c \ zfs_ioctl.c \ zfs_onexit.c \ zfs_quota.c \ zfs_ratelimit.c \ zfs_rlock.c \ zfs_sa.c \ zil.c \ zio.c \ zio_checksum.c \ zio_compress.c \ zio_inject.c \ zle.c \ zrlock.c \ zthr.c \ zvol.c beforeinstall: .if ${MK_DEBUG_FILES} != "no" mtree -eu \ -f /etc/mtree/BSD.debug.dist \ -p ${DESTDIR}/usr/lib .endif .include CFLAGS.gcc+= -Wno-pointer-to-int-cast CFLAGS.lapi.c= -Wno-cast-qual CFLAGS.lcompat.c= -Wno-cast-qual CFLAGS.lobject.c= -Wno-cast-qual CFLAGS.ltable.c= -Wno-cast-qual CFLAGS.lvm.c= -Wno-cast-qual CFLAGS.nvpair.c= -Wno-cast-qual CFLAGS.spl_string.c= -Wno-cast-qual CFLAGS.spl_vm.c= -Wno-cast-qual CFLAGS.spl_zlib.c= -Wno-cast-qual CFLAGS.abd.c= -Wno-cast-qual CFLAGS.zfs_log.c= 
-Wno-cast-qual CFLAGS.zfs_vnops.c= -Wno-pointer-arith CFLAGS.u8_textprep.c= -Wno-cast-qual CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_sse.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zprop_common.c= -Wno-cast-qual CFLAGS.ddt.c= -Wno-cast-qual CFLAGS.dmu.c= -Wno-cast-qual CFLAGS.dmu_traverse.c= -Wno-cast-qual CFLAGS.dsl_dir.c= -Wno-cast-qual CFLAGS.dsl_deadlist.c= -Wno-cast-qual CFLAGS.dsl_prop.c= -Wno-cast-qual CFLAGS.fm.c= -Wno-cast-qual CFLAGS.lz4.c= -Wno-cast-qual CFLAGS.spa.c= -Wno-cast-qual CFLAGS.spa_misc.c= -Wno-cast-qual +CFLAGS.sysctl_os.c= -include ../zfs_config.h CFLAGS.vdev_raidz.c= -Wno-cast-qual CFLAGS.vdev_raidz_math.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier CFLAGS.zap_leaf.c= -Wno-cast-qual CFLAGS.zap_micro.c= -Wno-cast-qual CFLAGS.zcp.c= -Wno-cast-qual CFLAGS.zfs_fm.c= -Wno-cast-qual CFLAGS.zfs_ioctl.c= -Wno-cast-qual CFLAGS.zil.c= -Wno-cast-qual CFLAGS.zio.c= -Wno-cast-qual CFLAGS.zrlock.c= -Wno-cast-qual diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c index ab4702574b08..8d33916d85d1 100644 --- a/module/os/freebsd/spl/spl_misc.c +++ b/module/os/freebsd/spl/spl_misc.c @@ -1,107 +1,112 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include -char hw_serial[11] = "0"; - static struct opensolaris_utsname hw_utsname = { .machine = MACHINE }; +#ifndef KERNEL_STATIC +char hw_serial[11] = "0"; + +utsname_t * +utsname(void) +{ + return (&hw_utsname); +} +#endif + static void opensolaris_utsname_init(void *arg) { hw_utsname.sysname = ostype; hw_utsname.nodename = prison0.pr_hostname; hw_utsname.release = osrelease; snprintf(hw_utsname.version, sizeof (hw_utsname.version), "%d", osreldate); } char * kmem_strdup(const char *s) { char *buf; buf = kmem_alloc(strlen(s) + 1, KM_SLEEP); strcpy(buf, s); return (buf); } int ddi_copyin(const void *from, void *to, size_t len, int flags) { /* Fake ioctl() issued by kernel, 'from' is a kernel address */ if (flags & FKIOCTL) { memcpy(to, from, len); return (0); } return (copyin(from, to, len)); } int ddi_copyout(const void *from, void *to, size_t len, int flags) { /* Fake ioctl() issued by kernel, 'from' is a kernel address */ if (flags & FKIOCTL) { memcpy(to, from, len); return (0); } return (copyout(from, to, len)); } int spl_panic(const char *file, const char *func, int line, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); va_end(ap); } -utsname_t * -utsname(void) -{ - return (&hw_utsname); -} + SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, opensolaris_utsname_init, NULL); diff --git a/module/os/freebsd/spl/spl_sysevent.c b/module/os/freebsd/spl/spl_sysevent.c index d3748276a71c..4bb0658c8797 100644 --- a/module/os/freebsd/spl/spl_sysevent.c +++ b/module/os/freebsd/spl/spl_sysevent.c @@ -1,259 +1,260 @@ /* * Copyright (c) 2010 Pawel Jakub Dawidek * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
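The FKIOCTL checks in ddi_copyin()/ddi_copyout() above are what allow kernel-originated ioctls to pass kernel addresses down a path that otherwise expects user addresses. A sketch of the distinction; zfsdev_get_cmd() is a hypothetical wrapper, zfs_cmd_t is the real ioctl argument type:

static int
zfsdev_get_cmd(void *addr, zfs_cmd_t *zc, int flag)
{
	/*
	 * flag carries FKIOCTL when the ioctl was issued from inside
	 * the kernel; addr is then a kernel pointer and ddi_copyin()
	 * degrades to memcpy() instead of copyin().
	 */
	return (ddi_copyin(addr, zc, sizeof (*zc), flag));
}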
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #include #include #include #include #include #include static int log_sysevent(nvlist_t *event) { struct sbuf *sb; const char *type; char typestr[128]; nvpair_t *elem = NULL; sb = sbuf_new_auto(); if (sb == NULL) return (ENOMEM); type = NULL; while ((elem = nvlist_next_nvpair(event, elem)) != NULL) { switch (nvpair_type(elem)) { case DATA_TYPE_BOOLEAN: { boolean_t value; (void) nvpair_value_boolean_value(elem, &value); sbuf_printf(sb, " %s=%s", nvpair_name(elem), value ? "true" : "false"); break; } case DATA_TYPE_UINT8: { uint8_t value; (void) nvpair_value_uint8(elem, &value); sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value); break; } case DATA_TYPE_INT32: { int32_t value; (void) nvpair_value_int32(elem, &value); sbuf_printf(sb, " %s=%jd", nvpair_name(elem), (intmax_t)value); break; } case DATA_TYPE_UINT32: { uint32_t value; (void) nvpair_value_uint32(elem, &value); sbuf_printf(sb, " %s=%ju", nvpair_name(elem), (uintmax_t)value); break; } case DATA_TYPE_INT64: { int64_t value; (void) nvpair_value_int64(elem, &value); sbuf_printf(sb, " %s=%jd", nvpair_name(elem), (intmax_t)value); break; } case DATA_TYPE_UINT64: { uint64_t value; (void) nvpair_value_uint64(elem, &value); sbuf_printf(sb, " %s=%ju", nvpair_name(elem), (uintmax_t)value); break; } case DATA_TYPE_STRING: { char *value; (void) nvpair_value_string(elem, &value); sbuf_printf(sb, " %s=%s", nvpair_name(elem), value); if (strcmp(FM_CLASS, nvpair_name(elem)) == 0) type = value; break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *value; uint_t ii, nelem; (void) nvpair_value_uint8_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%02hhx", value[ii]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *value; uint_t ii, nelem; (void) nvpair_value_uint16_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%04hx", value[ii]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *value; uint_t ii, nelem; (void) nvpair_value_uint32_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *value; uint_t ii, nelem; (void) nvpair_value_int64_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%016lld", (long long)value[ii]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *value; uint_t ii, nelem; (void) nvpair_value_uint64_array(elem, &value, &nelem); sbuf_printf(sb, " %s=", nvpair_name(elem)); for (ii = 0; ii < nelem; ii++) sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]); break; } case DATA_TYPE_STRING_ARRAY: { char **strarr; uint_t ii, nelem; (void) nvpair_value_string_array(elem, &strarr, &nelem); for (ii = 0; ii < nelem; ii++) { if (strarr[ii] == NULL) { sbuf_printf(sb, " "); continue; } sbuf_printf(sb, " %s", strarr[ii]); if (strcmp(FM_CLASS, strarr[ii]) == 0) type = strarr[ii]; } break; } case DATA_TYPE_NVLIST: /* XXX - requires recursing in log_sysevent */ break; default: printf("%s: type %d is not implemented\n", __func__, nvpair_type(elem)); break; } } if (sbuf_finish(sb) != 0) { sbuf_delete(sb); return (ENOMEM); } if (type == NULL) type = ""; if (strncmp(type, "ESC_ZFS_", 8) == 0) { snprintf(typestr, sizeof (typestr), "misc.fs.zfs.%s", type + 8); type = typestr; } devctl_notify("ZFS", "ZFS", type, 
sbuf_data(sb)); sbuf_delete(sb); return (0); } static void sysevent_worker(void *arg __unused) { zfs_zevent_t *ze; nvlist_t *event; uint64_t dropped = 0; uint64_t dst_size; int error; zfs_zevent_init(&ze); for (;;) { dst_size = 131072; dropped = 0; event = NULL; error = zfs_zevent_next(ze, &event, &dst_size, &dropped); if (error) { error = zfs_zevent_wait(ze); if (error == ESHUTDOWN) break; } else { VERIFY(event != NULL); log_sysevent(event); nvlist_free(event); } } zfs_zevent_destroy(ze); kthread_exit(); } void ddi_sysevent_init(void) { kproc_kthread_add(sysevent_worker, NULL, &zfsproc, NULL, 0, 0, "zfskern", "sysevent"); } diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 23b580c437a0..87e517b58b9a 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -1,252 +1,253 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include extern struct vfsops zfs_vfsops; uint_t zfs_arc_free_target = 0; int64_t last_free_memory; free_memory_reason_t last_free_reason; static void arc_free_target_init(void *unused __unused) { zfs_arc_free_target = vm_cnt.v_free_target; } SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, arc_free_target_init, NULL); /* * We don't have a tunable for arc_free_target due to the dependency on * pagedaemon initialisation. */ static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) { uint_t val; int err; val = zfs_arc_free_target; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < minfree) return (EINVAL); if (val > vm_cnt.v_page_count) return (EINVAL); zfs_arc_free_target = val; return (0); } SYSCTL_DECL(_vfs_zfs); /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint_t), sysctl_vfs_zfs_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim"); /* END CSTYLED */ int64_t arc_available_memory(void) { int64_t lowest = INT64_MAX; int64_t n __unused; free_memory_reason_t r = FMR_UNKNOWN; /* * Cooperate with pagedaemon when it's time for it to scan * and reclaim some pages. */ n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); if (n < lowest) { lowest = n; r = FMR_LOTSFREE; } #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical * memory. 
Most checks of the size of the heap_area compare against * tune.t_minarmem, which is the minimum available real memory that we * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ n = uma_avail() - (long)(uma_limit() / 4); if (n < lowest) { lowest = n; r = FMR_HEAP_ARENA; } #endif last_free_memory = lowest; last_free_reason = r; DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); return (lowest); } /* * Return a default max arc size based on the amount of physical memory. */ uint64_t arc_default_max(uint64_t min, uint64_t allmem) { uint64_t size; if (allmem >= 1 << 30) size = allmem - (1 << 30); else size = min; return (MAX(allmem * 5 / 8, size)); } /* * Helper function for arc_prune_async(); it is responsible for safely * handling the execution of a registered arc_prune_func_t. */ static void arc_prune_task(void *arg) { int64_t nr_scan = *(int64_t *)arg; arc_reduce_target_size(ptob(nr_scan)); free(arg, M_TEMP); vnlru_free(nr_scan, &zfs_vfsops); } /* * Notify registered consumers they must drop holds on a portion of the ARC * buffers they reference. This provides a mechanism to ensure the ARC can * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This * is analogous to dnlc_reduce_cache() but more generic. * * This operation is performed asynchronously so it may be safely called * in the context of the arc_reclaim_thread(). A reference is taken here * for each registered arc_prune_t and the arc_prune_task() is responsible * for releasing it once the registered arc_prune_func_t has completed. */ void arc_prune_async(int64_t adjust) { int64_t *adjustptr; if ((adjustptr = malloc(sizeof (int64_t), M_TEMP, M_NOWAIT)) == NULL) return; *adjustptr = adjust; taskq_dispatch(arc_prune_taskq, arc_prune_task, adjustptr, TQ_SLEEP); ARCSTAT_BUMP(arcstat_prune); } uint64_t arc_all_memory(void) { return ((uint64_t)ptob(physmem)); } int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) { return (0); } uint64_t arc_free_memory(void) { /* XXX */ return (0); } static eventhandler_tag arc_event_lowmem = NULL; static void arc_lowmem(void *arg __unused, int howto __unused) { int64_t free_memory, to_free; arc_no_grow = B_TRUE; arc_warm = B_TRUE; arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); free_memory = arc_available_memory(); to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0); DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); arc_reduce_target_size(to_free); mutex_enter(&arc_adjust_lock); arc_adjust_needed = B_TRUE; zthr_wakeup(arc_adjust_zthr); /* * It is unsafe to block here in arbitrary threads, because we can come * here from ARC itself and may hold ARC locks and thus risk a deadlock * with ARC reclaim thread.
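The arc_default_max() policy above reduces to: all of memory minus 1 GiB, but never less than 5/8 of memory (and the caller-supplied minimum on sub-1-GiB machines). A standalone userland check of the arithmetic; for example a 4 GiB machine gets a 3 GiB default, since max(2.5 GiB, 3 GiB) = 3 GiB:

#include <stdint.h>
#include <stdio.h>

#define GiB (1ULL << 30)

static uint64_t
default_max(uint64_t min, uint64_t allmem)
{
	uint64_t size = (allmem >= GiB) ? allmem - GiB : min;

	return (size > allmem * 5 / 8 ? size : allmem * 5 / 8);
}

int
main(void)
{
	/* min (64 MiB here) is arbitrary; it only matters below 1 GiB. */
	printf("%ju\n", (uintmax_t)default_max(64ULL << 20, 4 * GiB));
	return (0);
}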
*/ if (curproc == pageproc) (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock); mutex_exit(&arc_adjust_lock); } void arc_lowmem_init(void) { arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, EVENTHANDLER_PRI_FIRST); } void arc_lowmem_fini(void) { if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); } diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 6d5c88e21c3c..54b62f248553 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -1,708 +1,708 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
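arc_lowmem_init()/arc_lowmem_fini() above follow the standard eventhandler(9) lifecycle for vm_lowmem consumers; the same shape works for any cache that wants memory-pressure callbacks. A sketch with hypothetical my_cache_* names:

static eventhandler_tag my_lowmem_tag;

static void
my_cache_lowmem(void *arg __unused, int howto __unused)
{
	/* Shed cached data; avoid blocking on our own reclaim path. */
}

static void
my_cache_init(void)
{
	my_lowmem_tag = EVENTHANDLER_REGISTER(vm_lowmem, my_cache_lowmem,
	    NULL, EVENTHANDLER_PRI_FIRST);
}

static void
my_cache_fini(void)
{
	if (my_lowmem_tag != NULL)
		EVENTHANDLER_DEREGISTER(vm_lowmem, my_lowmem_tag);
}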
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include <../zfs_config.h> /* BEGIN CSTYLED */ SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "ZFS multihost protection"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS livelist condense"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, "ZFS VDEV mirror"); +#ifdef ZFS_META_VERSION SYSCTL_DECL(_vfs_zfs_version); SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version"); - +#endif extern arc_state_t ARC_anon; extern arc_state_t ARC_mru; extern arc_state_t ARC_mru_ghost; extern arc_state_t ARC_mfu; extern arc_state_t ARC_mfu_ghost; extern arc_state_t ARC_l2c_only; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ /* arc.c */ /* legacy compat */ extern unsigned long l2arc_write_max; /* def max write size */ extern unsigned long l2arc_write_boost; /* extra warmup write */ extern unsigned long l2arc_headroom; /* # of dev writes */ extern unsigned long l2arc_headroom_boost; extern unsigned long l2arc_feed_secs; /* interval seconds */ extern unsigned long l2arc_feed_min_ms; /* min interval msecs */ extern int l2arc_noprefetch; /* don't cache prefetch bufs */ extern int l2arc_feed_again; /* turbo warmup */ extern int 
l2arc_norw; /* no reads during writes */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, &l2arc_write_max, 0, "max write size (LEGACY)"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, &l2arc_write_boost, 0, "extra write during warmup (LEGACY)"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, &l2arc_headroom, 0, "number of dev writes (LEGACY)"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, &l2arc_feed_secs, 0, "interval seconds (LEGACY)"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, &l2arc_feed_min_ms, 0, "min interval milliseconds (LEGACY)"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, &l2arc_noprefetch, 0, "don't cache prefetch bufs (LEGACY)"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, &l2arc_feed_again, 0, "turbo warmup (LEGACY)"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, &l2arc_norw, 0, "no reads during writes (LEGACY)"); #if 0 extern int zfs_compressed_arc_enabled; SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RW, &zfs_compressed_arc_enabled, 1, "compressed arc buffers (LEGACY)"); #endif SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state"); extern int arc_no_grow_shift; extern int arc_shrink_shift; extern arc_stats_t arc_stats; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define arc_p ARCSTAT(arcstat_p)
/* target size of MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ #define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ #define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { uint32_t val; int err; val = arc_no_grow_shift; err = sysctl_handle_32(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val >= arc_shrink_shift) return (EINVAL); arc_no_grow_shift = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN, 0, sizeof (uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U", "log2(fraction of ARC which must be free to allow growing)"); /* dbuf.c */ /* dmu.c */ /* dmu_zfetch.c */ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); /* max bytes to prefetch per stream (default 8MB) */ extern uint32_t zfetch_max_distance; SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)"); /* max bytes of indirect blocks to prefetch per stream (default 64MB) */ extern uint32_t zfetch_max_idistance; SYSCTL_UINT(_vfs_zfs_prefetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, &zfetch_max_idistance, 0, "Max bytes of indirect blocks to prefetch per stream"); /* dsl_pool.c */ /* dnode.c */ extern int zfs_default_bs; SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, &zfs_default_bs, 0, "Default dnode block shift"); extern int zfs_default_ibs; SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, &zfs_default_ibs, 0, "Default dnode indirect block shift"); /* dsl_scan.c */ /* metaslab.c */ /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. */ extern int zfs_metaslab_sm_blksz_no_log; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_no_log, 0, "Block size for space map in pools with log space map disabled. " "Power of 2 and greater than 4096."); /* * When the log space map feature is enabled, we accumulate a lot of * changes per metaslab that are flushed once in a while so we benefit * from a bigger block size like 128K for the metaslab space maps. */ extern int zfs_metaslab_sm_blksz_with_log; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_with_log, 0, "Block size for space map in pools with log space map enabled. " "Power of 2 and greater than 4096."); /* * The in-core space map representation is more compact than its on-disk form.
* The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ extern int zfs_condense_pct; SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, &zfs_condense_pct, 0, "Condense on-disk spacemap when it is more than this many percent" " of the in-memory counterpart"); extern int zfs_remove_max_segment; SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to" " allocate when removing a device"); extern int zfs_removal_suspend_progress; SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while" " in the middle of a removal"); /* * Minimum size which forces the dynamic allocator to change * its allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using a more * aggressive strategy (i.e. search by size rather than offset). */ extern uint64_t metaslab_df_alloc_threshold; SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, &metaslab_df_alloc_threshold, 0, "Minimum size which forces the dynamic allocator to change its allocation strategy"); /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ extern int metaslab_df_free_pct; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, &metaslab_df_free_pct, 0, "The minimum free space, in percent, which must be available in a " "space map to continue allocations in a first-fit fashion"); /* * Percentage of all cpus that can be used by the metaslab taskq. */ extern int metaslab_load_pct; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, &metaslab_load_pct, 0, "Percentage of cpus that can be used by the metaslab taskq"); /* * Max number of metaslabs per group to preload.
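The two metaslab_df_* tunables above describe a single switch: stay with cheap first-fit (search by offset) while the metaslab is healthy, and fall back to best-fit (search by size) once it can no longer satisfy a threshold-sized allocation or runs low on free space. As an illustration only, not the actual allocator logic:

static boolean_t
df_keep_first_fit(uint64_t max_free_seg, int free_pct)
{
	/* Both conditions must hold to stay in first-fit mode. */
	return (max_free_seg >= metaslab_df_alloc_threshold &&
	    free_pct >= metaslab_df_free_pct);
}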
/* * Max number of metaslabs per group to preload. */ extern int metaslab_preload_limit; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, &metaslab_preload_limit, 0, "Max number of metaslabs per group to preload"); /* refcount.c */ extern int reference_tracking_enable; SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN, &reference_tracking_enable, 0, "Track reference holders to refcount_t objects, used mostly by ZFS"); /* spa.c */ extern int zfs_ccw_retry_interval; SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, &zfs_ccw_retry_interval, 0, "Configuration cache file write, retry after failure, interval (seconds)"); extern uint64_t zfs_max_missing_tvds_cachefile; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0, "Allow importing pools with missing top-level vdevs in cache file"); extern uint64_t zfs_max_missing_tvds_scan; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0, "Allow importing pools with missing top-level vdevs during scan"); /* spa_misc.c */ extern int zfs_flags; static int sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) { int err, val; val = zfs_flags; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); /* * ZFS_DEBUG_MODIFY must be enabled prior to boot so all * arc buffers in the system have the necessary additional * checksum data. However, it is safe to disable at any * time. */ if (!(zfs_flags & ZFS_DEBUG_MODIFY)) val &= ~ZFS_DEBUG_MODIFY; zfs_flags = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0, sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); int param_set_deadman_synctime(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_deadman_synctime_ms; err = sysctl_handle_long(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_synctime_ms = val; spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms)); return (0); } int param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_deadman_ziotime_ms; err = sysctl_handle_long(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_ziotime_ms = val; spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms)); return (0); } int param_set_deadman_failmode(SYSCTL_HANDLER_ARGS) { char buf[16]; int rc; if (req->newptr == NULL) strlcpy(buf, zfs_deadman_failmode, sizeof (buf)); rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (rc || req->newptr == NULL) return (rc); if (strcmp(buf, zfs_deadman_failmode) == 0) return (0); if (!strcmp(buf, "wait")) zfs_deadman_failmode = "wait"; if (!strcmp(buf, "continue")) zfs_deadman_failmode = "continue"; if (!strcmp(buf, "panic")) zfs_deadman_failmode = "panic"; return (-param_set_deadman_failmode_common(buf)); }
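/*
 * The handler above accepts exactly three values: "wait", "continue", and
 * "panic"; anything else is rejected by param_set_deadman_failmode_common().
 * Assuming the common code registers it under a deadman node (the exact
 * sysctl name comes from that registration, not from this file), usage
 * would look like:
 *
 *   # sysctl vfs.zfs.deadman.failmode=continue
 */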
/* spacemap.c */ extern int space_map_ibs; SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, &space_map_ibs, 0, "Space map indirect block shift"); /* vdev.c */ #ifdef notyet extern uint64_t zfs_max_auto_ashift; extern uint64_t zfs_min_auto_ashift; static int sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_max_auto_ashift; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val > ASHIFT_MAX || val < zfs_min_auto_ashift) return (EINVAL); zfs_max_auto_ashift = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t), sysctl_vfs_zfs_max_auto_ashift, "QU", "Max ashift used when optimising for logical -> physical sector size on " "new top-level vdevs."); static int sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_min_auto_ashift; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < ASHIFT_MIN || val > zfs_max_auto_ashift) return (EINVAL); zfs_min_auto_ashift = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t), sysctl_vfs_zfs_min_auto_ashift, "QU", "Min ashift used when creating new top-level vdevs."); #endif /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ extern int zfs_vdev_dtl_sm_blksz; SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_dtl_sm_blksz, 0, "Block size for DTL space map. Power of 2 and greater than 4096."); /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. */ extern int zfs_vdev_standard_sm_blksz; SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 and greater than 4096."); extern int vdev_validate_skip; SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN, &vdev_validate_skip, 0, "Enable to bypass vdev_validate()."); /* vdev_cache.c */ /* vdev_mirror.c */ /* * The load configuration settings below are tuned by default for * the case where all devices are of the same rotational type. * * If there is a mixture of rotating and non-rotating media, setting * non_rotating_seek_inc to 0 may well provide better results as it * will direct more reads to the non-rotating vdevs which are more * likely to deliver higher performance. */ /* vdev_queue.c */ #define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ extern uint32_t zfs_vdev_ ## name ## _min_active; \ SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\ &zfs_vdev_ ## name ## _min_active, 0, \ "Initial number of I/O requests of type " #name \ " active for each device"); #define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ extern uint32_t zfs_vdev_ ## name ## _max_active; \ SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN, \ &zfs_vdev_ ## name ## _max_active, 0, \ "Maximum number of I/O requests of type " #name \ " active for each device"); extern uint32_t zfs_vdev_max_active; SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, "The maximum number of I/Os of all types active for each device. (LEGACY)");
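/*
 * For illustration, assuming the common vdev queue code defines the
 * matching variable, an invocation such as
 * ZFS_VDEV_QUEUE_KNOB_MIN(sync_read) would expand to:
 *
 *   extern uint32_t zfs_vdev_sync_read_min_active;
 *   SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, sync_read_min_active,
 *       CTLFLAG_RWTUN, &zfs_vdev_sync_read_min_active, 0,
 *       "Initial number of I/O requests of type sync_read"
 *       " active for each device");
 *
 * i.e. a vfs.zfs.vdev.sync_read_min_active sysctl backed by the common
 * tunable.
 */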
(LEGACY)"); extern int zfs_vdev_def_queue_depth; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN, &zfs_vdev_def_queue_depth, 0, "Default queue depth for each allocator"); /*extern uint64_t zfs_multihost_history; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, multihost_history, CTLFLAG_RWTUN, &zfs_multihost_history, 0, "Historical staticists for the last N multihost updates");*/ #ifdef notyet SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW, &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation"); #endif /* zio.c */ #if defined(__LP64__) int zio_use_uma = 1; #else int zio_use_uma = 0; #endif SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, "Use uma(9) for ZIO allocations"); SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); int param_set_arc_long(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_long(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_int(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_int(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); arc_tuning_update(B_TRUE); return (0); } int param_set_slop_shift(SYSCTL_HANDLER_ARGS) { int val; int err; val = *(int *)arg1; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 1 || val > 31) return (EINVAL); *(int *)arg1 = val; return (0); } int param_set_multihost_interval(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_long(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (spa_mode_global != SPA_MODE_UNINIT) mmp_signal_all_threads(); return (0); } diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c index 01851378e717..cca6bffd9214 100644 --- a/module/os/freebsd/zfs/vdev_file.c +++ b/module/os/freebsd/zfs/vdev_file.c @@ -1,326 +1,327 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #include #include +#include #include #include #include #include #include #include #include /* * Virtual device vector for files. 
*/ static taskq_t *vdev_file_taskq; void vdev_file_init(void) { vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), minclsyspri, max_ncpus, INT_MAX, 0); } void vdev_file_fini(void) { taskq_destroy(vdev_file_taskq); } static void vdev_file_hold(vdev_t *vd) { ASSERT(vd->vdev_path != NULL); } static void vdev_file_rele(vdev_t *vd) { ASSERT(vd->vdev_path != NULL); } static mode_t vdev_file_open_mode(spa_mode_t spa_mode) { mode_t mode = 0; if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { mode = O_RDWR; } else if (spa_mode & SPA_MODE_READ) { mode = O_RDONLY; } else if (spa_mode & SPA_MODE_WRITE) { mode = O_WRONLY; } return (mode | O_LARGEFILE); } static int vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift) { vdev_file_t *vf; zfs_file_t *fp; zfs_file_attr_t zfa; int error; /* * Rotational optimizations only make sense on block devices. */ vd->vdev_nonrot = B_TRUE; /* * Allow TRIM on file based vdevs. This may not always be supported, * since it depends on your kernel version and underlying filesystem * type but it is always safe to attempt. */ vd->vdev_has_trim = B_TRUE; /* * Disable secure TRIM on file based vdevs. There is no way to * request this behavior from the underlying filesystem. */ vd->vdev_has_securetrim = B_FALSE; /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if (vd->vdev_tsd != NULL) { ASSERT(vd->vdev_reopening); vf = vd->vdev_tsd; goto skip_open; } vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); /* * We always open the files from the root of the global zone, even if * we're in a local zone. If the user has gotten to this point, the * administrator has already decided that the pool should be available * to local zone users, so the underlying devices should be as well. */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); error = zfs_file_open(vd->vdev_path, vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } vf->vf_file = fp; #ifdef _KERNEL /* * Make sure it's a regular file. */ if (zfs_file_getattr(fp, &zfa)) { return (SET_ERROR(ENODEV)); } if (!S_ISREG(zfa.zfa_mode)) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (SET_ERROR(ENODEV)); } #endif skip_open: error = zfs_file_getattr(vf->vf_file, &zfa); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } *max_psize = *psize = zfa.zfa_size; *ashift = SPA_MINBLOCKSHIFT; return (0); } static void vdev_file_close(vdev_t *vd) { vdev_file_t *vf = vd->vdev_tsd; if (vd->vdev_reopening || vf == NULL) return; if (vf->vf_file != NULL) { zfs_file_close(vf->vf_file); } vd->vdev_delayed_close = B_FALSE; kmem_free(vf, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; } /* * Implements the interrupt side for file vdev types. This routine will be * called when the I/O completes allowing us to transfer the I/O to the * interrupt taskqs. For consistency, the code structure mimics disk vdev * types. 
*/ static void vdev_file_io_intr(zio_t *zio) { zio_delay_interrupt(zio); } static void vdev_file_io_strategy(void *arg) { zio_t *zio = arg; vdev_t *vd = zio->io_vd; vdev_file_t *vf; void *buf; ssize_t resid; loff_t off; ssize_t size; off = zio->io_offset; size = zio->io_size; resid = 0; vf = vd->vdev_tsd; ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); if (zio->io_type == ZIO_TYPE_READ) { buf = abd_borrow_buf(zio->io_abd, zio->io_size); zio->io_error = zfs_file_pread(vf->vf_file, buf, size, off, &resid); abd_return_buf_copy(zio->io_abd, buf, size); } else { buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); zio->io_error = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); abd_return_buf(zio->io_abd, buf, size); } if (resid != 0 && zio->io_error == 0) zio->io_error = SET_ERROR(ENOSPC); vdev_file_io_intr(zio); } static void vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC|O_DSYNC); break; default: zio->io_error = SET_ERROR(ENOTSUP); } zio_execute(zio); return; } else if (zio->io_type == ZIO_TYPE_TRIM) { #ifdef notyet int mode = 0; ASSERT3U(zio->io_size, !=, 0); /* XXX FreeBSD has no fallocate routine in file ops */ zio->io_error = zfs_file_fallocate(vf->vf_file, mode, zio->io_offset, zio->io_size); #endif zio->io_error = SET_ERROR(ENOTSUP); zio_execute(zio); return; } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); zio->io_target_timestamp = zio_handle_io_delay(zio); VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, TQ_SLEEP), !=, 0); } /* ARGSUSED */ static void vdev_file_io_done(zio_t *zio) { } vdev_ops_t vdev_file_ops = { vdev_file_open, vdev_file_close, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, NULL, NULL, vdev_file_hold, vdev_file_rele, NULL, vdev_default_xlate, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; /* * From userland we access disks just like files. */ #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { vdev_file_open, vdev_file_close, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, NULL, NULL, vdev_file_hold, vdev_file_rele, NULL, vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; #endif
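/*
 * Illustrative usage note: file vdevs are mainly intended for testing and
 * experimentation, e.g.
 *
 *   # truncate -s 1G /tmp/vdev.img
 *   # zpool create testpool /tmp/vdev.img
 *
 * The absolute-path requirement enforced in vdev_file_open() is what makes
 * the second command work, and all reads and writes are pushed through the
 * z_vdev_file taskq created in vdev_file_init().
 */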
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 69a73103c446..8462755f8283 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1,1204 +1,1205 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Portions Copyright (c) 2012 Martin Matuska */ #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #ifndef g_topology_locked #define g_topology_locked() sx_xlocked(&topology_lock) #endif /* * Virtual device vector for GEOM. */ static g_attrchanged_t vdev_geom_attrchanged; struct g_class zfs_vdev_class = { .name = "ZFS::VDEV", .version = G_VERSION, .attrchanged = vdev_geom_attrchanged, }; struct consumer_vdev_elem { SLIST_ENTRY(consumer_vdev_elem) elems; vdev_t *vd; }; SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); /* BEGIN CSTYLED */ _Static_assert(sizeof (((struct g_consumer *)NULL)->private) == sizeof (struct consumer_priv_t*), "consumer_priv_t* can't be stored in g_consumer.private"); DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); SYSCTL_DECL(_vfs_zfs_vdev); /* Don't send BIO_FLUSH. */ static int vdev_geom_bio_flush_disable; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); /* Don't send BIO_DELETE. */ static int vdev_geom_bio_delete_disable; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); /* END CSTYLED */ /* Declare local functions */ static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); /* * Thread local storage used to indicate when a thread is probing geoms * for their guids. If NULL, this thread is not tasting geoms. If non-NULL, * it is looking for a replacement for the vdev_t* that is its value. */ uint_t zfs_geom_probe_vdev_key; static void vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, boolean_t do_null_update) { boolean_t needs_update = B_FALSE; char *physpath; int error, physpath_len; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); if (error == 0) { char *old_physpath; /* g_topology lock ensures that vdev has not been closed */ g_topology_assert(); old_physpath = vd->vdev_physpath; vd->vdev_physpath = spa_strdup(physpath); if (old_physpath != NULL) { needs_update = (strcmp(old_physpath, vd->vdev_physpath) != 0); spa_strfree(old_physpath); } else needs_update = do_null_update; } g_free(physpath); /* * If the physical path changed, update the config. * Only request an update for previously unset physpaths if * requested by the caller.
*/ if (needs_update) spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); } static void vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) { struct consumer_priv_t *priv; struct consumer_vdev_elem *elem; priv = (struct consumer_priv_t *)&cp->private; if (SLIST_EMPTY(priv)) return; SLIST_FOREACH(elem, priv, elems) { vdev_t *vd = elem->vd; if (strcmp(attr, "GEOM::physpath") == 0) { vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE); return; } } } static void vdev_geom_resize(struct g_consumer *cp) { struct consumer_priv_t *priv; struct consumer_vdev_elem *elem; spa_t *spa; vdev_t *vd; priv = (struct consumer_priv_t *)&cp->private; if (SLIST_EMPTY(priv)) return; SLIST_FOREACH(elem, priv, elems) { vd = elem->vd; if (vd->vdev_state != VDEV_STATE_HEALTHY) continue; spa = vd->vdev_spa; if (!spa->spa_autoexpand) continue; vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); } } static void vdev_geom_orphan(struct g_consumer *cp) { struct consumer_priv_t *priv; // cppcheck-suppress uninitvar struct consumer_vdev_elem *elem; g_topology_assert(); priv = (struct consumer_priv_t *)&cp->private; if (SLIST_EMPTY(priv)) /* Vdev close in progress. Ignore the event. */ return; /* * Orphan callbacks occur from the GEOM event thread. * Concurrent with this call, new I/O requests may be * working their way through GEOM about to find out * (only once executed by the g_down thread) that we've * been orphaned from our disk provider. These I/Os * must be retired before we can detach our consumer. * This is most easily achieved by acquiring the * SPA ZIO configuration lock as a writer, but doing * so with the GEOM topology lock held would cause * a lock order reversal. Instead, rely on the SPA's * async removal support to invoke a close on this * vdev once it is safe to do so. */ // cppcheck-suppress All SLIST_FOREACH(elem, priv, elems) { // cppcheck-suppress uninitvar vdev_t *vd = elem->vd; vd->vdev_remove_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); } } static struct g_consumer * vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) { struct g_geom *gp; struct g_consumer *cp; int error; g_topology_assert(); ZFS_LOG(1, "Attaching to %s.", pp->name); if (sanity) { if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { ZFS_LOG(1, "Failing attach of %s. " "Incompatible sectorsize %d\n", pp->name, pp->sectorsize); return (NULL); } else if (pp->mediasize < SPA_MINDEVSIZE) { ZFS_LOG(1, "Failing attach of %s. " "Incompatible mediasize %ju\n", pp->name, pp->mediasize); return (NULL); } } /* Do we have geom already? No? Create one. */ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; if (strcmp(gp->name, "zfs::vdev") != 0) continue; break; } if (gp == NULL) { gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); gp->orphan = vdev_geom_orphan; gp->attrchanged = vdev_geom_attrchanged; gp->resize = vdev_geom_resize; cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); } else { /* Check if we are already connected to this provider. 
*/ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) { ZFS_LOG(1, "Found consumer for %s.", pp->name); break; } } if (cp == NULL) { cp = g_new_consumer(gp); error = g_attach(cp, pp); if (error != 0) { ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); vdev_geom_detach(cp, B_FALSE); return (NULL); } ZFS_LOG(1, "Created consumer for %s.", pp->name); } else { error = g_access(cp, 1, 0, 1); if (error != 0) { ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, __LINE__, error); return (NULL); } ZFS_LOG(1, "Used existing consumer for %s.", pp->name); } } if (vd != NULL) vd->vdev_tsd = cp; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; return (cp); } static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) { struct g_geom *gp; g_topology_assert(); ZFS_LOG(1, "Detaching from %s.", cp->provider && cp->provider->name ? cp->provider->name : "NULL"); gp = cp->geom; if (open_for_read) g_access(cp, -1, 0, -1); /* Destroy consumer on last close. */ if (cp->acr == 0 && cp->ace == 0) { if (cp->acw > 0) g_access(cp, 0, -cp->acw, 0); if (cp->provider != NULL) { ZFS_LOG(1, "Destroying consumer for %s.", cp->provider->name ? cp->provider->name : "NULL"); g_detach(cp); } g_destroy_consumer(cp); } /* Destroy geom if there are no consumers left. */ if (LIST_EMPTY(&gp->consumer)) { ZFS_LOG(1, "Destroyed geom %s.", gp->name); g_wither_geom(gp, ENXIO); } } static void vdev_geom_close_locked(vdev_t *vd) { struct g_consumer *cp; struct consumer_priv_t *priv; struct consumer_vdev_elem *elem, *elem_temp; g_topology_assert(); cp = vd->vdev_tsd; vd->vdev_delayed_close = B_FALSE; if (cp == NULL) return; ZFS_LOG(1, "Closing access to %s.", cp->provider->name); KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); priv = (struct consumer_priv_t *)&cp->private; vd->vdev_tsd = NULL; SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { if (elem->vd == vd) { SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); g_free(elem); } } vdev_geom_detach(cp, B_TRUE); } /* * Issue one or more bios to the vdev in parallel * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO * operation is described by parallel entries from each array. There may be * more bios actually issued than entries in the array */ static void vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, off_t *sizes, int *errors, int ncmds) { struct bio **bios; uint8_t *p; off_t off, maxio, s, end; int i, n_bios, j; size_t bios_size; maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); n_bios = 0; /* How many bios are required for all commands ? 
*/ for (i = 0; i < ncmds; i++) n_bios += (sizes[i] + maxio - 1) / maxio; /* Allocate memory for the bios */ bios_size = n_bios * sizeof (struct bio *); bios = kmem_zalloc(bios_size, KM_SLEEP); /* Prepare and issue all of the bios */ for (i = j = 0; i < ncmds; i++) { off = offsets[i]; p = datas[i]; s = sizes[i]; end = off + s; ASSERT((off % cp->provider->sectorsize) == 0); ASSERT((s % cp->provider->sectorsize) == 0); for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { bios[j] = g_alloc_bio(); bios[j]->bio_cmd = cmds[i]; bios[j]->bio_done = NULL; bios[j]->bio_offset = off; bios[j]->bio_length = MIN(s, maxio); bios[j]->bio_data = (caddr_t)p; g_io_request(bios[j], cp); } } ASSERT(j == n_bios); /* Wait for all of the bios to complete, and clean them up */ for (i = j = 0; i < ncmds; i++) { off = offsets[i]; s = sizes[i]; end = off + s; for (; off < end; off += maxio, s -= maxio, j++) { errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i]; g_destroy_bio(bios[j]); } } kmem_free(bios, bios_size); } /* * Read the vdev config from a device. Return the number of valid labels that * were found. The vdev config will be returned in config if and only if at * least one valid label was found. */ static int vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) { struct g_provider *pp; nvlist_t *config; vdev_phys_t *vdev_lists[VDEV_LABELS]; char *buf; size_t buflen; uint64_t psize, state, txg; off_t offsets[VDEV_LABELS]; off_t size; off_t sizes[VDEV_LABELS]; int cmds[VDEV_LABELS]; int errors[VDEV_LABELS]; int l, nlabels; g_topology_assert_not(); pp = cp->provider; ZFS_LOG(1, "Reading config from %s...", pp->name); psize = pp->mediasize; psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); size = sizeof (*vdev_lists[0]) + pp->sectorsize - ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; buflen = sizeof (vdev_lists[0]->vp_nvlist); /* Create all of the IO requests */ for (l = 0; l < VDEV_LABELS; l++) { cmds[l] = BIO_READ; vdev_lists[l] = kmem_alloc(size, KM_SLEEP); offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; sizes[l] = size; errors[l] = 0; ASSERT(offsets[l] % pp->sectorsize == 0); } /* Issue the IO requests */ vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, VDEV_LABELS); /* Parse the labels */ config = *configp = NULL; nlabels = 0; for (l = 0; l < VDEV_LABELS; l++) { if (errors[l] != 0) continue; buf = vdev_lists[l]->vp_nvlist; if (nvlist_unpack(buf, buflen, &config, 0) != 0) continue; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { nvlist_free(config); continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { nvlist_free(config); continue; } if (*configp != NULL) nvlist_free(*configp); *configp = config; nlabels++; } /* Free the label storage */ for (l = 0; l < VDEV_LABELS; l++) kmem_free(vdev_lists[l], size); return (nlabels); } static void resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) { nvlist_t **new_configs; uint64_t i; if (id < *count) return; new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *), KM_SLEEP); for (i = 0; i < *count; i++) new_configs[i] = (*configs)[i]; if (*configs != NULL) kmem_free(*configs, *count * sizeof (void *)); *configs = new_configs; *count = id + 1; } static void process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, const char *name, uint64_t *known_pool_guid) { nvlist_t *vdev_tree; uint64_t pool_guid; uint64_t 
vdev_guid; uint64_t id, txg, known_txg; char *pname; if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || strcmp(pname, name) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) goto ignore; if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) goto ignore; if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) goto ignore; VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if (*known_pool_guid != 0) { if (pool_guid != *known_pool_guid) goto ignore; } else *known_pool_guid = pool_guid; resize_configs(configs, count, id); if ((*configs)[id] != NULL) { VERIFY(nvlist_lookup_uint64((*configs)[id], ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); if (txg <= known_txg) goto ignore; nvlist_free((*configs)[id]); } (*configs)[id] = cfg; return; ignore: nvlist_free(cfg); } int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, uint64_t *count) { struct g_class *mp; struct g_geom *gp; struct g_provider *pp; struct g_consumer *zcp; nvlist_t *vdev_cfg; uint64_t pool_guid; int nlabels; DROP_GIANT(); g_topology_lock(); *configs = NULL; *count = 0; pool_guid = 0; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->flags & G_PF_WITHER) continue; zcp = vdev_geom_attach(pp, NULL, B_TRUE); if (zcp == NULL) continue; g_topology_unlock(); nlabels = vdev_geom_read_config(zcp, &vdev_cfg); g_topology_lock(); vdev_geom_detach(zcp, B_TRUE); if (nlabels == 0) continue; ZFS_LOG(1, "successfully read vdev config"); process_vdev_config(configs, count, vdev_cfg, name, &pool_guid); } } } g_topology_unlock(); PICKUP_GIANT(); return (*count > 0 ? 0 : ENOENT); } enum match { NO_MATCH = 0, /* No matching labels found */ TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */ ZERO_MATCH = 1, /* Should never be returned */ ONE_MATCH = 2, /* 1 label matching the vdev_guid */ TWO_MATCH = 3, /* 2 labels matching the vdev_guid */ THREE_MATCH = 4, /* 3 labels matching the vdev_guid */ FULL_MATCH = 5 /* all labels match the vdev_guid */ }; static enum match vdev_attach_ok(vdev_t *vd, struct g_provider *pp) { nvlist_t *config; uint64_t pool_guid, top_guid, vdev_guid; struct g_consumer *cp; int nlabels; cp = vdev_geom_attach(pp, NULL, B_TRUE); if (cp == NULL) { ZFS_LOG(1, "Unable to attach tasting instance to %s.", pp->name); return (NO_MATCH); } g_topology_unlock(); nlabels = vdev_geom_read_config(cp, &config); g_topology_lock(); vdev_geom_detach(cp, B_TRUE); if (nlabels == 0) { ZFS_LOG(1, "Unable to read config from %s.", pp->name); return (NO_MATCH); } pool_guid = 0; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); top_guid = 0; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); vdev_guid = 0; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); nvlist_free(config); /* * Check that the label's pool guid matches the desired guid. * Inactive spares and L2ARCs do not have any pool guid in the label. */ if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", pp->name, (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); return (NO_MATCH); } /* * Check that the label's vdev guid matches the desired guid.
* The second condition handles possible race on vdev detach, when * remaining vdev receives GUID of destroyed top level mirror vdev. */ if (vdev_guid == vd->vdev_guid) { ZFS_LOG(1, "guids match for provider %s.", pp->name); return (ZERO_MATCH + nlabels); } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); return (TOPGUID_MATCH); } ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); return (NO_MATCH); } static struct g_consumer * vdev_geom_attach_by_guids(vdev_t *vd) { struct g_class *mp; struct g_geom *gp; struct g_provider *pp, *best_pp; struct g_consumer *cp; const char *vdpath; enum match match, best_match; g_topology_assert(); vdpath = vd->vdev_path + sizeof ("/dev/") - 1; cp = NULL; best_pp = NULL; best_match = NO_MATCH; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { match = vdev_attach_ok(vd, pp); if (match > best_match) { best_match = match; best_pp = pp; } else if (match == best_match) { if (strcmp(pp->name, vdpath) == 0) { best_pp = pp; } } if (match == FULL_MATCH) goto out; } } } out: if (best_pp) { cp = vdev_geom_attach(best_pp, vd, B_TRUE); if (cp == NULL) { printf("ZFS WARNING: Unable to attach to %s.\n", best_pp->name); } } return (cp); } static struct g_consumer * vdev_geom_open_by_guids(vdev_t *vd) { struct g_consumer *cp; char *buf; size_t len; g_topology_assert(); ZFS_LOG(1, "Searching by guids [%ju:%ju].", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); cp = vdev_geom_attach_by_guids(vd); if (cp != NULL) { len = strlen(cp->provider->name) + strlen("/dev/") + 1; buf = kmem_alloc(len, KM_SLEEP); snprintf(buf, len, "/dev/%s", cp->provider->name); spa_strfree(vd->vdev_path); vd->vdev_path = buf; ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid, cp->provider->name); } else { ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); } return (cp); } static struct g_consumer * vdev_geom_open_by_path(vdev_t *vd, int check_guid) { struct g_provider *pp; struct g_consumer *cp; g_topology_assert(); cp = NULL; pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1); if (pp != NULL) { ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) cp = vdev_geom_attach(pp, vd, B_FALSE); } return (cp); } static int vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift) { struct g_provider *pp; struct g_consumer *cp; int error, has_trim; uint16_t rate; /* * Set the TLS to indicate downstack that we * should not access zvols */ VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. 
*/ if ((cp = vd->vdev_tsd) != NULL) { ASSERT(vd->vdev_reopening); goto skip_open; } DROP_GIANT(); g_topology_lock(); error = 0; if (vd->vdev_spa->spa_is_splitting || ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN && (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) { /* * We are dealing with a vdev that hasn't been previously * opened (since boot), and we are not loading an * existing pool configuration. This looks like a * vdev add operation to a new or existing pool. * Assume the user really wants to do this, and find * GEOM provider by its name, ignoring GUID mismatches. * * XXPOLICY: It would be safer to only allow a device * that is unlabeled or labeled but missing * GUID information to be opened in this fashion, * unless we are doing a split, in which case we * should allow any guid. */ cp = vdev_geom_open_by_path(vd, 0); } else { /* * Try using the recorded path for this device, but only * accept it if its label data contains the expected GUIDs. */ cp = vdev_geom_open_by_path(vd, 1); if (cp == NULL) { /* * The device at vd->vdev_path doesn't have the * expected GUIDs. The disks might have merely * moved around so try all other GEOM providers * to find one with the right GUIDs. */ cp = vdev_geom_open_by_guids(vd); } } /* Clear the TLS now that tasting is done */ VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); if (cp == NULL) { ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); error = ENOENT; } else { struct consumer_priv_t *priv; struct consumer_vdev_elem *elem; int spamode; priv = (struct consumer_priv_t *)&cp->private; if (cp->private == NULL) SLIST_INIT(priv); elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO); elem->vd = vd; SLIST_INSERT_HEAD(priv, elem, elems); spamode = spa_mode(vd->vdev_spa); if (cp->provider->sectorsize > VDEV_PAD_SIZE || !ISP2(cp->provider->sectorsize)) { ZFS_LOG(1, "Provider %s has unsupported sectorsize.", cp->provider->name); vdev_geom_close_locked(vd); error = EINVAL; cp = NULL; } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { int i; for (i = 0; i < 5; i++) { error = g_access(cp, 0, 1, 0); if (error == 0) break; g_topology_unlock(); tsleep(vd, 0, "vdev", hz / 2); g_topology_lock(); } if (error != 0) { printf("ZFS WARNING: Unable to open %s for " "writing (error=%d).\n", cp->provider->name, error); vdev_geom_close_locked(vd); cp = NULL; } } } /* Fetch initial physical path information for this device. */ if (cp != NULL) { vdev_geom_attrchanged(cp, "GEOM::physpath"); /* Set other GEOM characteristics */ vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE); } g_topology_unlock(); PICKUP_GIANT(); if (cp == NULL) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", error); return (error); } skip_open: pp = cp->provider; /* * Determine the actual size of the device. */ *max_psize = *psize = pp->mediasize; /* * Determine the device's minimum transfer size and preferred * transfer size. */ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; #ifdef notyet if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && pp->stripeoffset == 0) *physical_ashift = highbit(pp->stripesize) - 1; #endif /* * Clear the nowritecache settings, so that on a vdev_reopen() * we will try again. */ vd->vdev_nowritecache = B_FALSE; /* Inform the ZIO pipeline that we are non-rotational. 
*/ error = g_getattr("GEOM::rotation_rate", cp, &rate); if (error == 0 && rate == DISK_RR_NON_ROTATING) vd->vdev_nonrot = B_TRUE; else vd->vdev_nonrot = B_FALSE; /* Set when device reports it supports TRIM. */ error = g_getattr("GEOM::candelete", cp, &has_trim); vd->vdev_has_trim = (error == 0 && has_trim); /* Set when device reports it supports secure TRIM. */ /* unavailable on FreeBSD */ vd->vdev_has_securetrim = B_FALSE; return (0); } static void vdev_geom_close(vdev_t *vd) { struct g_consumer *cp; boolean_t locked; cp = vd->vdev_tsd; DROP_GIANT(); locked = g_topology_locked(); if (!locked) g_topology_lock(); if (!vd->vdev_reopening || (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || (cp->provider != NULL && cp->provider->error != 0)))) vdev_geom_close_locked(vd); if (!locked) g_topology_unlock(); PICKUP_GIANT(); } static void vdev_geom_io_intr(struct bio *bp) { vdev_t *vd; zio_t *zio; zio = bp->bio_caller1; vd = zio->io_vd; zio->io_error = bp->bio_error; if (zio->io_error == 0 && bp->bio_resid != 0) zio->io_error = SET_ERROR(EIO); switch (zio->io_error) { case ENOTSUP: /* * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know * that future attempts will never succeed. In this case * we set a persistent flag so that we don't bother with * requests in the future. */ switch (bp->bio_cmd) { case BIO_FLUSH: vd->vdev_nowritecache = B_TRUE; break; case BIO_DELETE: break; } break; case ENXIO: if (!vd->vdev_remove_wanted) { /* * If provider's error is set we assume it is being * removed. */ if (bp->bio_to->error != 0) { vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } else if (!vd->vdev_delayed_close) { vd->vdev_delayed_close = B_TRUE; } } break; } /* * We have to split bio freeing into two parts, because the ABD code * cannot be called in this context and vdev_op_io_done is not called * for ZIO_TYPE_IOCTL zio-s. 
*/ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { g_destroy_bio(bp); zio->io_bio = NULL; } zio_delay_interrupt(zio); } static void vdev_geom_io_start(zio_t *zio) { vdev_t *vd; struct g_consumer *cp; struct bio *bp; vd = zio->io_vd; switch (zio->io_type) { case ZIO_TYPE_IOCTL: /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } else { switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush || vdev_geom_bio_flush_disable) break; if (vd->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } goto sendreq; default: zio->io_error = SET_ERROR(ENOTSUP); } } zio_execute(zio); return; case ZIO_TYPE_TRIM: if (!vdev_geom_bio_delete_disable) { goto sendreq; } zio_execute(zio); return; default: ; /* PASSTHROUGH --- placate compiler */ } sendreq: ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM || zio->io_type == ZIO_TYPE_IOCTL); cp = vd->vdev_tsd; if (cp == NULL) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } bp = g_alloc_bio(); bp->bio_caller1 = zio; switch (zio->io_type) { case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; if (zio->io_type == ZIO_TYPE_READ) { bp->bio_cmd = BIO_READ; bp->bio_data = abd_borrow_buf(zio->io_abd, zio->io_size); } else { bp->bio_cmd = BIO_WRITE; bp->bio_data = abd_borrow_buf_copy(zio->io_abd, zio->io_size); } break; case ZIO_TYPE_TRIM: bp->bio_cmd = BIO_DELETE; bp->bio_data = NULL; bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; case ZIO_TYPE_IOCTL: bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_data = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; break; default: panic("invalid zio->io_type: %d\n", zio->io_type); } bp->bio_done = vdev_geom_io_intr; zio->io_bio = bp; g_io_request(bp, cp); } static void vdev_geom_io_done(zio_t *zio) { struct bio *bp = zio->io_bio; if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { ASSERT(bp == NULL); return; } if (bp == NULL) { ASSERT3S(zio->io_error, ==, ENXIO); return; } if (zio->io_type == ZIO_TYPE_READ) abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size); else abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size); g_destroy_bio(bp); zio->io_bio = NULL; } static void vdev_geom_hold(vdev_t *vd) { } static void vdev_geom_rele(vdev_t *vd) { } vdev_ops_t vdev_disk_ops = { vdev_geom_open, vdev_geom_close, vdev_default_asize, vdev_geom_io_start, vdev_geom_io_done, NULL, NULL, vdev_geom_hold, vdev_geom_rele, NULL, vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 3a65b0bb4321..b30c65e71eb1 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -1,613 +1,614 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. */ #include +#include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * Pool configuration repository. * * Pool configuration is stored as a packed nvlist on the filesystem. By * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot * (when the ZFS module is loaded). Pools can also have the 'cachefile' * property set that allows them to be stored in an alternate location under * the control of external software. * * For each cache file, we have a single nvlist which holds all the * configuration information. When the module loads, we read this information * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is * maintained independently in spa.c. Whenever the namespace is modified, or * the configuration of a pool is changed, we call spa_write_cachefile(), which * walks through all the active pools and writes the configuration to disk. */ static uint64_t spa_config_generation = 1; /* * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ char *spa_config_path = ZPOOL_CACHE; int zfs_autoimport_disable = 1; /* * Called when the module is first loaded, this routine loads the configuration * file into the SPA namespace. It does not actually open or load the pools; it * only populates the namespace. */ void spa_config_load(void) { void *buf = NULL; nvlist_t *nvlist, *child; nvpair_t *nvpair; char *pathname; zfs_file_t *fp; zfs_file_attr_t zfa; uint64_t fsize; int err; #ifdef _KERNEL if (zfs_autoimport_disable) return; #endif /* * Open the configuration file. */ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); err = zfs_file_open(pathname, O_RDONLY, 0, &fp); kmem_free(pathname, MAXPATHLEN); if (err) return; if (zfs_file_getattr(fp, &zfa)) goto out; fsize = zfa.zfa_size; buf = kmem_alloc(fsize, KM_SLEEP); /* * Read the nvlist from the file. */ if (zfs_file_read(fp, buf, fsize, NULL) < 0) goto out; /* * Unpack the nvlist. */ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) goto out; /* * Iterate over all elements in the nvlist, creating a new spa_t for * each one with the specified configuration. */ mutex_enter(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) continue; child = fnvpair_value_nvlist(nvpair); if (spa_lookup(nvpair_name(nvpair)) != NULL) continue; (void) spa_add(nvpair_name(nvpair), child, NULL); } mutex_exit(&spa_namespace_lock); nvlist_free(nvlist); out: if (buf != NULL) kmem_free(buf, fsize); zfs_file_close(fp); }
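/*
 * The location read by spa_config_load() above can be redirected per pool
 * through the 'cachefile' pool property, e.g. (illustrative):
 *
 *   # zpool set cachefile=/boot/zfs/zpool.cache tank
 *
 * The special value "none" keeps a pool out of any cache file, while an
 * empty value selects the default /etc/zfs/zpool.cache.
 */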
static int spa_config_remove(spa_config_dirent_t *dp) { int error = 0; /* * Remove the cache file. If zfs_file_unlink() is not supported by the * platform, fall back to truncating the file, which is functionally * equivalent. */ error = zfs_file_unlink(dp->scd_path); if (error == EOPNOTSUPP) { int flags = O_RDWR | O_TRUNC; zfs_file_t *fp; error = zfs_file_open(dp->scd_path, flags, 0644, &fp); if (error == 0) { (void) zfs_file_fsync(fp, O_SYNC); (void) zfs_file_close(fp); } } return (error); } static int spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { size_t buflen; char *buf; int oflags = O_RDWR | O_TRUNC | O_CREAT | O_LARGEFILE; int err; zfs_file_t *fp; /* * If the nvlist is empty (NULL), then remove the old cachefile. */ if (nvl == NULL) { err = spa_config_remove(dp); if (err == ENOENT) err = 0; return (err); } /* * Pack the configuration into a buffer. */ buf = fnvlist_pack(nvl, &buflen); /* * Write the configuration to disk. Due to the complexity involved * in performing a rename and remove from within the kernel the file * is instead truncated and overwritten in place. This way we always * have a consistent view of the data or a zero length file. */ err = zfs_file_open(dp->scd_path, oflags, 0644, &fp); if (err == 0) { err = zfs_file_write(fp, buf, buflen, NULL); if (err == 0) err = zfs_file_fsync(fp, O_SYNC); zfs_file_close(fp); if (err) (void) spa_config_remove(dp); } fnvlist_pack_free(buf, buflen); return (err); } /* * Synchronize pool configuration to disk. This must be called with the * namespace lock held. Synchronizing the pool cache is typically done after * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not. If * the system were to crash at that instant then the cached config may not * contain the correct information to open the pool and an explicit import * would be required. */ void spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; char *pool_name; boolean_t ccw_failure; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (!(spa_mode_global & SPA_MODE_WRITE)) return; /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. */ ccw_failure = B_FALSE; for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; if (dp->scd_path == NULL) continue; /* * Iterate over all pools, adding any matching pools to 'nvl'. */ nvl = NULL; while ((spa = spa_next(spa)) != NULL) { /* * Skip over our own pool if we're about to remove * ourselves from the spa namespace or any pool that * is readonly. Since we cannot guarantee that a * readonly pool would successfully import upon reboot, * we don't allow them to be written to the cache file.
*/ if ((spa == target && removing) || !spa_writeable(spa)) continue; mutex_enter(&spa->spa_props_lock); tdp = list_head(&spa->spa_config_list); if (spa->spa_config == NULL || tdp == NULL || tdp->scd_path == NULL || strcmp(tdp->scd_path, dp->scd_path) != 0) { mutex_exit(&spa->spa_props_lock); continue; } if (nvl == NULL) nvl = fnvlist_alloc(); if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) pool_name = fnvlist_lookup_string( spa->spa_config, ZPOOL_CONFIG_POOL_NAME); else pool_name = spa_name(spa); fnvlist_add_nvlist(nvl, pool_name, spa->spa_config); mutex_exit(&spa->spa_props_lock); } error = spa_config_write(dp, nvl); if (error != 0) ccw_failure = B_TRUE; nvlist_free(nvl); } if (ccw_failure) { /* * Keep trying so that configuration data is * written if/when any temporary filesystem * resource issues are resolved. */ if (target->spa_ccw_fail_time == 0) { zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, target, NULL, NULL, NULL, 0, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); } else { /* * Do not rate limit future attempts to update * the config cache. */ target->spa_ccw_fail_time = 0; } /* * Remove any config entries older than the current one. */ dp = list_head(&target->spa_config_list); while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { list_remove(&target->spa_config_list, tdp); if (tdp->scd_path != NULL) spa_strfree(tdp->scd_path); kmem_free(tdp, sizeof (spa_config_dirent_t)); } spa_config_generation++; if (postsysevent) spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); } /* * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, * and we don't want to allow the local zone to see all the pools anyway. * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. */ nvlist_t * spa_all_configs(uint64_t *generation) { nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) return (NULL); pools = fnvlist_alloc(); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); fnvlist_add_nvlist(pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } } *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); return (pools); } void spa_config_set(spa_t *spa, nvlist_t *config) { mutex_enter(&spa->spa_props_lock); if (spa->spa_config != NULL && spa->spa_config != config) nvlist_free(spa->spa_config); spa->spa_config = config; mutex_exit(&spa->spa_props_lock); } /* * Generate the pool's configuration based on the current in-core state. * * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. */ nvlist_t * spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) { nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; uint64_t split_guid; char *pool_name; if (vd == NULL) { vd = rvd; locked = B_TRUE; spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == (SCL_CONFIG | SCL_STATE)); /* * If txg is -1, report the current value of spa->spa_config_txg. 
*/ if (txg == -1ULL) txg = spa->spa_config_txg; /* * Originally, users had to handle spa namespace collisions by either * exporting the already imported pool or by specifying a new name for * the pool with a conflicting name. In the case of root pools from * virtual guests, neither approach to collision resolution is * reasonable. This is addressed by extending the new name syntax with * an option to specify that the new name is temporary. When specified, * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us * to use the previous name, which we do below. */ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { VERIFY0(nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &pool_name)); } else pool_name = spa_name(spa); config = fnvlist_alloc(); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); hostid = spa_get_hostid(spa); if (hostid != 0) fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename); int config_gen_flags = 0; if (vd != rvd) { fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_isspare) fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 1ULL); if (vd->vdev_islog) fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL); vd = vd->vdev_top; /* label contains top config */ } else { /* * Only add the (potentially large) split information * in the mos config, and not in the vdev labels */ if (spa->spa_config_splitting != NULL) fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, spa->spa_config_splitting); fnvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); config_gen_flags |= VDEV_CONFIG_MOS; } /* * Add the top-level config. We even add this on pools which * don't support holes in the namespace. */ vdev_top_config_generate(spa, config); /* * If we're splitting, record the original pool's guid. */ if (spa->spa_config_splitting != NULL && nvlist_lookup_uint64(spa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, split_guid); } nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); nvlist_free(nvroot); /* * Store what's necessary for reading the MOS in the label. 
*/ fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, spa->spa_label_features); if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); kmem_free(ddh, sizeof (ddt_histogram_t)); ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); ddt_get_dedup_object_stats(spa, ddo); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); kmem_free(ddo, sizeof (ddt_object_t)); dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); ddt_get_dedup_stats(spa, dds); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); kmem_free(dds, sizeof (ddt_stat_t)); } if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; int c; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); txg = spa_last_synced_txg(spa) + 1; if (what == SPA_CONFIG_UPDATE_POOL) { vdev_config_dirty(rvd); } else { /* * If we have top-level vdevs that were added but have * not yet been prepared for allocation, do that now. * (It's safe now because the config cache is up to date, * so it will be able to translate the new DVAs.) * See comments in spa_vdev_add() for full details. */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * Explicitly skip vdevs that are indirect or * log vdevs that are being removed. The reason * is that both of those can have vdev_ms_array * set to 0 and we wouldn't want to change their * metaslab size nor call vdev_expand() on them. */ if (!vdev_is_concrete(tvd) || (tvd->vdev_islog && tvd->vdev_removing)) continue; if (tvd->vdev_ms_array == 0) vdev_metaslab_set_size(tvd); vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); /* * Wait for the mosconfig to be regenerated and synced. */ txg_wait_synced(spa->spa_dsl_pool, txg); /* * Update the global config cache to reflect the new mosconfig. */ if (!spa->spa_is_root) { spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); } if (what == SPA_CONFIG_UPDATE_POOL) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } EXPORT_SYMBOL(spa_config_load); EXPORT_SYMBOL(spa_all_configs); EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); EXPORT_SYMBOL(spa_config_update); /* BEGIN CSTYLED */ #ifdef __linux__ /* string sysctls require a char array on FreeBSD */ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, "SPA config file (/etc/zfs/zpool.cache)"); #endif ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW, "Disable pool import at module load"); /* END CSTYLED */