Index: projects/clang900-import/Makefile.inc1 =================================================================== --- projects/clang900-import/Makefile.inc1 (revision 352536) +++ projects/clang900-import/Makefile.inc1 (revision 352537) @@ -1,3399 +1,3400 @@ # # $FreeBSD$ # # Make command line options: # -DNO_CLEANDIR run ${MAKE} clean, instead of ${MAKE} cleandir # -DNO_CLEAN do not clean at all # -DDB_FROM_SRC use the user/group databases in src/etc instead of # the system database when installing. # -DNO_SHARE do not go into share subdir # -DKERNFAST define NO_KERNEL{CONFIG,CLEAN,OBJ} # -DNO_KERNELCONFIG do not run config in ${MAKE} buildkernel # -DNO_KERNELCLEAN do not run ${MAKE} clean in ${MAKE} buildkernel # -DNO_KERNELOBJ do not run ${MAKE} obj in ${MAKE} buildkernel # -DNO_PORTSUPDATE do not update ports in ${MAKE} update # -DNO_ROOT install without using root privilege # -DNO_DOCUPDATE do not update doc in ${MAKE} update # -DWITHOUT_CTF do not run the DTrace CTF conversion tools on built objects # LOCAL_DIRS="list of dirs" to add additional dirs to the SUBDIR list # LOCAL_ITOOLS="list of tools" to add additional tools to the ITOOLS list # LOCAL_LIB_DIRS="list of dirs" to add additional dirs to libraries target # LOCAL_MTREE="list of mtree files" to process to allow local directories # to be created before files are installed # LOCAL_TOOL_DIRS="list of dirs" to add additional dirs to the build-tools # list # LOCAL_XTOOL_DIRS="list of dirs" to add additional dirs to the # cross-tools target # METALOG="path to metadata log" to write permission and ownership # when NO_ROOT is set. (default: ${DESTDIR}/METALOG) # TARGET="machine" to crossbuild world for a different machine type # TARGET_ARCH= may be required when a TARGET supports multiple endians # BUILDENV_SHELL= shell to launch for the buildenv target (def:${SHELL}) # WORLD_FLAGS= additional flags to pass to make(1) during buildworld # KERNEL_FLAGS= additional flags to pass to make(1) during buildkernel # SUBDIR_OVERRIDE="list of dirs" to build rather than everything. # All libraries and includes, and some build tools will still build. # # The intended user-driven targets are: # buildworld - rebuild *everything*, including glue to help do upgrades # installworld- install everything built by "buildworld" # checkworld - run test suite on installed world # doxygen - build API documentation of the kernel # update - convenient way to update your source tree (eg: svn/svnup) # # Standard targets (not defined here) are documented in the makefiles in # /usr/share/mk. These include: # obj depend all install clean cleandepend cleanobj .if !defined(TARGET) || !defined(TARGET_ARCH) .error "Both TARGET and TARGET_ARCH must be defined." .endif .if make(showconfig) || make(test-system-*) _MKSHOWCONFIG= t .endif SRCDIR?= ${.CURDIR} LOCALBASE?= /usr/local # Cross toolchain changes must be in effect before bsd.compiler.mk # so that gets the right CC, and pass CROSS_TOOLCHAIN to submakes. 
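# An illustrative use of the CROSS_TOOLCHAIN knob handled below (the
# toolchain name is an example; any file installed under
# ${LOCALBASE}/share/toolchains/ by an external-toolchain port works the
# same way):
#   make CROSS_TOOLCHAIN=llvm90 TARGET=arm64 TARGET_ARCH=aarch64 buildworld
# The named .mk file is expected to set XCC/XCXX/XCPP (and usually the
# binutils variables) for the external compiler.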
.if defined(CROSS_TOOLCHAIN) .if exists(${LOCALBASE}/share/toolchains/${CROSS_TOOLCHAIN}.mk) .include "${LOCALBASE}/share/toolchains/${CROSS_TOOLCHAIN}.mk" .elif exists(${CROSS_TOOLCHAIN}) .include "${CROSS_TOOLCHAIN}" .else .error CROSS_TOOLCHAIN ${CROSS_TOOLCHAIN} not found .endif CROSSENV+=CROSS_TOOLCHAIN="${CROSS_TOOLCHAIN}" .endif .if defined(CROSS_TOOLCHAIN_PREFIX) CROSS_COMPILER_PREFIX?=${CROSS_TOOLCHAIN_PREFIX} .endif XCOMPILERS= CC CXX CPP .for COMPILER in ${XCOMPILERS} .if defined(CROSS_COMPILER_PREFIX) X${COMPILER}?= ${CROSS_COMPILER_PREFIX}${${COMPILER}} .else X${COMPILER}?= ${${COMPILER}} .endif .endfor # If a full path to an external cross compiler is given, don't build # a cross compiler. .if ${XCC:N${CCACHE_BIN}:M/*} MK_CLANG_BOOTSTRAP= no MK_GCC_BOOTSTRAP= no .endif # Pull in compiler metadata from buildworld/toolchain if possible to avoid # running CC from bsd.compiler.mk. .if make(installworld) || make(install) || make(distributeworld) || \ make(stageworld) .-include "${OBJTOP}/toolchain-metadata.mk" .if !defined(_LOADED_TOOLCHAIN_METADATA) .error A build is required first. You may have the wrong MAKEOBJDIRPREFIX set. .endif .endif # Pull in COMPILER_TYPE and COMPILER_FREEBSD_VERSION early. Pull it from the # tree to be friendlier to foreign OS builds. It's safe to do so unconditionally # here since we will always have the right make, unlike in src/Makefile # Don't include bsd.linker.mk yet until XBINUTILS is handled (after src.opts.mk) _NO_INCLUDE_LINKERMK= t # We also want the X_COMPILER* variables if we are using an external toolchain. _WANT_TOOLCHAIN_CROSS_VARS= t .include "share/mk/bsd.compiler.mk" .undef _NO_INCLUDE_LINKERMK .undef _WANT_TOOLCHAIN_CROSS_VARS # src.opts.mk depends on COMPILER_FEATURES .include "share/mk/src.opts.mk" .if ${TARGET} == ${MACHINE} TARGET_CPUTYPE?=${CPUTYPE} .else TARGET_CPUTYPE?= .endif .if !empty(TARGET_CPUTYPE) _TARGET_CPUTYPE=${TARGET_CPUTYPE} .else _TARGET_CPUTYPE=dummy .endif .if ${TARGET} == "arm" .if ${TARGET_ARCH:Marmv[67]*} != "" && ${TARGET_CPUTYPE:M*soft*} == "" TARGET_ABI= gnueabihf .else TARGET_ABI= gnueabi .endif .endif MACHINE_ABI?= unknown MACHINE_TRIPLE?=${MACHINE_ARCH:S/amd64/x86_64/:C/hf$//:S/mipsn32/mips64/}-${MACHINE_ABI}-freebsd13.0 TARGET_ABI?= unknown TARGET_TRIPLE?= ${TARGET_ARCH:S/amd64/x86_64/:C/hf$//:S/mipsn32/mips64/}-${TARGET_ABI}-freebsd13.0 KNOWN_ARCHES?= aarch64/arm64 \ amd64 \ arm \ armv6/arm \ armv7/arm \ i386 \ mips \ mipsel/mips \ mips64el/mips \ mipsn32el/mips \ mips64/mips \ mipsn32/mips \ mipshf/mips \ mipselhf/mips \ mips64elhf/mips \ mips64hf/mips \ powerpc \ powerpc64/powerpc \ powerpcspe/powerpc \ riscv64/riscv \ riscv64sf/riscv \ sparc64 .if ${TARGET} == ${TARGET_ARCH} _t= ${TARGET} .else _t= ${TARGET_ARCH}/${TARGET} .endif .for _t in ${_t} .if empty(KNOWN_ARCHES:M${_t}) .error Unknown target ${TARGET_ARCH}:${TARGET}. .endif .endfor # If all targets are disabled for system llvm then don't expect it to work # for cross-builds. .if !defined(TOOLS_PREFIX) && ${MK_LLVM_TARGET_ALL} == "no" && \ ${MACHINE} != ${TARGET} && ${MACHINE_ARCH} != ${TARGET_ARCH} && \ !make(showconfig) MK_SYSTEM_COMPILER= no MK_SYSTEM_LINKER= no .endif # Handle external binutils. .if defined(CROSS_TOOLCHAIN_PREFIX) CROSS_BINUTILS_PREFIX?=${CROSS_TOOLCHAIN_PREFIX} .endif # If we do not have a bootstrap binutils (because the in-tree one does not # support the target architecture), provide a default cross-binutils prefix. # This allows riscv64 builds, for example, to automatically use the # riscv64-binutils port or package. 
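# As a sketch (paths are illustrative and follow the TARGET_TRIPLE logic
# above), a riscv64 build with neither in-tree binutils nor an lld bootstrap
# ends up with roughly:
#   CROSS_BINUTILS_PREFIX=/usr/local/riscv64-unknown-freebsd13.0/bin/
#   XLD=/usr/local/riscv64-unknown-freebsd13.0/bin/ld
# Individual X* tool variables (XAS, XAR, XLD, ...) can still be overridden
# on the command line or in src.conf.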
.if !make(showconfig) && !defined(_NO_INCLUDE_COMPILERMK) .if !empty(BROKEN_OPTIONS:MBINUTILS_BOOTSTRAP) && \ ${MK_LLD_BOOTSTRAP} == "no" && \ !defined(CROSS_BINUTILS_PREFIX) CROSS_BINUTILS_PREFIX=/usr/local/${TARGET_TRIPLE}/bin/ .if !exists(${CROSS_BINUTILS_PREFIX}) .error In-tree binutils does not support the ${TARGET_ARCH} architecture. Install the ${TARGET_ARCH}-binutils port or package or set CROSS_BINUTILS_PREFIX. .endif .endif .endif XBINUTILS= AS AR LD NM OBJCOPY RANLIB SIZE STRINGS .for BINUTIL in ${XBINUTILS} .if defined(CROSS_BINUTILS_PREFIX) && \ exists(${CROSS_BINUTILS_PREFIX}/${${BINUTIL}}) X${BINUTIL}?= ${CROSS_BINUTILS_PREFIX:C,/*$,,}/${${BINUTIL}} .else X${BINUTIL}?= ${${BINUTIL}} .endif .endfor # If a full path to an external linker is given, don't build lld. .if ${XLD:M/*} MK_LLD_BOOTSTRAP= no .endif # We also want the X_LINKER* variables if we are using an external toolchain. _WANT_TOOLCHAIN_CROSS_VARS= t .include "share/mk/bsd.linker.mk" .undef _WANT_TOOLCHAIN_CROSS_VARS # Begin WITH_SYSTEM_COMPILER / WITH_SYSTEM_LD # WITH_SYSTEM_COMPILER - Pull in needed values and make a decision. # Check if there is a local compiler that can satisfy as an external compiler. # Which compiler is expected to be used? .if ${MK_CLANG_BOOTSTRAP} == "yes" WANT_COMPILER_TYPE= clang .elif ${MK_GCC_BOOTSTRAP} == "yes" WANT_COMPILER_TYPE= gcc .else WANT_COMPILER_TYPE= .endif .if !defined(WANT_COMPILER_FREEBSD_VERSION) && !make(showconfig) && \ !make(test-system-linker) .if ${WANT_COMPILER_TYPE} == "clang" WANT_COMPILER_FREEBSD_VERSION_FILE= lib/clang/freebsd_cc_version.h WANT_COMPILER_FREEBSD_VERSION!= \ awk '$$2 == "FREEBSD_CC_VERSION" {printf("%d\n", $$3)}' \ ${SRCDIR}/${WANT_COMPILER_FREEBSD_VERSION_FILE} || echo unknown WANT_COMPILER_VERSION_FILE= lib/clang/include/clang/Basic/Version.inc WANT_COMPILER_VERSION!= \ awk '$$2 == "CLANG_VERSION" {split($$3, a, "."); print a[1] * 10000 + a[2] * 100 + a[3]}' \ ${SRCDIR}/${WANT_COMPILER_VERSION_FILE} || echo unknown .elif ${WANT_COMPILER_TYPE} == "gcc" WANT_COMPILER_FREEBSD_VERSION_FILE= gnu/usr.bin/cc/cc_tools/freebsd-native.h WANT_COMPILER_FREEBSD_VERSION!= \ awk '$$2 == "FBSD_CC_VER" {printf("%d\n", $$3)}' \ ${SRCDIR}/${WANT_COMPILER_FREEBSD_VERSION_FILE} || echo unknown WANT_COMPILER_VERSION_FILE= contrib/gcc/BASE-VER WANT_COMPILER_VERSION!= \ awk -F. '{print $$1 * 10000 + $$2 * 100 + $$3}' \ ${SRCDIR}/${WANT_COMPILER_VERSION_FILE} || echo unknown .endif .export WANT_COMPILER_FREEBSD_VERSION WANT_COMPILER_VERSION .endif # !defined(WANT_COMPILER_FREEBSD_VERSION) # It needs to be the same revision as we would build for the bootstrap. # If the expected vs CC is different then we can't skip. # GCC cannot be used for cross-arch yet. For clang we pass -target later if # TARGET_ARCH!=MACHINE_ARCH. .if ${MK_SYSTEM_COMPILER} == "yes" && \ defined(WANT_COMPILER_FREEBSD_VERSION) && \ (${MK_CLANG_BOOTSTRAP} == "yes" || ${MK_GCC_BOOTSTRAP} == "yes") && \ !make(xdev*) && \ ${X_COMPILER_TYPE} == ${WANT_COMPILER_TYPE} && \ (${X_COMPILER_TYPE} == "clang" || ${TARGET_ARCH} == ${MACHINE_ARCH}) && \ ${X_COMPILER_VERSION} == ${WANT_COMPILER_VERSION} && \ ${X_COMPILER_FREEBSD_VERSION} == ${WANT_COMPILER_FREEBSD_VERSION} # Everything matches, disable the bootstrap compiler. MK_CLANG_BOOTSTRAP= no MK_GCC_BOOTSTRAP= no USING_SYSTEM_COMPILER= yes .endif # ${WANT_COMPILER_TYPE} == ${COMPILER_TYPE} # WITH_SYSTEM_LD - Pull in needed values and make a decision. # Check if there is a local linker that can satisfy as an external linker. 
# Which linker is expected to be used? .if ${MK_LLD_BOOTSTRAP} == "yes" WANT_LINKER_TYPE= lld .elif ${MK_BINUTILS_BOOTSTRAP} == "yes" # Note that there's no support for bfd in WITH_SYSTEM_LINKER. WANT_LINKER_TYPE= bfd .else WANT_LINKER_TYPE= .endif .if !defined(WANT_LINKER_FREEBSD_VERSION) && !make(showconfig) && \ !make(test-system-compiler) .if ${WANT_LINKER_TYPE} == "lld" WANT_LINKER_FREEBSD_VERSION_FILE= lib/clang/include/lld/Common/Version.inc WANT_LINKER_FREEBSD_VERSION!= \ awk '$$2 == "LLD_REVISION_STRING" {gsub(/"/, "", $$3); print $$3}' \ ${SRCDIR}/${WANT_LINKER_FREEBSD_VERSION_FILE} || echo unknown WANT_LINKER_VERSION_FILE= lib/clang/include/lld/Common/Version.inc WANT_LINKER_VERSION!= \ awk '$$2 == "LLD_VERSION" {split($$3, a, "."); print a[1] * 10000 + a[2] * 100 + a[3]}' \ ${SRCDIR}/${WANT_LINKER_VERSION_FILE} || echo unknown .else WANT_LINKER_FREEBSD_VERSION_FILE= WANT_LINKER_FREEBSD_VERSION= .endif .export WANT_LINKER_FREEBSD_VERSION WANT_LINKER_VERSION .endif # !defined(WANT_LINKER_FREEBSD_VERSION) .if ${MK_SYSTEM_LINKER} == "yes" && \ defined(WANT_LINKER_FREEBSD_VERSION) && \ (${MK_LLD_BOOTSTRAP} == "yes") && \ !make(xdev*) && \ ${X_LINKER_TYPE} == ${WANT_LINKER_TYPE} && \ ${X_LINKER_VERSION} == ${WANT_LINKER_VERSION} && \ ${X_LINKER_FREEBSD_VERSION} == ${WANT_LINKER_FREEBSD_VERSION} # Everything matches, disable the bootstrap linker. MK_LLD_BOOTSTRAP= no USING_SYSTEM_LINKER= yes .endif # ${WANT_LINKER_TYPE} == ${LINKER_TYPE} # WITH_SYSTEM_COMPILER / WITH_SYSTEM_LINKER - Handle defaults and debug. USING_SYSTEM_COMPILER?= no USING_SYSTEM_LINKER?= no TEST_SYSTEM_COMPILER_VARS= \ USING_SYSTEM_COMPILER MK_SYSTEM_COMPILER \ MK_CROSS_COMPILER MK_CLANG_BOOTSTRAP MK_GCC_BOOTSTRAP \ WANT_COMPILER_TYPE WANT_COMPILER_VERSION WANT_COMPILER_VERSION_FILE \ WANT_COMPILER_FREEBSD_VERSION WANT_COMPILER_FREEBSD_VERSION_FILE \ CC COMPILER_TYPE COMPILER_FEATURES COMPILER_VERSION \ COMPILER_FREEBSD_VERSION \ XCC X_COMPILER_TYPE X_COMPILER_FEATURES X_COMPILER_VERSION \ X_COMPILER_FREEBSD_VERSION TEST_SYSTEM_LINKER_VARS= \ USING_SYSTEM_LINKER MK_SYSTEM_LINKER \ MK_LLD_BOOTSTRAP MK_BINUTILS_BOOTSTRAP \ WANT_LINKER_TYPE WANT_LINKER_VERSION WANT_LINKER_VERSION_FILE \ WANT_LINKER_FREEBSD_VERSION WANT_LINKER_FREEBSD_VERSION_FILE \ LD LINKER_TYPE LINKER_FEATURES LINKER_VERSION \ LINKER_FREEBSD_VERSION \ XLD X_LINKER_TYPE X_LINKER_FEATURES X_LINKER_VERSION \ X_LINKER_FREEBSD_VERSION .for _t in compiler linker test-system-${_t}: .PHONY .for v in ${TEST_SYSTEM_${_t:tu}_VARS} ${_+_}@printf "%-35s= %s\n" "${v}" "${${v}}" .endfor .endfor .if (make(buildworld) || make(buildkernel) || make(kernel-toolchain) || \ make(toolchain) || make(_cross-tools)) .if ${USING_SYSTEM_COMPILER} == "yes" .info SYSTEM_COMPILER: Determined that CC=${CC} matches the source tree. Not bootstrapping a cross-compiler. .elif ${MK_CLANG_BOOTSTRAP} == "yes" .info SYSTEM_COMPILER: libclang will be built for bootstrapping a cross-compiler. .endif .if ${USING_SYSTEM_LINKER} == "yes" .info SYSTEM_LINKER: Determined that LD=${LD} matches the source tree. Not bootstrapping a cross-linker. .elif ${MK_LLD_BOOTSTRAP} == "yes" .info SYSTEM_LINKER: libclang will be built for bootstrapping a cross-linker. .endif .endif # End WITH_SYSTEM_COMPILER / WITH_SYSTEM_LD # Store some compiler metadata for use in installworld where we don't # want to invoke CC at all. 
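# The decisions above can be inspected with "make test-system-compiler" or
# "make test-system-linker", which print the variables listed in
# TEST_SYSTEM_COMPILER_VARS / TEST_SYSTEM_LINKER_VARS.  A sketch of the
# generated ${OBJTOP}/toolchain-metadata.mk (values are illustrative, from a
# hypothetical clang 9 build):
#   _LOADED_TOOLCHAIN_METADATA=t
#   COMPILER_TYPE=clang
#   COMPILER_VERSION=90000
#   X_COMPILER_TYPE=clang
#   .export COMPILER_VERSION COMPILER_TYPE ...
# installworld then includes this file instead of probing ${CC} again.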
_TOOLCHAIN_METADATA_VARS= COMPILER_VERSION \ COMPILER_TYPE \ COMPILER_FEATURES \ COMPILER_FREEBSD_VERSION \ LINKER_VERSION \ LINKER_FEATURES \ LINKER_TYPE \ LINKER_FREEBSD_VERSION toolchain-metadata.mk: .PHONY .META @: > ${.TARGET} @echo ".info Using cached toolchain metadata from build at $$(hostname) on $$(date)" \ > ${.TARGET} @echo "_LOADED_TOOLCHAIN_METADATA=t" >> ${.TARGET} .for v in ${_TOOLCHAIN_METADATA_VARS} @echo "${v}=${${v}}" >> ${.TARGET} @echo "X_${v}=${X_${v}}" >> ${.TARGET} .endfor @echo ".export ${_TOOLCHAIN_METADATA_VARS}" >> ${.TARGET} @echo ".export ${_TOOLCHAIN_METADATA_VARS:C,^,X_,}" >> ${.TARGET} # We must do lib/ and libexec/ before bin/ in case of a mid-install error to # keep the users system reasonably usable. For static->dynamic root upgrades, # we don't want to install a dynamic binary without rtld and the needed # libraries. More commonly, for dynamic root, we don't want to install a # binary that requires a newer library version that hasn't been installed yet. # This ordering is not a guarantee though. The only guarantee of a working # system here would require fine-grained ordering of all components based # on their dependencies. .if !empty(SUBDIR_OVERRIDE) SUBDIR= ${SUBDIR_OVERRIDE} .else SUBDIR= lib libexec # Add LOCAL_LIB_DIRS, but only if they will not be picked up as a SUBDIR # of a LOCAL_DIRS directory. This allows LOCAL_DIRS=foo and # LOCAL_LIB_DIRS=foo/lib to behave as expected. .for _DIR in ${LOCAL_DIRS:M*/} ${LOCAL_DIRS:N*/:S|$|/|} _REDUNDANT_LIB_DIRS+= ${LOCAL_LIB_DIRS:M${_DIR}*} .endfor .for _DIR in ${LOCAL_LIB_DIRS} .if ${_DIR} == ".WAIT" || (empty(_REDUNDANT_LIB_DIRS:M${_DIR}) && exists(${.CURDIR}/${_DIR}/Makefile)) SUBDIR+= ${_DIR} .endif .endfor .if !defined(NO_ROOT) && (make(installworld) || make(install)) # Ensure libraries are installed before progressing. SUBDIR+=.WAIT .endif SUBDIR+=bin .if ${MK_CDDL} != "no" SUBDIR+=cddl .endif SUBDIR+=gnu include .if ${MK_KERBEROS} != "no" SUBDIR+=kerberos5 .endif .if ${MK_RESCUE} != "no" SUBDIR+=rescue .endif SUBDIR+=sbin .if ${MK_CRYPT} != "no" SUBDIR+=secure .endif .if !defined(NO_SHARE) SUBDIR+=share .endif .if ${MK_BOOT} != "no" SUBDIR+=stand .endif SUBDIR+=sys usr.bin usr.sbin .if ${MK_TESTS} != "no" SUBDIR+= tests .endif # Local directories are built in parallel with the base system directories. # Users may insert a .WAIT directive at the beginning or elsewhere within # the LOCAL_DIRS and LOCAL_LIB_DIRS lists as needed. .for _DIR in ${LOCAL_DIRS} .if ${_DIR} == ".WAIT" || exists(${.CURDIR}/${_DIR}/Makefile) SUBDIR+= ${_DIR} .endif .endfor # We must do etc/ last as it hooks into building the man whatis file # by calling 'makedb' in share/man. This is only relevant for # install/distribute so they build the whatis file after every manpage is # installed. .if make(installworld) || make(install) SUBDIR+=.WAIT .endif SUBDIR+=etc .endif # !empty(SUBDIR_OVERRIDE) .if defined(NOCLEAN) .warning NOCLEAN option is deprecated. Use NO_CLEAN instead. NO_CLEAN= ${NOCLEAN} .endif .if defined(NO_CLEANDIR) CLEANDIR= clean cleandepend .else CLEANDIR= cleandir .endif .if defined(WORLDFAST) NO_CLEAN= t NO_OBJWALK= t .endif .if ${MK_META_MODE} == "yes" # If filemon is used then we can rely on the build being incremental-safe. # The .meta files will also track the build command and rebuild should # it change. 
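# META_MODE itself is an opt-in; a minimal sketch of enabling it (assuming
# the filemon(4) module is available on the build host) is:
#   kldload filemon
#   echo 'WITH_META_MODE=yes' >> /etc/src-env.conf
# With filemon loaded, .MAKE.MODE does not contain "nofilemon" and the
# clean step is skipped on incremental rebuilds.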
.if empty(.MAKE.MODE:Mnofilemon) NO_CLEAN= t .endif .endif .if defined(NO_OBJWALK) || ${MK_AUTO_OBJ} == "yes" NO_OBJWALK= t NO_KERNELOBJ= t .endif .if !defined(NO_OBJWALK) _obj= obj .endif LOCAL_TOOL_DIRS?= PACKAGEDIR?= ${DESTDIR}/${DISTDIR} .if empty(SHELL:M*csh*) BUILDENV_SHELL?=${SHELL} .else BUILDENV_SHELL?=/bin/sh .endif .if !defined(_MKSHOWCONFIG) .if !defined(SVN_CMD) || empty(SVN_CMD) . for _P in /usr/bin /usr/local/bin . for _S in svn svnlite . if exists(${_P}/${_S}) SVN_CMD= ${_P}/${_S} . endif . endfor . endfor .export SVN_CMD .endif SVNFLAGS?= -r HEAD .if !defined(VCS_REVISION) || empty(VCS_REVISION) .if !defined(SVNVERSION_CMD) || empty(SVNVERSION_CMD) . for _D in ${PATH:S,:, ,g} . if exists(${_D}/svnversion) SVNVERSION_CMD?=${_D}/svnversion . endif . if exists(${_D}/svnliteversion) SVNVERSION_CMD?=${_D}/svnliteversion . endif . endfor .endif _VCS_REVISION?= $$(eval ${SVNVERSION_CMD} ${SRCDIR}) . if !empty(_VCS_REVISION) VCS_REVISION= $$(echo r${_VCS_REVISION}) . endif .export VCS_REVISION .endif .if !defined(OSRELDATE) .if exists(/usr/include/osreldate.h) OSRELDATE!= awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ /usr/include/osreldate.h .else OSRELDATE= 0 .endif .export OSRELDATE .endif # Set VERSION for CTFMERGE to use via the default CTFFLAGS=-L VERSION. .if !defined(_REVISION) _REVISION!= ${MAKE} -C ${SRCDIR}/release MK_AUTO_OBJ=no -V REVISION .export _REVISION .endif .if !defined(_BRANCH) _BRANCH!= ${MAKE} -C ${SRCDIR}/release MK_AUTO_OBJ=no -V BRANCH .export _BRANCH .endif .if !defined(SRCRELDATE) SRCRELDATE!= awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ ${SRCDIR}/sys/sys/param.h .export SRCRELDATE .endif .if !defined(VERSION) VERSION= FreeBSD ${_REVISION}-${_BRANCH:C/-p[0-9]+$//} ${TARGET_ARCH} ${SRCRELDATE} .export VERSION .endif .if !defined(PKG_VERSION) .if ${_BRANCH:MSTABLE*} || ${_BRANCH:MCURRENT*} TIMENOW= %Y%m%d%H%M%S EXTRA_REVISION= .s${TIMENOW:gmtime} .elif ${_BRANCH:MALPHA*} EXTRA_REVISION= _${_BRANCH:C/-ALPHA/.a/} .elif ${_BRANCH:MBETA*} EXTRA_REVISION= _${_BRANCH:C/-BETA/.b/} .elif ${_BRANCH:MRC*} EXTRA_REVISION= _${_BRANCH:C/-RC/.r/} .elif ${_BRANCH:MPRERELEASE*} EXTRA_REVISION= _${_BRANCH:C/-PRERELEASE/.p/} .elif ${_BRANCH:M*-p*} EXTRA_REVISION= _${_BRANCH:C/.*-p([0-9]+$)/\1/} .endif PKG_VERSION= ${_REVISION}${EXTRA_REVISION} .endif .endif # !defined(PKG_VERSION) .if !defined(_MKSHOWCONFIG) _CPUTYPE!= MAKEFLAGS= CPUTYPE=${_TARGET_CPUTYPE} ${MAKE} -f /dev/null \ -m ${.CURDIR}/share/mk MK_AUTO_OBJ=no -V CPUTYPE .if ${_CPUTYPE} != ${_TARGET_CPUTYPE} .error CPUTYPE global should be set with ?=. .endif .endif .if make(buildworld) BUILD_ARCH!= uname -p .if ${MACHINE_ARCH} != ${BUILD_ARCH} .error To cross-build, set TARGET_ARCH. .endif .endif WORLDTMP?= ${OBJTOP}/tmp BPATH= ${CCACHE_WRAPPER_PATH_PFX}${WORLDTMP}/legacy/usr/sbin:${WORLDTMP}/legacy/usr/bin:${WORLDTMP}/legacy/bin XPATH= ${WORLDTMP}/usr/sbin:${WORLDTMP}/usr/bin # When building we want to find the cross tools before the host tools in ${BPATH}. # We also need to add UNIVERSE_TOOLCHAIN_PATH so that we can find the shared # toolchain files (clang, lld, etc.) during make universe/tinderbox STRICTTMPPATH= ${XPATH}:${BPATH}:${UNIVERSE_TOOLCHAIN_PATH} # We should not be using tools from /usr/bin accidentally since this could cause # the build to break on other systems that don't have that tool. For now we # still allow using the old behaviour (inheriting $PATH) if # BUILD_WITH_STRICT_TMPPATH is set to 0 but this will eventually be removed. 
# Currently strict $PATH can cause build failures and does not work yet with # USING_SYSTEM_LINKER/USING_SYSTEM_COMPILER. Once these issues have been # resolved it will be turned on by default. BUILD_WITH_STRICT_TMPPATH?=0 .if ${BUILD_WITH_STRICT_TMPPATH} != 0 TMPPATH= ${STRICTTMPPATH} .else TMPPATH= ${STRICTTMPPATH}:${PATH} .endif # # Avoid running mktemp(1) unless actually needed. # It may not be functional, e.g., due to new ABI # when in the middle of installing over this system. # .if make(distributeworld) || make(installworld) || make(stageworld) .if ${BUILD_WITH_STRICT_TMPPATH} != 0 MKTEMP=${WORLDTMP}/legacy/usr/bin/mktemp .if !exists(${MKTEMP}) .error "mktemp binary doesn't exist in expected location: ${MKTEMP}" .endif .else MKTEMP=mktemp .endif INSTALLTMP!= ${MKTEMP} -d -u -t install .endif .if make(stagekernel) || make(distributekernel) TAGS+= kernel PACKAGE= kernel .endif # # Building a world goes through the following stages # # 1. legacy stage [BMAKE] # This stage is responsible for creating compatibility # shims that are needed by the bootstrap-tools, # build-tools and cross-tools stages. These are generally # APIs that tools from one of those three stages need to # build that aren't present on the host. # 1. bootstrap-tools stage [BMAKE] # This stage is responsible for creating programs that # are needed for backward compatibility reasons. They # are not built as cross-tools. # 2. build-tools stage [TMAKE] # This stage is responsible for creating the object # tree and building any tools that are needed during # the build process. Some programs are listed during # this phase because they build binaries to generate # files needed to build these programs. This stage also # builds the 'build-tools' target rather than 'all'. # 3. cross-tools stage [XMAKE] # This stage is responsible for creating any tools that # are needed for building the system. A cross-compiler is one # of them. This differs from build tools in two ways: # 1. the 'all' target is built rather than 'build-tools' # 2. these tools are installed into TMPPATH for stage 4. # 4. world stage [WMAKE] # This stage actually builds the world. # 5. install stage (optional) [IMAKE] # This stage installs a previously built world. # BOOTSTRAPPING?= 0 # Keep these in sync MINIMUM_SUPPORTED_OSREL?= 1002501 MINIMUM_SUPPORTED_REL?= 10.3 # Common environment for world related stages CROSSENV+= \ MACHINE_ARCH=${TARGET_ARCH} \ MACHINE=${TARGET} \ CPUTYPE=${TARGET_CPUTYPE} .if ${MK_META_MODE} != "no" # Don't rebuild build-tools targets during normal build. 
CROSSENV+= BUILD_TOOLS_META=.NOMETA .endif .if defined(TARGET_CFLAGS) CROSSENV+= ${TARGET_CFLAGS} .endif BOOTSTRAPPING_OSRELDATE?=${OSRELDATE} # bootstrap-tools stage BMAKEENV= INSTALL="sh ${.CURDIR}/tools/install.sh" \ TOOLS_PREFIX=${TOOLS_PREFIX_UNDEF:U${WORLDTMP}} \ PATH=${BPATH}:${PATH} \ WORLDTMP=${WORLDTMP} \ MAKEFLAGS="-m ${.CURDIR}/tools/build/mk ${.MAKEFLAGS}" # need to keep this in sync with targets/pseudo/bootstrap-tools/Makefile BSARGS= DESTDIR= \ OBJTOP='${WORLDTMP}/obj-tools' \ OBJROOT='$${OBJTOP}/' \ MAKEOBJDIRPREFIX= \ BOOTSTRAPPING=${BOOTSTRAPPING_OSRELDATE} \ BWPHASE=${.TARGET:C,^_,,} \ SSP_CFLAGS= \ MK_HTML=no NO_LINT=yes MK_MAN=no \ -DNO_PIC MK_PROFILE=no -DNO_SHARED \ -DNO_CPU_CFLAGS MK_WARNS=no MK_CTF=no \ MK_CLANG_EXTRAS=no MK_CLANG_FULL=no \ MK_LLDB=no MK_RETPOLINE=no MK_TESTS=no \ MK_INCLUDES=yes BMAKE= \ ${BMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ ${BSARGS} .if empty(.MAKEOVERRIDES:MMK_LLVM_TARGET_ALL) BMAKE+= MK_LLVM_TARGET_ALL=no .endif # build-tools stage TMAKE= \ ${BMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ DESTDIR= \ BOOTSTRAPPING=${BOOTSTRAPPING_OSRELDATE} \ BWPHASE=${.TARGET:C,^_,,} \ SSP_CFLAGS= \ -DNO_LINT \ -DNO_CPU_CFLAGS MK_WARNS=no MK_CTF=no \ MK_CLANG_EXTRAS=no MK_CLANG_FULL=no \ MK_LLDB=no MK_RETPOLINE=no MK_TESTS=no # cross-tools stage # TOOLS_PREFIX set in BMAKE XMAKE= ${BMAKE} \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ MK_GDB=no MK_TESTS=no # kernel-tools stage KTMAKEENV= INSTALL="sh ${.CURDIR}/tools/install.sh" \ PATH=${BPATH}:${PATH} \ WORLDTMP=${WORLDTMP} KTMAKE= \ TOOLS_PREFIX=${TOOLS_PREFIX_UNDEF:U${WORLDTMP}} \ ${KTMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ DESTDIR= \ OBJTOP='${WORLDTMP}/obj-kernel-tools' \ OBJROOT='$${OBJTOP}/' \ MAKEOBJDIRPREFIX= \ BOOTSTRAPPING=${BOOTSTRAPPING_OSRELDATE} \ SSP_CFLAGS= \ MK_HTML=no -DNO_LINT MK_MAN=no \ -DNO_PIC MK_PROFILE=no -DNO_SHARED \ -DNO_CPU_CFLAGS MK_RETPOLINE=no MK_WARNS=no MK_CTF=no # world stage WMAKEENV= ${CROSSENV} \ INSTALL="sh ${.CURDIR}/tools/install.sh" \ PATH=${TMPPATH} \ SYSROOT=${WORLDTMP} # make hierarchy HMAKE= PATH=${TMPPATH} ${MAKE} LOCAL_MTREE=${LOCAL_MTREE:Q} .if defined(NO_ROOT) HMAKE+= PATH=${TMPPATH} METALOG=${METALOG} -DNO_ROOT .endif CROSSENV+= CC="${XCC} ${XCFLAGS}" CXX="${XCXX} ${XCXXFLAGS} ${XCFLAGS}" \ CPP="${XCPP} ${XCFLAGS}" \ AS="${XAS}" AR="${XAR}" LD="${XLD}" LLVM_LINK="${XLLVM_LINK}" \ NM=${XNM} OBJCOPY="${XOBJCOPY}" \ RANLIB=${XRANLIB} STRINGS=${XSTRINGS} \ SIZE="${XSIZE}" .if defined(CROSS_BINUTILS_PREFIX) && exists(${CROSS_BINUTILS_PREFIX}) # In the case of xdev-build tools, CROSS_BINUTILS_PREFIX won't be a # directory, but the compiler will look in the right place for its # tools so we don't need to tell it where to look. BFLAGS+= -B${CROSS_BINUTILS_PREFIX} .endif # The internal bootstrap compiler has a default sysroot set by TOOLS_PREFIX # and target set by TARGET/TARGET_ARCH. However, there are several needs to # always pass an explicit --sysroot and -target. # - External compiler needs sysroot and target flags. # - External ld needs sysroot. # - To be clear about the use of a sysroot when using the internal compiler. # - Easier debugging. # - Allowing WITH_SYSTEM_COMPILER+WITH_META_MODE to work together due to # the flip-flopping build command when sometimes using external and # sometimes using internal. # - Allow using lld which has no support for default paths. 
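# The net effect (an illustrative aarch64 case; the exact triple comes from
# TARGET_TRIPLE above) is that world is compiled with something like:
#   cc -target aarch64-unknown-freebsd13.0 --sysroot=${WORLDTMP} \
#       -B${WORLDTMP}/usr/bin ...
# whether the compiler is the bootstrapped one or an external toolchain.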
.if !defined(CROSS_BINUTILS_PREFIX) || !exists(${CROSS_BINUTILS_PREFIX}) BFLAGS+= -B${WORLDTMP}/usr/bin .endif .if ${WANT_COMPILER_TYPE} == gcc || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == gcc) .elif ${WANT_COMPILER_TYPE} == clang || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == clang) XCFLAGS+= -target ${TARGET_TRIPLE} .endif XCFLAGS+= --sysroot=${WORLDTMP} .if !empty(BFLAGS) XCFLAGS+= ${BFLAGS} .endif .if ${MK_LIB32} != "no" && (${TARGET_ARCH} == "amd64" || \ ${TARGET_ARCH} == "powerpc64" || ${TARGET_ARCH:Mmips64*} != "") LIBCOMPAT= 32 .include "Makefile.libcompat" .elif ${MK_LIBSOFT} != "no" && ${TARGET_ARCH:Marmv[67]*} != "" LIBCOMPAT= SOFT .include "Makefile.libcompat" .endif # META_MODE normally ignores host file changes since every build updates # timestamps (see NO_META_IGNORE_HOST in sys.mk). There are known times # when the ABI breaks though that we want to force rebuilding WORLDTMP # to get updated host tools. .if ${MK_META_MODE} == "yes" && defined(NO_CLEAN) && \ !defined(NO_META_IGNORE_HOST) && !defined(NO_META_IGNORE_HOST_HEADERS) && \ !defined(_MKSHOWCONFIG) # r318736 - ino64 major ABI breakage META_MODE_BAD_ABI_VERS+= 1200031 .if !defined(OBJDIR_HOST_OSRELDATE) .if exists(${OBJTOP}/host-osreldate.h) OBJDIR_HOST_OSRELDATE!= \ awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ ${OBJTOP}/host-osreldate.h .elif exists(${WORLDTMP}/usr/include/osreldate.h) OBJDIR_HOST_OSRELDATE= 0 .endif .export OBJDIR_HOST_OSRELDATE .endif # Note that this logic is the opposite of normal BOOTSTRAP handling. We want # to compare the WORLDTMP's OSRELDATE to the host's OSRELDATE. If the WORLDTMP # is older than the ABI-breakage OSRELDATE of the HOST then we rebuild. .if defined(OBJDIR_HOST_OSRELDATE) .for _ver in ${META_MODE_BAD_ABI_VERS} .if ${OSRELDATE} >= ${_ver} && ${OBJDIR_HOST_OSRELDATE} < ${_ver} _meta_mode_need_rebuild= ${_ver} .endif .endfor .if defined(_meta_mode_need_rebuild) .info META_MODE: Rebuilding host tools due to ABI breakage in __FreeBSD_version ${_meta_mode_need_rebuild}. NO_META_IGNORE_HOST_HEADERS= 1 .export NO_META_IGNORE_HOST_HEADERS .endif # defined(_meta_mode_need_rebuild) .endif # defined(OBJDIR_HOST_OSRELDATE) .endif # ${MK_META_MODE} == "yes" && defined(NO_CLEAN) ... # This is only used for META_MODE+filemon to track what the oldest # __FreeBSD_version is in WORLDTMP. This purposely does NOT have # a make dependency on /usr/include/osreldate.h as the file should # only be copied when it is missing or meta mode determines it has changed. # Since host files are normally ignored without NO_META_IGNORE_HOST # the file will never be updated unless that flag is specified. This # allows tracking the oldest osreldate to force rebuilds via # META_MODE_BADABI_REVS above. host-osreldate.h: # DO NOT ADD /usr/include/osreldate.h here @cp -f /usr/include/osreldate.h ${.TARGET} WMAKE= ${WMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ BWPHASE=${.TARGET:C,^_,,} \ DESTDIR=${WORLDTMP} IMAKEENV= ${CROSSENV} IMAKE= ${IMAKEENV} ${MAKE} -f Makefile.inc1 \ ${IMAKE_INSTALL} ${IMAKE_MTREE} .if empty(.MAKEFLAGS:M-n) IMAKEENV+= PATH=${STRICTTMPPATH}:${INSTALLTMP} \ LD_LIBRARY_PATH=${INSTALLTMP} \ PATH_LOCALE=${INSTALLTMP}/locale IMAKE+= __MAKE_SHELL=${INSTALLTMP}/sh .else IMAKEENV+= PATH=${TMPPATH}:${INSTALLTMP} .endif # When generating install media, do not allow user and group information from # the build host to affect the contents of the distribution. 
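# Concretely (an illustrative invocation, not the exact command used below),
# DB_FROM_SRC makes install(1) and mtree(8) resolve users and groups against
# the source tree instead of the host databases, e.g.:
#   install -N ${.CURDIR}/etc -o root -g wheel -m 555 prog ${DESTDIR}/usr/bin/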
.if make(distributeworld) || make(distrib-dirs) || make(distribution) DB_FROM_SRC= yes .endif .if defined(DB_FROM_SRC) INSTALLFLAGS+= -N ${.CURDIR}/etc MTREEFLAGS+= -N ${.CURDIR}/etc .endif _INSTALL_DDIR= ${DESTDIR}/${DISTDIR} INSTALL_DDIR= ${_INSTALL_DDIR:S://:/:g:C:/$::} .if defined(NO_ROOT) METALOG?= ${DESTDIR}/${DISTDIR}/METALOG METALOG:= ${METALOG:C,//+,/,g} IMAKE+= -DNO_ROOT METALOG=${METALOG} INSTALLFLAGS+= -U -M ${METALOG} -D ${INSTALL_DDIR} MTREEFLAGS+= -W .endif .if defined(BUILD_PKGS) INSTALLFLAGS+= -h sha256 .endif .if defined(DB_FROM_SRC) || defined(NO_ROOT) IMAKE_INSTALL= INSTALL="install ${INSTALLFLAGS}" IMAKE_MTREE= MTREE_CMD="mtree ${MTREEFLAGS}" .endif DESTDIR_MTREEFLAGS= -deU # When creating worldtmp we don't need to set the directories as owned by root # so we also pass -W WORLDTMP_MTREEFLAGS= -deUW .if defined(NO_ROOT) # When building with -DNO_ROOT we shouldn't be changing the directories # that are created by mtree to be owned by root/wheel. DESTDIR_MTREEFLAGS+= -W .endif MTREE?= mtree .if ${BUILD_WITH_STRICT_TMPPATH} != 0 MTREE= ${WORLDTMP}/legacy/usr/sbin/mtree .endif WORLDTMP_MTREE= ${MTREE} ${WORLDTMP_MTREEFLAGS} DESTDIR_MTREE= ${MTREE} ${DESTDIR_MTREEFLAGS} # kernel stage KMAKEENV= ${WMAKEENV:NSYSROOT=*} KMAKE= ${KMAKEENV} ${MAKE} ${.MAKEFLAGS} ${KERNEL_FLAGS} KERNEL=${INSTKERNNAME} # # buildworld # # Attempt to rebuild the entire system, with reasonable chance of # success, regardless of how old your existing system is. # _sanity_check: .PHONY .MAKE .if ${.CURDIR:C/[^,]//g} != "" # The m4 build of sendmail files doesn't like it if ',' is used # anywhere in the path of it's files. @echo @echo "*** Error: path to source tree contains a comma ','" @echo @false .elif ${.CURDIR:M*\:*} != "" # Using ':' leaks into PATH and breaks finding cross-tools. @echo @echo "*** Error: path to source tree contains a colon ':'" @echo @false .endif # Our current approach to dependency tracking cannot cope with certain source # tree changes, particularly with respect to removing source files and # replacing generated files. Handle these cases here in an ad-hoc fashion. 
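# Each hack below follows the same shape, sketched here for a hypothetical
# libc file foo that moved from foo.S to foo.c: if the recorded dependencies
# still name the removed source, delete the stale .depend files so make
# regenerates them:
#   if egrep -qw 'foo\.[sS]' ${OBJTOP}/lib/libc/.depend.foo.o; then
#           rm -f ${OBJTOP}/lib/libc/.depend.foo.*
#   fi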
_cleanobj_fast_depend_hack: .PHONY
# Syscall stubs rewritten in C and obsolete MD assembly implementations
# Date      SVN Rev      Syscalls
# 20180604  r334626      brk sbrk
.for f in brk sbrk
	@if [ -e "${OBJTOP}/lib/libc/.depend.${f}.o" ] && \
	    egrep -qw '${f}\.[sS]' ${OBJTOP}/lib/libc/.depend.${f}.o; then \
		echo "Removing stale dependencies for ${f} syscall wrappers"; \
		rm -f ${OBJTOP}/lib/libc/.depend.${f}.* \
		    ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.${f}.*}; \
	fi
.endfor
# 20181013  r339348  bcopy reimplemented as .c
.for f in bcopy memcpy memmove
	@if [ -e "${OBJTOP}/lib/libc/.depend.${f}.o" ] && \
	    egrep -qw 'bcopy\.[sS]' ${OBJTOP}/lib/libc/.depend.${f}.o; then \
		echo "Removing stale dependencies for bcopy"; \
		rm -f ${OBJTOP}/lib/libc/.depend.${f}.* \
		    ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.${f}.*}; \
	fi
.endfor
# 20181115  r340463  bzero reimplemented as .c
	@if [ -e "${OBJTOP}/lib/libc/.depend.bzero.o" ] && \
	    egrep -qw 'bzero\.[sS]' ${OBJTOP}/lib/libc/.depend.bzero.o; then \
		echo "Removing stale dependencies for bzero"; \
		rm -f ${OBJTOP}/lib/libc/.depend.bzero.* \
		    ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.bzero.*}; \
	fi
# 20181009  track migration from ntp's embedded libevent to updated one
	@if [ -e "${OBJTOP}/usr.sbin/ntp/libntpevent/.depend.bufferevent_openssl.o" ] && \
	    egrep -q 'contrib/ntp/sntp/libevent/bufferevent_openssl.c' \
	    ${OBJTOP}/usr.sbin/ntp/libntpevent/.depend.bufferevent_openssl.o ; then \
		echo "Removing stale libevent dependencies"; \
		rm -f ${OBJTOP}/usr.sbin/ntp/libntpevent/.depend.*; \
	fi
# 20181209  r341759  track migration across wpa update
	@if [ -e "${OBJTOP}/usr.sbin/wpa/wpa_supplicant/.depend.rrm.o" ] && \
	    egrep -q 'src/ap/rrm.c' \
	    ${OBJTOP}/usr.sbin/wpa/wpa_supplicant/.depend.rrm.o; then \
		echo "Removing stale wpa dependencies"; \
		rm -f ${OBJTOP}/usr.sbin/wpa/*/.depend*; \
	fi

_worldtmp: .PHONY
	@echo
	@echo "--------------------------------------------------------------"
	@echo ">>> Rebuilding the temporary build tree"
	@echo "--------------------------------------------------------------"
.if !defined(NO_CLEAN)
	rm -rf ${WORLDTMP}
.else
# Note: for delete-old we need to set $PATH to also include the host $PATH
# since otherwise a partial build with missing symlinks in ${WORLDTMP}/legacy/
# will fail to run due to missing binaries. $WMAKE sets PATH to only ${TMPPATH}
# so we remove that assignment from $WMAKE and prepend the new $PATH
	${_+_}@if [ -e "${WORLDTMP}" ]; then \
		echo ">>> Deleting stale files in build tree..."; \
		cd ${.CURDIR}; env PATH=${TMPPATH}:${PATH} ${WMAKE:NPATH=*} \
		    _NO_INCLUDE_COMPILERMK=t -DBATCH_DELETE_OLD_FILES delete-old \
		    delete-old-libs >/dev/null; \
	fi
	rm -rf ${WORLDTMP}/legacy/usr/include
.if ${USING_SYSTEM_COMPILER} == "yes"
.for cc in cc c++
	if [ -x ${WORLDTMP}/usr/bin/${cc} ]; then \
		inum=$$(stat -f %i ${WORLDTMP}/usr/bin/${cc}); \
		find ${WORLDTMP}/usr/bin -inum $${inum} -delete; \
	fi
.endfor
.endif	# ${USING_SYSTEM_COMPILER} == "yes"
.if ${USING_SYSTEM_LINKER} == "yes"
	@rm -f ${WORLDTMP}/usr/bin/ld ${WORLDTMP}/usr/bin/ld.lld
.endif	# ${USING_SYSTEM_LINKER} == "yes"
.endif	# !defined(NO_CLEAN)
	@mkdir -p ${WORLDTMP}
	@touch ${WORLDTMP}/${.TARGET}
# We can't use mtree to create the worldtmp directories since it may not be
# available on the target system (this happens e.g.
when building on non-FreeBSD) cd ${.CURDIR}/tools/build; \ ${MAKE} DIRPRFX=tools/build/ DESTDIR=${WORLDTMP}/legacy installdirs # In order to build without inheriting $PATH we need to add symlinks to the host # tools in $WORLDTMP for the tools that we don't build during bootstrap-tools cd ${.CURDIR}/tools/build; \ ${MAKE} DIRPRFX=tools/build/ DESTDIR=${WORLDTMP}/legacy host-symlinks _legacy: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1.1: legacy release compatibility shims" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${BMAKE} legacy _bootstrap-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1.2: bootstrap tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${BMAKE} bootstrap-tools mkdir -p ${WORLDTMP}/usr ${WORLDTMP}/lib/casper ${WORLDTMP}/lib/geom ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${WORLDTMP}/usr >/dev/null ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${WORLDTMP}/usr/include >/dev/null ln -sf ${.CURDIR}/sys ${WORLDTMP} .if ${MK_DEBUG_FILES} != "no" ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.debug.dist \ -p ${WORLDTMP}/usr/lib >/dev/null .endif .for _mtree in ${LOCAL_MTREE} ${WORLDTMP_MTREE} -f ${.CURDIR}/${_mtree} -p ${WORLDTMP} > /dev/null .endfor _cleanobj: .if !defined(NO_CLEAN) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.1: cleaning up the object tree" @echo "--------------------------------------------------------------" # Avoid including bsd.compiler.mk in clean and obj with _NO_INCLUDE_COMPILERMK # since the restricted $PATH might not contain a valid cc binary ${_+_}cd ${.CURDIR}; ${WMAKE} _NO_INCLUDE_COMPILERMK=t ${CLEANDIR} .if defined(LIBCOMPAT) ${_+_}cd ${.CURDIR}; ${LIBCOMPATWMAKE} _NO_INCLUDE_COMPILERMK=t -f Makefile.inc1 ${CLEANDIR} .endif .else ${_+_}cd ${.CURDIR}; ${WMAKE} _NO_INCLUDE_COMPILERMK=t _cleanobj_fast_depend_hack .endif # !defined(NO_CLEAN) _obj: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.2: rebuilding the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${WMAKE} _NO_INCLUDE_COMPILERMK=t obj _build-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.3: build tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${TMAKE} build-tools _cross-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3: cross tools" @echo "--------------------------------------------------------------" @rm -f ${OBJTOP}/toolchain-metadata.mk ${_+_}cd ${.CURDIR}; ${XMAKE} cross-tools ${_+_}cd ${.CURDIR}; ${XMAKE} kernel-tools _build-metadata: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3.1: recording build metadata" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${WMAKE} toolchain-metadata.mk ${_+_}cd ${.CURDIR}; ${WMAKE} host-osreldate.h _includes: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 4.1: building includes" @echo "--------------------------------------------------------------" # Special handling for SUBDIR_OVERRIDE in buildworld as they most likely need # headers from default SUBDIR. 
# Do SUBDIR_OVERRIDE includes last.
	${_+_}cd ${.CURDIR}; ${WMAKE} SUBDIR_OVERRIDE= SHARED=symlinks \
	    MK_INCLUDES=yes includes
.if !empty(SUBDIR_OVERRIDE) && make(buildworld)
	${_+_}cd ${.CURDIR}; ${WMAKE} MK_INCLUDES=yes SHARED=symlinks includes
.endif

_libraries:
	@echo
	@echo "--------------------------------------------------------------"
	@echo ">>> stage 4.2: building libraries"
	@echo "--------------------------------------------------------------"
	${_+_}cd ${.CURDIR}; \
	    ${WMAKE} -DNO_FSCHG MK_HTML=no -DNO_LINT MK_MAN=no \
	    MK_PROFILE=no MK_TESTS=no MK_TESTS_SUPPORT=${MK_TESTS} libraries

everything: .PHONY
	@echo
	@echo "--------------------------------------------------------------"
	@echo ">>> stage 4.3: building everything"
	@echo "--------------------------------------------------------------"
	${_+_}cd ${.CURDIR}; _PARALLEL_SUBDIR_OK=1 ${WMAKE} all

WMAKE_TGTS=
.if !defined(WORLDFAST)
WMAKE_TGTS+=	_sanity_check _worldtmp _legacy
.if empty(SUBDIR_OVERRIDE)
WMAKE_TGTS+=	_bootstrap-tools
.endif
WMAKE_TGTS+=	_cleanobj
.if !defined(NO_OBJWALK)
WMAKE_TGTS+=	_obj
.endif
WMAKE_TGTS+=	_build-tools _cross-tools
WMAKE_TGTS+=	_build-metadata
WMAKE_TGTS+=	_includes
.endif
.if !defined(NO_LIBS)
WMAKE_TGTS+=	_libraries
.endif
WMAKE_TGTS+=	everything
.if defined(LIBCOMPAT) && empty(SUBDIR_OVERRIDE)
WMAKE_TGTS+=	build${libcompat}
.endif

# record buildworld time in seconds
.if make(buildworld)
_BUILDWORLD_START!= date '+%s'
.export _BUILDWORLD_START
.endif

buildworld: buildworld_prologue ${WMAKE_TGTS} buildworld_epilogue .PHONY
.ORDER: buildworld_prologue ${WMAKE_TGTS} buildworld_epilogue

buildworld_prologue: .PHONY
	@echo "--------------------------------------------------------------"
	@echo ">>> World build started on `LC_ALL=C date`"
	@echo "--------------------------------------------------------------"

buildworld_epilogue: .PHONY
	@echo
	@echo "--------------------------------------------------------------"
	@echo ">>> World build completed on `LC_ALL=C date`"
	@seconds=$$(($$(date '+%s') - ${_BUILDWORLD_START})); \
	  echo -n ">>> World built in $$seconds seconds, "; \
	  echo "ncpu: $$(sysctl -n hw.ncpu)${.MAKE.JOBS:S/^/, make -j/}"
	@echo "--------------------------------------------------------------"

#
# We need to have this as a target because the indirection between Makefile
# and Makefile.inc1 causes the correct PATH to be used, rather than a
# modification of the current environment's PATH.  In addition, we need
# to quote multiword values.
#
buildenvvars: .PHONY
	@echo ${WMAKEENV:Q} ${.MAKE.EXPORTED:@v@$v=\"${$v}\"@}

.if ${.TARGETS:Mbuildenv}
.if ${.MAKEFLAGS:M-j}
.error The buildenv target is incompatible with -j
.endif
.endif
BUILDENV_DIR?=	${.CURDIR}
#
# Note: make will report any errors the shell reports.  This can
# be odd if the last command in an interactive shell generates an
# error or is terminated by SIGINT.  These reported errors look bad,
# but are harmless.  Allowing them also allows BUILDENV_SHELL to
# be a complex command whose status will be returned to the caller.
# Some scripts in tools rely on this behavior to report build errors.
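# An illustrative buildenv session, reusing the toolchain and sysroot from an
# earlier buildworld to rebuild a single directory:
#   make TARGET=arm64 TARGET_ARCH=aarch64 buildenv
#   cd bin/ls && make clean all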
# buildenv: .PHONY @echo Entering world for ${TARGET_ARCH}:${TARGET} .if ${BUILDENV_SHELL:M*zsh*} @echo For ZSH you must run: export CPUTYPE=${TARGET_CPUTYPE} .endif @cd ${BUILDENV_DIR} && env ${WMAKEENV} BUILDENV=1 ${BUILDENV_SHELL} TOOLCHAIN_TGTS= ${WMAKE_TGTS:Neverything:Nbuild${libcompat}} toolchain: ${TOOLCHAIN_TGTS} .PHONY KERNEL_TOOLCHAIN_TGTS= ${TOOLCHAIN_TGTS:N_obj:N_cleanobj:N_includes:N_libraries} .if make(kernel-toolchain) .ORDER: ${KERNEL_TOOLCHAIN_TGTS} .endif kernel-toolchain: ${KERNEL_TOOLCHAIN_TGTS} .PHONY # # installcheck # # Checks to be sure system is ready for installworld/installkernel. # installcheck: _installcheck_world _installcheck_kernel .PHONY _installcheck_world: .PHONY @echo "--------------------------------------------------------------" @echo ">>> Install check world" @echo "--------------------------------------------------------------" _installcheck_kernel: .PHONY @echo "--------------------------------------------------------------" @echo ">>> Install check kernel" @echo "--------------------------------------------------------------" # # Require DESTDIR to be set if installing for a different architecture or # using the user/group database in the source tree. # .if ${TARGET_ARCH} != ${MACHINE_ARCH} || ${TARGET} != ${MACHINE} || \ defined(DB_FROM_SRC) .if !make(distributeworld) _installcheck_world: __installcheck_DESTDIR _installcheck_kernel: __installcheck_DESTDIR __installcheck_DESTDIR: .PHONY .if !defined(DESTDIR) || empty(DESTDIR) @echo "ERROR: Please set DESTDIR!"; \ false .endif .endif .endif .if !defined(DB_FROM_SRC) # # Check for missing UIDs/GIDs. # CHECK_UIDS= auditdistd CHECK_GIDS= audit CHECK_UIDS+= ntpd CHECK_GIDS+= ntpd CHECK_UIDS+= proxy CHECK_GIDS+= proxy authpf CHECK_UIDS+= smmsp CHECK_GIDS+= smmsp CHECK_UIDS+= unbound CHECK_GIDS+= unbound _installcheck_world: __installcheck_UGID __installcheck_UGID: .PHONY .for uid in ${CHECK_UIDS} @if ! `id -u ${uid} >/dev/null 2>&1`; then \ echo "ERROR: Required ${uid} user is missing, see /usr/src/UPDATING."; \ false; \ fi .endfor .for gid in ${CHECK_GIDS} @if ! `find / -prune -group ${gid} >/dev/null 2>&1`; then \ echo "ERROR: Required ${gid} group is missing, see /usr/src/UPDATING."; \ false; \ fi .endfor .endif # # If installing over the running system (DESTDIR is / or unset) and the install # includes rescue, try running rescue from the objdir as a sanity check. If # rescue is not functional (e.g., because it depends on a system call not # supported by the currently running kernel), abort the installation. # .if !make(distributeworld) && ${MK_RESCUE} != "no" && \ (empty(DESTDIR) || ${DESTDIR} == "/") && empty(BYPASS_INSTALLCHECK_SH) _installcheck_world: __installcheck_sh_check __installcheck_sh_check: .PHONY @if [ "`${OBJTOP}/rescue/rescue/rescue sh -c 'echo OK'`" != \ OK ]; then \ echo "rescue/sh check failed, installation aborted" >&2; \ false; \ fi .endif # # Required install tools to be saved in a scratch dir for safety. # .if ${MK_ZONEINFO} != "no" _zoneinfo= zic tzsetup .endif ITOOLS= [ awk cap_mkdb cat chflags chmod chown cmp cp \ date echo egrep find grep id install ${_install-info} \ ln make mkdir mtree mv pwd_mkdb \ rm sed services_mkdb sh sort strip sysctl test true uname wc ${_zoneinfo} \ ${LOCAL_ITOOLS} # Needed for share/man .if ${MK_MAN_UTILS} != "no" ITOOLS+=makewhatis .endif # # distributeworld # # Distributes everything compiled by a `buildworld'. # # installworld # # Installs everything compiled by a 'buildworld'. 
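# The usual upgrade sequence (shown only as an illustration; see build(7) and
# src/UPDATING for the authoritative procedure) is roughly:
#   make buildworld
#   make buildkernel KERNCONF=GENERIC
#   make installkernel KERNCONF=GENERIC
#   (reboot)
#   make installworld
#   etcupdate (or mergemaster)
#   make delete-old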
# # Non-base distributions produced by the base system EXTRA_DISTRIBUTIONS= .if defined(LIBCOMPAT) EXTRA_DISTRIBUTIONS+= lib${libcompat} .endif .if ${MK_TESTS} != "no" EXTRA_DISTRIBUTIONS+= tests .endif DEBUG_DISTRIBUTIONS= .if ${MK_DEBUG_FILES} != "no" DEBUG_DISTRIBUTIONS+= base ${EXTRA_DISTRIBUTIONS:S,tests,,} .endif MTREE_MAGIC?= mtree 2.0 distributeworld installworld stageworld: _installcheck_world .PHONY mkdir -p ${INSTALLTMP} progs=$$(for prog in ${ITOOLS}; do \ if progpath=`which $$prog`; then \ echo $$progpath; \ else \ echo "Required tool $$prog not found in PATH." >&2; \ exit 1; \ fi; \ done); \ libs=$$(ldd -f "%o %p\n" -f "%o %p\n" $$progs 2>/dev/null | sort -u | \ while read line; do \ set -- $$line; \ if [ "$$2 $$3" != "not found" ]; then \ echo $$2; \ else \ echo "Required library $$1 not found." >&2; \ exit 1; \ fi; \ done); \ cp $$libs $$progs ${INSTALLTMP} cp -R $${PATH_LOCALE:-"/usr/share/locale"} ${INSTALLTMP}/locale .if defined(NO_ROOT) -mkdir -p ${METALOG:H} echo "#${MTREE_MAGIC}" > ${METALOG} .endif .if make(distributeworld) .for dist in ${EXTRA_DISTRIBUTIONS} -mkdir ${DESTDIR}/${DISTDIR}/${dist} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.root.dist \ -p ${DESTDIR}/${DISTDIR}/${dist} >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/include >/dev/null .if ${MK_DEBUG_FILES} != "no" ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.debug.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib >/dev/null .endif .if defined(LIBCOMPAT) ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr >/dev/null .if ${MK_DEBUG_FILES} != "no" ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib/debug/usr >/dev/null .endif .endif .if ${MK_TESTS} != "no" && ${dist} == "tests" -mkdir -p ${DESTDIR}/${DISTDIR}/${dist}${TESTSBASE} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}${TESTSBASE} >/dev/null .if ${MK_DEBUG_FILES} != "no" ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib/debug/${TESTSBASE} >/dev/null .endif .endif .if defined(NO_ROOT) ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.root.dist | \ sed -e 's#^\./#./${dist}/#' >> ${METALOG} ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.usr.dist | \ sed -e 's#^\./#./${dist}/usr/#' >> ${METALOG} ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.include.dist | \ sed -e 's#^\./#./${dist}/usr/include/#' >> ${METALOG} .if defined(LIBCOMPAT) ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist | \ sed -e 's#^\./#./${dist}/usr/#' >> ${METALOG} .endif .endif .endfor -mkdir ${DESTDIR}/${DISTDIR}/base ${_+_}cd ${.CURDIR}/etc; ${CROSSENV} PATH=${TMPPATH} ${MAKE} \ METALOG=${METALOG} ${IMAKE_INSTALL} ${IMAKE_MTREE} \ DISTBASE=/base DESTDIR=${DESTDIR}/${DISTDIR}/base \ LOCAL_MTREE=${LOCAL_MTREE:Q} distrib-dirs ${INSTALL_SYMLINK} ${INSTALLFLAGS} usr/src/sys ${INSTALL_DDIR}/base/sys .endif ${_+_}cd ${.CURDIR}; ${IMAKE} re${.TARGET:S/world$//}; \ ${IMAKEENV} rm -rf ${INSTALLTMP} .if make(distributeworld) .for dist in ${EXTRA_DISTRIBUTIONS} find ${DESTDIR}/${DISTDIR}/${dist} -mindepth 1 -type d -empty -delete .endfor .if defined(NO_ROOT) .for dist in base ${EXTRA_DISTRIBUTIONS} @# For each file that exists in this dist, print the corresponding @# line from the METALOG. 
This relies on the fact that @# a line containing only the filename will sort immediately before @# the relevant mtree line. cd ${DESTDIR}/${DISTDIR}; \ find ./${dist} | sort -u ${METALOG} - | \ awk 'BEGIN { print "#${MTREE_MAGIC}" } !/ type=/ { file = $$1 } / type=/ { if ($$1 == file) { sub(/^\.\/${dist}\//, "./"); print } }' > \ ${DESTDIR}/${DISTDIR}/${dist}.meta .endfor .for dist in ${DEBUG_DISTRIBUTIONS} @# For each file that exists in this dist, print the corresponding @# line from the METALOG. This relies on the fact that @# a line containing only the filename will sort immediately before @# the relevant mtree line. cd ${DESTDIR}/${DISTDIR}; \ find ./${dist}/usr/lib/debug | sort -u ${METALOG} - | \ awk 'BEGIN { print "#${MTREE_MAGIC}" } !/ type=/ { file = $$1 } / type=/ { if ($$1 == file) { sub(/^\.\/${dist}\//, "./"); print } }' > \ ${DESTDIR}/${DISTDIR}/${dist}.debug.meta .endfor .endif .endif packageworld: .PHONY .for dist in base ${EXTRA_DISTRIBUTIONS} .if defined(NO_ROOT) ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - --exclude usr/lib/debug \ @${DESTDIR}/${DISTDIR}/${dist}.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}.txz .else ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - --exclude usr/lib/debug . | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}.txz .endif .endfor .for dist in ${DEBUG_DISTRIBUTIONS} . if defined(NO_ROOT) ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - @${DESTDIR}/${DISTDIR}/${dist}.debug.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}-dbg.txz . else ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvLf - usr/lib/debug | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}-dbg.txz . endif .endfor _sysent_dirs= sys/kern _sysent_dirs+= sys/compat/freebsd32 _sysent_dirs+= sys/amd64/linux \ sys/amd64/linux32 \ sys/arm64/linux \ sys/i386/linux sysent: .PHONY .for _dir in ${_sysent_dirs} ${_+_}${MAKE} -C ${.CURDIR}/${_dir} sysent .endfor # # reinstall # # If you have a build server, you can NFS mount the source and obj directories # and do a 'make reinstall' on the *client* to install new binaries from the # most recent server build. 
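# For example (host name and mount points are illustrative):
#   mount -t nfs buildhost:/usr/src /usr/src
#   mount -t nfs buildhost:/usr/obj /usr/obj
#   cd /usr/src && make reinstall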
# restage reinstall: .MAKE .PHONY @echo "--------------------------------------------------------------" @echo ">>> Making hierarchy" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 \ LOCAL_MTREE=${LOCAL_MTREE:Q} hierarchy .if make(restage) @echo "--------------------------------------------------------------" @echo ">>> Making distribution" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 \ LOCAL_MTREE=${LOCAL_MTREE:Q} distribution .endif @echo @echo "--------------------------------------------------------------" @echo ">>> Installing everything started on `LC_ALL=C date`" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 install .if defined(LIBCOMPAT) ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 install${libcompat} .endif @echo "--------------------------------------------------------------" @echo ">>> Installing everything completed on `LC_ALL=C date`" @echo "--------------------------------------------------------------" redistribute: .MAKE .PHONY @echo "--------------------------------------------------------------" @echo ">>> Distributing everything" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 distribute .if defined(LIBCOMPAT) ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 distribute${libcompat} \ DISTRIBUTION=lib${libcompat} .endif distrib-dirs distribution: .MAKE .PHONY ${_+_}cd ${.CURDIR}/etc; ${CROSSENV} PATH=${TMPPATH} ${MAKE} \ ${IMAKE_INSTALL} ${IMAKE_MTREE} METALOG=${METALOG} ${.TARGET} .if make(distribution) ${_+_}cd ${.CURDIR}; ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} -f Makefile.inc1 ${IMAKE_INSTALL} \ METALOG=${METALOG} MK_TESTS=no installconfig .endif # # buildkernel and installkernel # # Which kernels to build and/or install is specified by setting # KERNCONF. If not defined a GENERIC kernel is built/installed. # Only the existing (depending TARGET) config files are used # for building kernels and only the first of these is designated # as the one being installed. # # Note that we have to use TARGET instead of TARGET_ARCH when # we're in kernel-land. Since only TARGET_ARCH is (expected) to # be set to cross-build, we have to make sure TARGET is set # properly. .if defined(KERNFAST) NO_KERNELCLEAN= t NO_KERNELCONFIG= t NO_KERNELOBJ= t # Shortcut for KERNCONF=Blah -DKERNFAST is now KERNFAST=Blah .if !defined(KERNCONF) && ${KERNFAST} != "1" KERNCONF=${KERNFAST} .endif .endif .if ${TARGET_ARCH} == "powerpc64" KERNCONF?= GENERIC64 .else KERNCONF?= GENERIC .endif INSTKERNNAME?= kernel KERNSRCDIR?= ${.CURDIR}/sys KRNLCONFDIR= ${KERNSRCDIR}/${TARGET}/conf KRNLOBJDIR= ${OBJTOP}${KERNSRCDIR:C,^${.CURDIR},,} KERNCONFDIR?= ${KRNLCONFDIR} BUILDKERNELS= INSTALLKERNEL= .if defined(NO_INSTALLKERNEL) # All of the BUILDKERNELS loops start at index 1. 
BUILDKERNELS+= dummy .endif .for _kernel in ${KERNCONF} .if !defined(_MKSHOWCONFIG) && exists(${KERNCONFDIR}/${_kernel}) BUILDKERNELS+= ${_kernel} .if empty(INSTALLKERNEL) && !defined(NO_INSTALLKERNEL) INSTALLKERNEL= ${_kernel} .endif .else .if make(buildkernel) .error Missing KERNCONF ${KERNCONFDIR}/${_kernel} .endif .endif .endfor _cleankernobj_fast_depend_hack: .PHONY # 20180320 remove stale generated assym.s after renaming to .inc in r331254 @if [ -e "${OBJTOP}/sys/${KERNCONF}/assym.s" ]; then \ echo "Removing stale generated assym files"; \ rm -f ${OBJTOP}/sys/${KERNCONF}/assym.* \ ${OBJTOP}/sys/${KERNCONF}/.depend.assym.*; \ fi ${WMAKE_TGTS:N_worldtmp:Nbuild${libcompat}} ${.ALLTARGETS:M_*:N_worldtmp}: .MAKE .PHONY # record kernel(s) build time in seconds .if make(buildkernel) _BUILDKERNEL_START!= date '+%s' .endif # # buildkernel # # Builds all kernels defined by BUILDKERNELS. # buildkernel: .MAKE .PHONY .if empty(BUILDKERNELS:Ndummy) @echo "ERROR: Missing kernel configuration file(s) (${KERNCONF})."; \ false .endif @echo .for _kernel in ${BUILDKERNELS:Ndummy} @echo "--------------------------------------------------------------" @echo ">>> Kernel build for ${_kernel} started on `LC_ALL=C date`" @echo "--------------------------------------------------------------" @echo "===> ${_kernel}" mkdir -p ${KRNLOBJDIR} .if !defined(NO_KERNELCONFIG) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1: configuring the kernel" @echo "--------------------------------------------------------------" cd ${KRNLCONFDIR}; \ PATH=${TMPPATH} \ config ${CONFIGARGS} -d ${KRNLOBJDIR}/${_kernel} \ -I '${KERNCONFDIR}' '${KERNCONFDIR}/${_kernel}' .endif .if !defined(NO_CLEAN) && !defined(NO_KERNELCLEAN) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.1: cleaning up the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} ${CLEANDIR} .else ${_+_}cd ${.CURDIR}; ${WMAKE} _cleankernobj_fast_depend_hack .endif .if !defined(NO_KERNELOBJ) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.2: rebuilding the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} obj .endif @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.3: build tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${KTMAKE} kernel-tools @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3.1: building everything" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} all -DNO_MODULES_OBJ @echo "--------------------------------------------------------------" @echo ">>> Kernel build for ${_kernel} completed on `LC_ALL=C date`" @echo "--------------------------------------------------------------" .endfor @seconds=$$(($$(date '+%s') - ${_BUILDKERNEL_START})); \ echo -n ">>> Kernel(s) ${BUILDKERNELS} built in $$seconds seconds, "; \ echo "ncpu: $$(sysctl -n hw.ncpu)${.MAKE.JOBS:S/^/, make -j/}" @echo "--------------------------------------------------------------" NO_INSTALLEXTRAKERNELS?= yes # # installkernel, etc. 
# # Install the kernel defined by INSTALLKERNEL # installkernel installkernel.debug \ reinstallkernel reinstallkernel.debug: _installcheck_kernel .PHONY .if !defined(NO_INSTALLKERNEL) .if empty(INSTALLKERNEL) @echo "ERROR: No kernel \"${KERNCONF}\" to install."; \ false .endif @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${INSTALLKERNEL} on $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${INSTALLKERNEL}; \ ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} ${IMAKE_INSTALL} KERNEL=${INSTKERNNAME} ${.TARGET:S/kernel//} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${INSTALLKERNEL} completed on $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${_kernel} $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; \ ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} ${IMAKE_INSTALL} KERNEL=${INSTKERNNAME}.${_kernel} ${.TARGET:S/kernel//} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${_kernel} completed on $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" .endfor .endif distributekernel distributekernel.debug: .PHONY .if !defined(NO_INSTALLKERNEL) .if empty(INSTALLKERNEL) @echo "ERROR: No kernel \"${KERNCONF}\" to install."; \ false .endif mkdir -p ${DESTDIR}/${DISTDIR} .if defined(NO_ROOT) @echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.premeta .endif ${_+_}cd ${KRNLOBJDIR}/${INSTALLKERNEL}; \ ${IMAKEENV} ${IMAKE_INSTALL:S/METALOG/kernel.premeta/} \ ${IMAKE_MTREE} PATH=${TMPPATH} ${MAKE} KERNEL=${INSTKERNNAME} \ DESTDIR=${INSTALL_DDIR}/kernel \ ${.TARGET:S/distributekernel/install/} .if defined(NO_ROOT) @sed -e 's|^./kernel|.|' ${DESTDIR}/${DISTDIR}/kernel.premeta > \ ${DESTDIR}/${DISTDIR}/kernel.meta .endif .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} .if defined(NO_ROOT) @echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta .endif ${_+_}cd ${KRNLOBJDIR}/${_kernel}; \ ${IMAKEENV} ${IMAKE_INSTALL:S/METALOG/kernel.${_kernel}.premeta/} \ ${IMAKE_MTREE} PATH=${TMPPATH} ${MAKE} \ KERNEL=${INSTKERNNAME}.${_kernel} \ DESTDIR=${INSTALL_DDIR}/kernel.${_kernel} \ ${.TARGET:S/distributekernel/install/} .if defined(NO_ROOT) @sed -e "s|^./kernel.${_kernel}|.|" \ ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta > \ ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta .endif .endfor .endif packagekernel: .PHONY .if defined(NO_ROOT) .if !defined(NO_INSTALLKERNEL) cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --exclude '*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.txz .endif .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --include '*/*/*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.meta | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel-dbg.txz .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --exclude '*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta | \ ${XZ_CMD} > 
${PACKAGEDIR}/kernel.${_kernel}.txz .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --include '*/*/*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}-dbg.txz .endif .endfor .endif .else .if !defined(NO_INSTALLKERNEL) cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --exclude '*.debug' . | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.txz .endif .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --include '*/*/*.debug' $$(eval find .) | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel-dbg.txz .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --exclude '*.debug' . | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.${_kernel}.txz .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --include '*/*/*.debug' $$(eval find .) | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}-dbg.txz .endif .endfor .endif .endif stagekernel: .PHONY ${_+_}${MAKE} -C ${.CURDIR} ${.MAKEFLAGS} distributekernel PORTSDIR?= /usr/ports WSTAGEDIR?= ${OBJTOP}/worldstage KSTAGEDIR?= ${OBJTOP}/kernelstage REPODIR?= ${OBJROOT}repo PKGSIGNKEY?= # empty .ORDER: stage-packages create-packages .ORDER: create-packages create-world-packages .ORDER: create-packages create-kernel-packages .ORDER: create-packages sign-packages _pkgbootstrap: .PHONY .if make(*package*) && !exists(${LOCALBASE}/sbin/pkg) @env ASSUME_ALWAYS_YES=YES pkg bootstrap .endif packages: .PHONY ${_+_}${MAKE} -C ${.CURDIR} PKG_VERSION=${PKG_VERSION} real-packages package-pkg: .PHONY rm -rf /tmp/ports.${TARGET} || : env ${WMAKEENV:Q} SRCDIR=${.CURDIR} PORTSDIR=${PORTSDIR} REVISION=${_REVISION} \ PKG_CMD=${PKG_CMD} PKG_VERSION=${PKG_VERSION} REPODIR=${REPODIR} \ WSTAGEDIR=${WSTAGEDIR} \ sh ${.CURDIR}/release/scripts/make-pkg-package.sh real-packages: stage-packages create-packages sign-packages .PHONY stage-packages-world: .PHONY @mkdir -p ${WSTAGEDIR} ${_+_}@cd ${.CURDIR}; \ ${MAKE} DESTDIR=${WSTAGEDIR} -DNO_ROOT stageworld stage-packages-kernel: .PHONY @mkdir -p ${KSTAGEDIR} ${_+_}@cd ${.CURDIR}; \ ${MAKE} DESTDIR=${KSTAGEDIR} -DNO_ROOT stagekernel stage-packages: .PHONY stage-packages-world stage-packages-kernel _repodir: .PHONY @mkdir -p ${REPODIR} create-packages-world: _pkgbootstrap _repodir .PHONY ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 \ DESTDIR=${WSTAGEDIR} \ PKG_VERSION=${PKG_VERSION} create-world-packages create-packages-kernel: _pkgbootstrap _repodir .PHONY ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 \ DESTDIR=${KSTAGEDIR} \ PKG_VERSION=${PKG_VERSION} DISTDIR=kernel \ create-kernel-packages create-packages: .PHONY create-packages-world create-packages-kernel create-world-packages: _pkgbootstrap .PHONY @rm -f ${WSTAGEDIR}/*.plist 2>/dev/null || : @cd ${WSTAGEDIR} ; \ env -i LC_COLLATE=C sort ${WSTAGEDIR}/${DISTDIR}/METALOG | \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk @for plist in ${WSTAGEDIR}/*.plist; do \ plist=$${plist##*/} ; \ pkgname=$${plist%.plist} ; \ echo "_PKGS+= $${pkgname}" ; \ done > ${WSTAGEDIR}/packages.mk ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 create-world-packages-jobs \ .MAKE.JOB.PREFIX= .if make(create-world-packages-jobs) .include "${WSTAGEDIR}/packages.mk" .endif create-world-packages-jobs: .PHONY .for pkgname in ${_PKGS} create-world-packages-jobs: create-world-package-${pkgname} create-world-package-${pkgname}: .PHONY @sh 
${SRCDIR}/release/packages/generate-ucl.sh -o ${pkgname} \ -s ${SRCDIR} -u ${WSTAGEDIR}/${pkgname}.ucl @awk -F\" ' \ /^name/ { printf("===> Creating %s-", $$2); next } \ /^version/ { print $$2; next } \ ' ${WSTAGEDIR}/${pkgname}.ucl @if [ "${pkgname}" == "runtime" ]; then \ sed -i '' -e "s/%VCS_REVISION%/${VCS_REVISION}/" ${WSTAGEDIR}/${pkgname}.ucl ; \ fi ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ create -M ${WSTAGEDIR}/${pkgname}.ucl \ -p ${WSTAGEDIR}/${pkgname}.plist \ -r ${WSTAGEDIR} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} .endfor _default_flavor= -default .if make(*package*) && exists(${KSTAGEDIR}/kernel.meta) . if ${MK_DEBUG_FILES} != "no" _debug=-debug . endif create-kernel-packages: .PHONY . for flavor in "" ${_debug} create-kernel-packages: create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},} create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},}: _pkgbootstrap .PHONY @cd ${KSTAGEDIR}/${DISTDIR} ; \ env -i LC_COLLATE=C sort ${KSTAGEDIR}/kernel.meta | \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk \ -v kernel=yes -v _kernconf=${INSTALLKERNEL} ; \ sed -e "s/%VERSION%/${PKG_VERSION}/" \ -e "s/%PKGNAME%/kernel-${INSTALLKERNEL:tl}${flavor}/" \ -e "s/%KERNELDIR%/kernel/" \ -e "s/%COMMENT%/FreeBSD ${INSTALLKERNEL} kernel ${flavor}/" \ -e "s/%DESC%/FreeBSD ${INSTALLKERNEL} kernel ${flavor}/" \ -e "s/ %VCS_REVISION%/${VCS_REVISION}/" \ ${SRCDIR}/release/packages/kernel.ucl \ > ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \ awk -F\" ' \ /name/ { printf("===> Creating %s-", $$2); next } \ /version/ {print $$2; next } ' \ ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ create -M ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl \ -p ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.plist \ -r ${KSTAGEDIR}/${DISTDIR} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} . endfor .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" . for _kernel in ${BUILDKERNELS:[2..-1]} . if exists(${KSTAGEDIR}/kernel.${_kernel}.meta) . if ${MK_DEBUG_FILES} != "no" _debug=-debug . endif . 
for flavor in "" ${_debug} create-kernel-packages: create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kernel} create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kernel}: _pkgbootstrap .PHONY @cd ${KSTAGEDIR}/kernel.${_kernel} ; \ env -i LC_COLLATE=C sort ${KSTAGEDIR}/kernel.${_kernel}.meta | \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk \ -v kernel=yes -v _kernconf=${_kernel} ; \ sed -e "s/%VERSION%/${PKG_VERSION}/" \ -e "s/%PKGNAME%/kernel-${_kernel:tl}${flavor}/" \ -e "s/%KERNELDIR%/kernel.${_kernel}/" \ -e "s/%COMMENT%/FreeBSD ${_kernel} kernel ${flavor}/" \ -e "s/%DESC%/FreeBSD ${_kernel} kernel ${flavor}/" \ -e "s/ %VCS_REVISION%/${VCS_REVISION}/" \ ${SRCDIR}/release/packages/kernel.ucl \ > ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \ awk -F\" ' \ /name/ { printf("===> Creating %s-", $$2); next } \ /version/ {print $$2; next } ' \ ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ create -M ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl \ -p ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.plist \ -r ${KSTAGEDIR}/kernel.${_kernel} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} . endfor . endif . endfor .endif sign-packages: _pkgbootstrap .PHONY @[ -L "${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/latest" ] && \ unlink ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/latest ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname repo \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} \ ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} \ ${PKGSIGNKEY} ; \ cd ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI); \ ln -s ${PKG_VERSION} latest # # # checkworld # # Run test suite on installed world. # checkworld: .PHONY @if [ ! -x "${LOCALBASE}/bin/kyua" ]; then \ echo "You need kyua (devel/kyua) to run the test suite." | /usr/bin/fmt; \ exit 1; \ fi ${_+_}PATH="$$PATH:${LOCALBASE}/bin" kyua test -k ${TESTSBASE}/Kyuafile # # # doxygen # # Build the API documentation with doxygen # doxygen: .PHONY @if [ ! -x "${LOCALBASE}/bin/doxygen" ]; then \ echo "You need doxygen (devel/doxygen) to generate the API documentation of the kernel." | /usr/bin/fmt; \ exit 1; \ fi ${_+_}cd ${.CURDIR}/tools/kerneldoc/subsys; ${MAKE} obj all # # update # # Update the source tree(s), by running svn/svnup to update to the # latest copy. # update: .PHONY .if defined(SVN_UPDATE) @echo "--------------------------------------------------------------" @echo ">>> Updating ${.CURDIR} using Subversion" @echo "--------------------------------------------------------------" @(cd ${.CURDIR}; ${SVN_CMD} update ${SVNFLAGS}) .endif # # ------------------------------------------------------------------------ # # From here onwards are utility targets used by the 'make world' and # related targets. If your 'world' breaks, you may like to try to fix # the problem and manually run the following targets to attempt to # complete the build. Beware, this is *not* guaranteed to work, you # need to have a pretty good grip on the current state of the system # to attempt to manually finish it. If in doubt, 'make world' again. # # # legacy: Build compatibility shims for the next three targets. 
This is a # minimal set of tools and shims necessary to compensate for older systems # which don't have the APIs required by the targets built in bootstrap-tools, # build-tools or cross-tools. # # libnv and libl are both requirements for config(8), which is an unconditional # bootstrap-tool. _config_deps= lib/libnv usr.bin/lex/lib legacy: .PHONY .if ${BOOTSTRAPPING} < ${MINIMUM_SUPPORTED_OSREL} && ${BOOTSTRAPPING} != 0 @echo "ERROR: Source upgrades from versions prior to ${MINIMUM_SUPPORTED_REL} are not supported."; \ false .endif .for _tool in tools/build ${_config_deps} ${_+_}@${ECHODIR} "===> ${_tool} (obj,includes,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${WORLDTMP}/legacy includes; \ ${MAKE} DIRPRFX=${_tool}/ MK_INCLUDES=no all; \ ${MAKE} DIRPRFX=${_tool}/ MK_INCLUDES=no \ DESTDIR=${WORLDTMP}/legacy install .endfor # # bootstrap-tools: Build tools needed for compatibility. These are binaries that # are built to build other binaries in the system. However, the focus of these # binaries is usually quite narrow. Bootstrap tools use the host's compiler and # libraries, augmented by -legacy, in addition to the libraries built during # bootstrap-tools. # _bt= _bootstrap-tools # We want to run the build with only ${WORLDTMP} in $PATH to ensure we don't # accidentally run tools that are incompatible but happen to be in $PATH. # This is especially important when building on Linux/MacOS where many of the # programs used during the build accept different flags or generate different # output. On those platforms we only symlink the tools known to be compatible # (e.g. basic utilities such as mkdir) into ${WORLDTMP} and build all others # from the FreeBSD sources during the bootstrap-tools stage. # We want to build without the user's $PATH starting in the bootstrap-tools # phase so the tools used in that phase (ln, cp, etc) must have already been # linked to $WORLDTMP. The tools are listed in the _host_tools_to_symlink # variable in tools/build/Makefile and are linked during the legacy phase. # Since they could be Linux or MacOS binaries, too we must only use flags that # are portable across operating systems. # If BOOTSTRAP_ALL_TOOLS is set we will build all the required tools from the # current source tree. Otherwise we create a symlink to the version found in # $PATH during the bootstrap-tools stage. .if defined(BOOTSTRAP_ALL_TOOLS) # BOOTSTRAPPING will be set on the command line so we can't override it here. # Instead set BOOTSTRAPPING_OSRELDATE so that the value 0 is set ${BSARGS} BOOTSTRAPPING_OSRELDATE:= 0 .endif .if ${MK_GAMES} != "no" _strfile= usr.bin/fortune/strfile .endif .if ${MK_GCC} != "no" && ${MK_CXX} != "no" _gperf= gnu/usr.bin/gperf .endif .if ${MK_VT} != "no" _vtfontcvt= usr.bin/vtfontcvt .endif # If we are not building the bootstrap because BOOTSTRAPPING is sufficient # we symlink the host version to $WORLDTMP instead. By doing this we can also # detect when a bootstrap tool is being used without the required MK_FOO. # If you add a new bootstrap tool where we could also use the host version, # please ensure that you also add a .else case where you add the tool to the # _bootstrap_tools_links variable. .if ${BOOTSTRAPPING} < 1000033 _m4= usr.bin/m4 _lex= usr.bin/lex # Note: lex needs m4 to build but m4 also depends on lex. However, lex can be # bootstrapped so we build lex first. 
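# Illustrative: when building on a non-FreeBSD host, or to avoid relying on
# host tools entirely, this is typically enabled on the command line or in
# src.conf, e.g.:
#   make -DBOOTSTRAP_ALL_TOOLS buildworld
# so that everything listed in _basic_bootstrap_tools below is built from the
# tree instead of being symlinked from $PATH.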
${_bt}-usr.bin/m4: ${_bt}-lib/libopenbsd ${_bt}-usr.bin/yacc ${_bt}-${_lex} _bt_m4_depend=${_bt}-${_m4} _bt_lex_depend=${_bt}-${_lex} ${_bt_m4_depend} .else _bootstrap_tools_links+=m4 lex .endif # ELF Tool Chain libraries are needed for ELF tools and dtrace tools. # r296685 fix cross-endian objcopy # r310724 fixed PR 215350, a crash in libdwarf with objects built by GCC 6.2. # r334881 added libdwarf constants used by ctfconvert. # r338478 fixed a crash in objcopy for mips64el objects # r339083 libelf: correct mips64el test to use ELF header # r348347 Add missing powerpc64 relocation support to libdwarf .if ${BOOTSTRAPPING} < 1300030 _elftoolchain_libs= lib/libelf lib/libdwarf ${_bt}-lib/libelf: ${_bt_m4_depend} ${_bt}-lib/libdwarf: ${_bt_m4_depend} .endif # r245440 mtree -N support added # r313404 requires sha384.h for libnetbsd, added to libmd in r292782 .if ${BOOTSTRAPPING} < 1100093 _nmtree= lib/libmd \ lib/libnetbsd \ usr.sbin/nmtree ${_bt}-lib/libnetbsd: ${_bt}-lib/libmd ${_bt}-usr.sbin/nmtree: ${_bt}-lib/libnetbsd .else _bootstrap_tools_links+=mtree .endif # r246097: log addition login.conf.db, passwd, pwd.db, and spwd.db with cat -l .if ${BOOTSTRAPPING} < 1000027 _cat= bin/cat .else _bootstrap_tools_links+=cat .endif # r277259 crunchide: Correct 64-bit section header offset # r281674 crunchide: always include both 32- and 64-bit ELF support .if ${BOOTSTRAPPING} < 1100078 _crunchide= usr.sbin/crunch/crunchide .else _bootstrap_tools_links+=crunchide .endif # r285986 crunchen: use STRIPBIN rather than STRIP # 1100113: Support MK_AUTO_OBJ # 1200006: META_MODE fixes .if ${BOOTSTRAPPING} < 1100078 || \ (${MK_AUTO_OBJ} == "yes" && ${BOOTSTRAPPING} < 1100114) || \ (${MK_META_MODE} == "yes" && ${BOOTSTRAPPING} < 1200006) _crunchgen= usr.sbin/crunch/crunchgen .else _bootstrap_tools_links+=crunchgen .endif # r296926 -P keymap search path, MFC to stable/10 in r298297 .if ${BOOTSTRAPPING} < 1003501 || \ (${BOOTSTRAPPING} >= 1100000 && ${BOOTSTRAPPING} < 1100103) _kbdcontrol= usr.sbin/kbdcontrol .else _bootstrap_tools_links+=kbdcontrol .endif _yacc= lib/liby \ usr.bin/yacc ${_bt}-usr.bin/yacc: ${_bt}-lib/liby .if ${MK_BSNMP} != "no" _gensnmptree= usr.sbin/bsnmpd/gensnmptree .endif .if ${MK_LOCALES} != "no" _localedef= usr.bin/localedef .endif # We need to build tblgen when we're building clang or lld, either as # bootstrap tools, or as the part of the normal build. .if ${MK_CLANG_BOOTSTRAP} != "no" || ${MK_CLANG} != "no" || \ ${MK_LLD_BOOTSTRAP} != "no" || ${MK_LLD} != "no" _clang_tblgen= \ lib/clang/libllvmminimal \ usr.bin/clang/llvm-tblgen \ usr.bin/clang/clang-tblgen \ usr.bin/clang/lldb-tblgen # XXX: lldb-tblgen is not needed, if top-level MK_LLDB=no ${_bt}-usr.bin/clang/clang-tblgen: ${_bt}-lib/clang/libllvmminimal ${_bt}-usr.bin/clang/llvm-tblgen: ${_bt}-lib/clang/libllvmminimal ${_bt}-usr.bin/clang/lldb-tblgen: ${_bt}-lib/clang/libllvmminimal .endif # Default to building the GPL DTC, but build the BSDL one if users explicitly # request it. 
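# (Illustrative) which dtc gets bootstrapped can be flipped from src.conf(5)
# via the GPL_DTC option, e.g. a WITH_GPL_DTC=yes (or WITHOUT_GPL_DTC=yes) line
# in /etc/src.conf changes the MK_GPL_DTC test below.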
_dtc= usr.bin/dtc .if ${MK_GPL_DTC} != "no" _dtc= gnu/usr.bin/dtc .endif .if ${MK_LOCALES} != "no" _localedef= usr.bin/localedef .endif .if ${MK_KERBEROS} != "no" _kerberos5_bootstrap_tools= \ kerberos5/tools/make-roken \ kerberos5/lib/libroken \ kerberos5/lib/libvers \ kerberos5/tools/asn1_compile \ kerberos5/tools/slc \ usr.bin/compile_et .ORDER: ${_kerberos5_bootstrap_tools:C/^/${_bt}-/g} .for _tool in ${_kerberos5_bootstrap_tools} ${_bt}-${_tool}: ${_bt}-usr.bin/yacc ${_bt_lex_depend} .endfor .endif ${_bt}-usr.bin/mandoc: ${_bt}-lib/libopenbsd # The tools listed in _basic_bootstrap_tools will generally not be # bootstrapped unless BOOTSTRAP_ALL_TOOL is set. However, when building on a # Linux or MacOS host the host versions are incompatible so we need to build # them from the source tree. Usually the link name will be the same as the subdir, # but some directories such as grep or test install multiple binaries. In that # case we use the _basic_bootstrap_tools_multilink variable which is a list of # subdirectory and comma-separated list of files. _basic_bootstrap_tools_multilink=usr.bin/grep grep,egrep,fgrep _basic_bootstrap_tools_multilink+=bin/test test,[ # bootstrap tools needed by buildworld: _basic_bootstrap_tools=usr.bin/awk usr.bin/cut bin/expr usr.bin/gencat \ usr.bin/join usr.bin/mktemp bin/rmdir usr.bin/sed usr.bin/sort \ usr.bin/truncate usr.bin/tsort # elf2aout is required for sparc64 build _basic_bootstrap_tools+=usr.bin/elf2aout # file2c is required for building usr.sbin/config: _basic_bootstrap_tools+=usr.bin/file2c # uuencode/uudecode required for share/tabset _basic_bootstrap_tools+=usr.bin/uuencode usr.bin/uudecode # xargs is required by mkioctls _basic_bootstrap_tools+=usr.bin/xargs # cap_mkdb is required for share/termcap: _basic_bootstrap_tools+=usr.bin/cap_mkdb # ldd is required for installcheck (TODO: just always use /usr/bin/ldd instead?) _basic_bootstrap_tools+=usr.bin/ldd # services_mkdb/pwd_mkdb are required for installworld: _basic_bootstrap_tools+=usr.sbin/services_mkdb usr.sbin/pwd_mkdb # sysctl/chflags are required for installkernel: _basic_bootstrap_tools+=sbin/sysctl bin/chflags # mkfifo is used by sys/conf/newvers.sh _basic_bootstrap_tools+=usr.bin/mkfifo .if ${MK_AMD} != "no" # unifdef is only used by usr.sbin/amd/libamu/Makefile _basic_bootstrap_tools+=usr.bin/unifdef .endif .if ${MK_BOOT} != "no" _basic_bootstrap_tools+=bin/dd # xz/unxz is used by EFI _basic_bootstrap_tools_multilink+=usr.bin/xz xz,unxz # md5 is used by boot/beri (and possibly others) _basic_bootstrap_tools+=sbin/md5 .if defined(BOOTSTRAP_ALL_TOOLS) ${_bt}-sbin/md5: ${_bt}-lib/libmd .endif .endif .if ${MK_ZONEINFO} != "no" _basic_bootstrap_tools+=usr.sbin/zic usr.sbin/tzsetup .endif .if defined(BOOTSTRAP_ALL_TOOLS) _other_bootstrap_tools+=${_basic_bootstrap_tools} .for _subdir _links in ${_basic_bootstrap_tools_multilink} _other_bootstrap_tools+=${_subdir} .endfor ${_bt}-usr.bin/awk: ${_bt_lex_depend} ${_bt}-usr.bin/yacc ${_bt}-bin/expr: ${_bt_lex_depend} ${_bt}-usr.bin/yacc # If we are bootstrapping file2c, we have to build it before config: ${_bt}-usr.sbin/config: ${_bt}-usr.bin/file2c ${_bt_lex_depend} # Note: no symlink to make/bmake in the !BOOTSTRAP_ALL_TOOLS case here since # the links to make/bmake make links will have already have been created in the # `make legacy` step. Not adding a link to make is important on non-FreeBSD # since "make" will usually point to GNU make there. 
_other_bootstrap_tools+=usr.bin/bmake .else # All tools in _basic_bootstrap_tools have the same name as the subdirectory # so we can use :T to get the name of the symlinks that we need to create. _bootstrap_tools_links+=${_basic_bootstrap_tools:T} .for _subdir _links in ${_basic_bootstrap_tools_multilink} _bootstrap_tools_links+=${_links:S/,/ /g} .endfor .endif # defined(BOOTSTRAP_ALL_TOOLS) # Link the tools that we need for building but don't need to bootstrap because # the host version is known to be compatible into ${WORLDTMP}/legacy # We do this before building any of the bootstrap tools in case they depend on # the presence of any of the links (e.g. as m4/lex/awk) ${_bt}-links: .PHONY .for _tool in ${_bootstrap_tools_links} ${_bt}-link-${_tool}: .PHONY .MAKE @if [ ! -e "${WORLDTMP}/legacy/bin/${_tool}" ]; then \ source_path=`which ${_tool}`; \ if [ ! -e "$${source_path}" ] ; then \ echo "Cannot find host tool '${_tool}'"; false; \ fi; \ ln -sfnv "$${source_path}" "${WORLDTMP}/legacy/bin/${_tool}"; \ fi ${_bt}-links: ${_bt}-link-${_tool} .endfor bootstrap-tools: ${_bt}-links .PHONY # Please document (add comment) why something is in 'bootstrap-tools'. # Try to bound the building of the bootstrap-tool to just the # FreeBSD versions that need the tool built at this stage of the build. .for _tool in \ ${_clang_tblgen} \ ${_kerberos5_bootstrap_tools} \ ${_strfile} \ ${_gperf} \ ${_dtc} \ ${_cat} \ ${_kbdcontrol} \ ${_elftoolchain_libs} \ usr.bin/lorder \ lib/libopenbsd \ usr.bin/mandoc \ usr.bin/rpcgen \ ${_yacc} \ ${_m4} \ ${_lex} \ ${_other_bootstrap_tools} \ usr.bin/xinstall \ ${_gensnmptree} \ usr.sbin/config \ ${_crunchide} \ ${_crunchgen} \ ${_nmtree} \ ${_vtfontcvt} \ ${_localedef} ${_bt}-${_tool}: ${_bt}-links .PHONY .MAKE ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ if [ "${_tool}" = "usr.bin/lex" ]; then \ ${MAKE} DIRPRFX=${_tool}/ bootstrap; \ fi; \ ${MAKE} DIRPRFX=${_tool}/ all; \ ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${WORLDTMP}/legacy install bootstrap-tools: ${_bt}-${_tool} .endfor # # build-tools: Build special purpose build tools # .if !defined(NO_SHARE) && ${MK_SYSCONS} != "no" _share= share/syscons/scrnmaps .endif .if ${MK_GCC} != "no" _gcc_tools= gnu/usr.bin/cc/cc_tools .endif .if ${MK_RESCUE} != "no" # rescue includes programs that have build-tools targets _rescue=rescue/rescue .endif .if ${MK_TCSH} != "no" _tcsh=bin/csh .endif .if ${MK_FILE} != "no" _libmagic=lib/libmagic .endif .if ${MK_PMC} != "no" && \ (${TARGET_ARCH} == "amd64" || ${TARGET_ARCH} == "i386") _jevents=lib/libpmc/pmu-events .endif # kernel-toolchain skips _cleanobj, so handle cleaning up previous # build-tools directories if needed. 
.if !defined(NO_CLEAN) && make(kernel-toolchain) _bt_clean= ${CLEANDIR} .endif .for _tool in \ ${_tcsh} \ bin/sh \ ${LOCAL_TOOL_DIRS} \ ${_jevents} \ lib/ncurses/ncurses \ lib/ncurses/ncursesw \ ${_rescue} \ ${_share} \ usr.bin/awk \ ${_libmagic} \ usr.bin/mkesdb_static \ usr.bin/mkcsmapper_static \ usr.bin/vi/catalog \ ${_gcc_tools} build-tools_${_tool}: .PHONY ${_+_}@${ECHODIR} "===> ${_tool} (${_bt_clean:D${_bt_clean},}obj,build-tools)"; \ cd ${.CURDIR}/${_tool}; \ if [ -n "${_bt_clean}" ]; then ${MAKE} DIRPRFX=${_tool}/ ${_bt_clean}; fi; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ ${MAKE} DIRPRFX=${_tool}/ build-tools build-tools: build-tools_${_tool} .endfor # # kernel-tools: Build kernel-building tools # kernel-tools: .PHONY mkdir -p ${WORLDTMP}/usr ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${WORLDTMP}/usr >/dev/null # # cross-tools: All the tools needed to build the rest of the system after # we get done with the earlier stages. It is the last set of tools needed # to begin building the target binaries. # .if ${TARGET_ARCH} != ${MACHINE_ARCH} || ${BUILD_WITH_STRICT_TMPPATH} != 0 .if ${TARGET_ARCH} == "amd64" || ${TARGET_ARCH} == "i386" _btxld= usr.sbin/btxld .endif .endif # Rebuild ctfconvert and ctfmerge to avoid difficult-to-diagnose failures # resulting from missing bug fixes or ELF Toolchain updates. .if ${MK_CDDL} != "no" _dtrace_tools= cddl/lib/libctf cddl/usr.bin/ctfconvert \ cddl/usr.bin/ctfmerge .endif # If we're given an XAS, don't build binutils. .if ${XAS:M/*} == "" .if ${MK_BINUTILS_BOOTSTRAP} != "no" _binutils= gnu/usr.bin/binutils .endif .if ${MK_ELFTOOLCHAIN_BOOTSTRAP} != "no" _elftctools= lib/libelftc \ lib/libpe \ usr.bin/objcopy \ usr.bin/nm \ usr.bin/size \ usr.bin/strings # These are not required by the build, but can be useful for developers who # cross-build on a FreeBSD 10 host: _elftctools+= usr.bin/addr2line .endif .elif ${TARGET_ARCH} != ${MACHINE_ARCH} && ${MK_ELFTOOLCHAIN_BOOTSTRAP} != "no" # If cross-building with an external binutils we still need to build strip for # the target (for at least crunchide). _elftctools= lib/libelftc \ lib/libpe \ usr.bin/objcopy .endif .if ${MK_CLANG_BOOTSTRAP} != "no" _clang= usr.bin/clang .endif .if ${MK_LLD_BOOTSTRAP} != "no" _lld= usr.bin/clang/lld .endif .if ${MK_CLANG_BOOTSTRAP} != "no" || ${MK_LLD_BOOTSTRAP} != "no" _clang_libs= lib/clang .endif .if ${MK_GCC_BOOTSTRAP} != "no" _gcc= gnu/usr.bin/cc .endif .if ${MK_USB} != "no" _usb_tools= stand/usb/tools .endif .if ${BUILD_WITH_STRICT_TMPPATH} != 0 || defined(BOOTSTRAP_ALL_TOOLS) _ar=usr.bin/ar .endif cross-tools: .MAKE .PHONY .for _tool in \ ${LOCAL_XTOOL_DIRS} \ ${_ar} \ ${_clang_libs} \ ${_clang} \ ${_lld} \ ${_binutils} \ ${_elftctools} \ ${_dtrace_tools} \ ${_gcc} \ ${_btxld} \ ${_usb_tools} ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ ${MAKE} DIRPRFX=${_tool}/ all; \ ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${WORLDTMP} install .endfor # # native-xtools is the current target for qemu-user cross builds of ports # via poudriere and the imgact_binmisc kernel module. # This target merely builds a toolchan/sysroot, then builds the tools it wants # with the options it wants in a special MAKEOBJDIRPREFIX, using the toolchain # already built. It then installs the static tools to NXBDESTDIR for Poudriere # to pickup. 
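# A typical invocation might look like (illustrative; the target pair and jail
# path are examples only):
#   make native-xtools NXB_TARGET=arm NXB_TARGET_ARCH=armv7
#   make native-xtools-install DESTDIR=/usr/local/poudriere/jails/armv7
# which leaves the static tools under DESTDIR/nxb-bin (the NXTP default).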
# NXBOBJROOT= ${OBJROOT}${MACHINE}.${MACHINE_ARCH}/nxb/ NXBOBJTOP= ${NXBOBJROOT}${NXB_TARGET}.${NXB_TARGET_ARCH} NXTP?= /nxb-bin .if ${NXTP:N/*} .error NXTP variable should be an absolute path .endif NXBDESTDIR?= ${DESTDIR}${NXTP} # This is the list of tools to be built/installed as static and where # appropriate to build for the given TARGET.TARGET_ARCH. NXBDIRS+= \ bin/cat \ bin/chmod \ bin/cp \ ${_tcsh} \ bin/echo \ bin/expr \ bin/hostname \ bin/ln \ bin/ls \ bin/mkdir \ bin/mv \ bin/ps \ bin/realpath \ bin/rm \ bin/rmdir \ bin/sh \ bin/sleep \ sbin/md5 \ sbin/sysctl \ usr.bin/addr2line \ usr.bin/ar \ usr.bin/awk \ usr.bin/basename \ usr.bin/bmake \ usr.bin/bzip2 \ usr.bin/cmp \ usr.bin/diff \ usr.bin/dirname \ usr.bin/objcopy \ usr.bin/env \ usr.bin/fetch \ usr.bin/find \ usr.bin/grep \ usr.bin/gzip \ usr.bin/head \ usr.bin/id \ usr.bin/lex \ usr.bin/limits \ usr.bin/lorder \ usr.bin/mandoc \ usr.bin/mktemp \ usr.bin/mt \ usr.bin/nm \ usr.bin/patch \ usr.bin/readelf \ usr.bin/sed \ usr.bin/size \ usr.bin/sort \ usr.bin/strings \ usr.bin/tar \ usr.bin/touch \ usr.bin/tr \ usr.bin/true \ usr.bin/uniq \ usr.bin/unzip \ usr.bin/wc \ usr.bin/xargs \ usr.bin/xinstall \ usr.bin/xz \ usr.bin/yacc \ usr.sbin/chown SUBDIR_DEPEND_usr.bin/clang= lib/clang .if ${MK_CLANG} != "no" NXBDIRS+= lib/clang NXBDIRS+= usr.bin/clang .endif .if ${MK_GCC} != "no" NXBDIRS+= gnu/usr.bin/cc .endif .if ${MK_BINUTILS} != "no" NXBDIRS+= gnu/usr.bin/binutils .endif # XXX: native-xtools passes along ${NXBDIRS} in SUBDIR_OVERRIDE that needs # to be evaluated after NXBDIRS is set. .if make(install) && !empty(SUBDIR_OVERRIDE) SUBDIR= ${SUBDIR_OVERRIDE} .endif NXBMAKEARGS+= \ OBJTOP=${NXBOBJTOP:Q} \ OBJROOT=${NXBOBJROOT:Q} \ MAKEOBJDIRPREFIX= \ -DNO_SHARED \ -DNO_CPU_CFLAGS \ -DNO_PIC \ SSP_CFLAGS= \ MK_CASPER=no \ MK_CLANG_EXTRAS=no \ MK_CLANG_FULL=no \ MK_CTF=no \ MK_DEBUG_FILES=no \ MK_GDB=no \ MK_HTML=no \ MK_LLDB=no \ MK_MAN=no \ MK_MAN_UTILS=yes \ MK_OFED=no \ MK_OPENSSH=no \ MK_PROFILE=no \ MK_RETPOLINE=no \ MK_SENDMAIL=no \ MK_SVNLITE=no \ MK_TESTS=no \ MK_WARNS=no \ MK_ZFS=no .if make(native-xtools*) && \ (!defined(NXB_TARGET) || !defined(NXB_TARGET_ARCH)) .error Missing NXB_TARGET / NXB_TARGET_ARCH .endif # For 'toolchain' we want to produce native binaries that themselves generate # native binaries. NXBTMAKE= ${NXBMAKEENV} ${MAKE} ${NXBMAKEARGS:N-DNO_PIC:N-DNO_SHARED} \ TARGET=${MACHINE} TARGET_ARCH=${MACHINE_ARCH} # For 'everything' we want to produce native binaries (hence -target to # be MACHINE) that themselves generate TARGET.TARGET_ARCH binaries. # TARGET/TARGET_ARCH are still passed along from user. # # Use the toolchain we create as an external toolchain. .if ${USING_SYSTEM_COMPILER} == "yes" || ${XCC:N${CCACHE_BIN}:M/*} NXBMAKE+= XCC="${XCC}" \ XCXX="${XCXX}" \ XCPP="${XCPP}" .else NXBMAKE+= XCC="${NXBOBJTOP}/tmp/usr/bin/cc" \ XCXX="${NXBOBJTOP}/tmp/usr/bin/c++" \ XCPP="${NXBOBJTOP}/tmp/usr/bin/cpp" .endif NXBMAKE+= ${NXBMAKEENV} ${MAKE} -f Makefile.inc1 ${NXBMAKEARGS} \ TARGET=${NXB_TARGET} TARGET_ARCH=${NXB_TARGET_ARCH} \ TARGET_TRIPLE=${MACHINE_TRIPLE:Q} # NXBDIRS is improperly based on MACHINE rather than NXB_TARGET. Need to # invoke a sub-make to reevaluate MK_GCC, etc, for NXBDIRS. NXBMAKE+= SUBDIR_OVERRIDE='$${NXBDIRS:M*}' # Need to avoid the -isystem logic when using clang as an external toolchain # even if the TARGET being built for wants GCC. 
NXBMAKE+= WANT_COMPILER_TYPE='$${X_COMPILER_TYPE}' native-xtools: .PHONY ${_+_}cd ${.CURDIR}; ${NXBTMAKE} _cleanobj MK_GCC=yes # Build the bootstrap/host/cross tools that produce native binaries # Pass along MK_GCC=yes to ensure GCC-needed build tools are built. # We don't quite know what the NXB_TARGET wants so just build it. ${_+_}cd ${.CURDIR}; ${NXBTMAKE} kernel-toolchain MK_GCC=yes # Populate includes/libraries sysroot that produce native binaries. # This is split out from 'toolchain' above mostly so that target LLVM # libraries have a proper LLVM_DEFAULT_TARGET_TRIPLE without # polluting the cross-compiler build. The LLVM/GCC libs are skipped # here to avoid the problem but are kept in 'toolchain' so that # needed build tools are built. ${_+_}cd ${.CURDIR}; ${NXBTMAKE} _includes MK_CLANG=no MK_GCC=no ${_+_}cd ${.CURDIR}; ${NXBTMAKE} _libraries MK_CLANG=no MK_GCC=no # Clean out improper TARGET=MACHINE files ${_+_}cd ${.CURDIR}/gnu/usr.bin/cc/cc_tools; ${NXBTMAKE} cleandir .if !defined(NO_OBJWALK) ${_+_}cd ${.CURDIR}; ${NXBMAKE} _obj .endif ${_+_}cd ${.CURDIR}; ${NXBMAKE} everything @echo ">> native-xtools done. Use 'make native-xtools-install' to install to a given DESTDIR" native-xtools-install: .PHONY mkdir -p ${NXBDESTDIR}/bin ${NXBDESTDIR}/sbin ${NXBDESTDIR}/usr ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${NXBDESTDIR}/usr >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${NXBDESTDIR}/usr/include >/dev/null ${_+_}cd ${.CURDIR}; ${NXBMAKE} \ DESTDIR=${NXBDESTDIR} \ -DNO_ROOT \ install # # hierarchy - ensure that all the needed directories are present # hierarchy hier: .MAKE .PHONY ${_+_}cd ${.CURDIR}/etc; ${HMAKE} distrib-dirs # # libraries - build all libraries, and install them under ${DESTDIR}. # # The list of libraries with dependents (${_prebuild_libs}) and their # interdependencies (__L) are built automatically by the # ${.CURDIR}/tools/make_libdeps.sh script. # libraries: .MAKE .PHONY ${_+_}cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 _prereq_libs; \ ${MAKE} -f Makefile.inc1 _startup_libs; \ ${MAKE} -f Makefile.inc1 _prebuild_libs; \ ${MAKE} -f Makefile.inc1 _generic_libs # # static libgcc.a prerequisite for shared libc # _prereq_libs= lib/libcompiler_rt .if ${MK_SSP} != "no" _prereq_libs+= gnu/lib/libssp/libssp_nonshared .endif # These dependencies are not automatically generated: # # gnu/lib/csu, gnu/lib/libgcc, lib/csu and lib/libc must be built before # all shared libraries for ELF. 
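# (Illustrative note) every directory listed in _prereq_libs, _startup_libs,
# _prebuild_libs and _generic_libs gets a corresponding __PL/__L target further
# down; an extra hand-written dependency such as
#   lib/libfoo__L: lib/libbar__L
# (hypothetical library names) forces libbar to be built and installed into
# WORLDTMP before libfoo is built.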
# _startup_libs= lib/csu .if ${MK_BSD_CRTBEGIN} == "no" _startup_libs+= gnu/lib/csu .endif _startup_libs+= lib/libcompiler_rt _startup_libs+= lib/libc _startup_libs+= lib/libc_nonshared .if ${MK_LIBCPLUSPLUS} != "no" _startup_libs+= lib/libcxxrt .endif .if ${MK_LLVM_LIBUNWIND} != "no" _prereq_libs+= lib/libgcc_eh lib/libgcc_s _startup_libs+= lib/libgcc_eh lib/libgcc_s lib/libgcc_s__L: lib/libc__L lib/libgcc_s__L: lib/libc_nonshared__L .if ${MK_LIBCPLUSPLUS} != "no" lib/libcxxrt__L: lib/libgcc_s__L .endif .else # MK_LLVM_LIBUNWIND == no _prereq_libs+= gnu/lib/libgcc _startup_libs+= gnu/lib/libgcc gnu/lib/libgcc__L: lib/libc__L gnu/lib/libgcc__L: lib/libc_nonshared__L .if ${MK_LIBCPLUSPLUS} != "no" lib/libcxxrt__L: gnu/lib/libgcc__L .endif .endif _prebuild_libs= ${_kerberos5_lib_libasn1} \ ${_kerberos5_lib_libhdb} \ ${_kerberos5_lib_libheimbase} \ ${_kerberos5_lib_libheimntlm} \ ${_libsqlite3} \ ${_kerberos5_lib_libheimipcc} \ ${_kerberos5_lib_libhx509} ${_kerberos5_lib_libkrb5} \ ${_kerberos5_lib_libroken} \ ${_kerberos5_lib_libwind} \ lib/libbz2 ${_libcom_err} lib/libcrypt \ lib/libelf lib/libexpat \ lib/libfigpar \ ${_lib_libgssapi} \ lib/libkiconv lib/libkvm lib/liblzma lib/libmd lib/libnv \ + lib/libzstd \ ${_lib_casper} \ lib/ncurses/ncurses lib/ncurses/ncursesw \ lib/libopie lib/libpam/libpam ${_lib_libthr} \ ${_lib_libradius} lib/libsbuf lib/libtacplus \ lib/libgeom \ ${_cddl_lib_libumem} ${_cddl_lib_libnvpair} \ ${_cddl_lib_libuutil} \ ${_cddl_lib_libavl} \ ${_cddl_lib_libzfs_core} ${_cddl_lib_libzfs} \ ${_cddl_lib_libctf} \ lib/libufs \ lib/libutil lib/libpjdlog ${_lib_libypclnt} lib/libz lib/msun \ ${_secure_lib_libcrypto} ${_secure_lib_libssl} \ ${_lib_libldns} ${_secure_lib_libssh} .if ${MK_GNUCXX} != "no" _prebuild_libs+= gnu/lib/libstdc++ gnu/lib/libsupc++ gnu/lib/libstdc++__L: lib/msun__L gnu/lib/libsupc++__L: gnu/lib/libstdc++__L .endif .if ${MK_DIALOG} != "no" _prebuild_libs+= gnu/lib/libdialog gnu/lib/libdialog__L: lib/msun__L lib/ncurses/ncursesw__L .endif .if ${MK_LIBCPLUSPLUS} != "no" _prebuild_libs+= lib/libc++ .endif lib/libgeom__L: lib/libexpat__L lib/libkvm__L: lib/libelf__L .if ${MK_LIBTHR} != "no" _lib_libthr= lib/libthr .endif .if ${MK_RADIUS_SUPPORT} != "no" _lib_libradius= lib/libradius .endif .if ${MK_OFED} != "no" _prebuild_libs+= \ lib/ofed/libibverbs \ lib/ofed/libibmad \ lib/ofed/libibumad \ lib/ofed/complib \ lib/ofed/libmlx5 lib/ofed/libibmad__L: lib/ofed/libibumad__L lib/ofed/complib__L: lib/libthr__L lib/ofed/libmlx5__L: lib/ofed/libibverbs__L lib/libthr__L .endif .if ${MK_CASPER} != "no" _lib_casper= lib/libcasper .endif lib/libpjdlog__L: lib/libutil__L lib/libcasper__L: lib/libnv__L lib/liblzma__L: lib/libthr__L _generic_libs= ${_cddl_lib} gnu/lib ${_kerberos5_lib} lib ${_secure_lib} usr.bin/lex/lib .if ${MK_IPFILTER} != "no" _generic_libs+= sbin/ipf/libipf .endif .for _DIR in ${LOCAL_LIB_DIRS} .if ${_DIR} == ".WAIT" || (empty(_generic_libs:M${_DIR}) && exists(${.CURDIR}/${_DIR}/Makefile)) _generic_libs+= ${_DIR} .endif .endfor lib/libopie__L lib/libtacplus__L: lib/libmd__L .if ${MK_CDDL} != "no" _cddl_lib_libumem= cddl/lib/libumem _cddl_lib_libnvpair= cddl/lib/libnvpair _cddl_lib_libavl= cddl/lib/libavl _cddl_lib_libuutil= cddl/lib/libuutil .if ${MK_ZFS} != "no" _cddl_lib_libzfs_core= cddl/lib/libzfs_core _cddl_lib_libzfs= cddl/lib/libzfs cddl/lib/libzfs_core__L: cddl/lib/libnvpair__L cddl/lib/libzfs__L: cddl/lib/libzfs_core__L lib/msun__L lib/libutil__L cddl/lib/libzfs__L: lib/libthr__L lib/libmd__L lib/libz__L cddl/lib/libumem__L 
cddl/lib/libzfs__L: cddl/lib/libuutil__L cddl/lib/libavl__L lib/libgeom__L lib/libbe__L: cddl/lib/libzfs__L .endif _cddl_lib_libctf= cddl/lib/libctf _cddl_lib= cddl/lib cddl/lib/libctf__L: lib/libz__L .endif # cddl/lib/libdtrace requires lib/libproc and lib/librtld_db; it's only built # on select architectures though (see cddl/lib/Makefile) .if ${MACHINE_CPUARCH} != "sparc64" _prebuild_libs+= lib/libprocstat lib/libproc lib/librtld_db lib/libprocstat__L: lib/libelf__L lib/libkvm__L lib/libutil__L lib/libproc__L: lib/libprocstat__L lib/librtld_db__L: lib/libprocstat__L .endif .if ${MK_CRYPT} != "no" .if ${MK_OPENSSL} != "no" _secure_lib_libcrypto= secure/lib/libcrypto _secure_lib_libssl= secure/lib/libssl lib/libradius__L secure/lib/libssl__L: secure/lib/libcrypto__L secure/lib/libcrypto__L: lib/libthr__L .if ${MK_LDNS} != "no" _lib_libldns= lib/libldns lib/libldns__L: secure/lib/libssl__L .endif .if ${MK_OPENSSH} != "no" _secure_lib_libssh= secure/lib/libssh secure/lib/libssh__L: lib/libz__L secure/lib/libcrypto__L lib/libcrypt__L .if ${MK_LDNS} != "no" secure/lib/libssh__L: lib/libldns__L .endif .if ${MK_GSSAPI} != "no" && ${MK_KERBEROS_SUPPORT} != "no" secure/lib/libssh__L: lib/libgssapi__L kerberos5/lib/libkrb5__L \ kerberos5/lib/libhx509__L kerberos5/lib/libasn1__L lib/libcom_err__L \ lib/libmd__L kerberos5/lib/libroken__L .endif .endif .endif _secure_lib= secure/lib .endif .if ${MK_KERBEROS} != "no" kerberos5/lib/libasn1__L: lib/libcom_err__L kerberos5/lib/libroken__L kerberos5/lib/libhdb__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ kerberos5/lib/libkrb5__L kerberos5/lib/libroken__L \ kerberos5/lib/libwind__L lib/libsqlite3__L kerberos5/lib/libheimntlm__L: secure/lib/libcrypto__L kerberos5/lib/libkrb5__L \ kerberos5/lib/libroken__L lib/libcom_err__L kerberos5/lib/libhx509__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ secure/lib/libcrypto__L kerberos5/lib/libroken__L kerberos5/lib/libwind__L kerberos5/lib/libkrb5__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ lib/libcrypt__L secure/lib/libcrypto__L kerberos5/lib/libhx509__L \ kerberos5/lib/libroken__L kerberos5/lib/libwind__L \ kerberos5/lib/libheimbase__L kerberos5/lib/libheimipcc__L kerberos5/lib/libroken__L: lib/libcrypt__L kerberos5/lib/libwind__L: kerberos5/lib/libroken__L lib/libcom_err__L kerberos5/lib/libheimbase__L: lib/libthr__L kerberos5/lib/libheimipcc__L: kerberos5/lib/libroken__L kerberos5/lib/libheimbase__L lib/libthr__L .endif lib/libsqlite3__L: lib/libthr__L .if ${MK_GSSAPI} != "no" _lib_libgssapi= lib/libgssapi .endif .if ${MK_KERBEROS} != "no" _kerberos5_lib= kerberos5/lib _kerberos5_lib_libasn1= kerberos5/lib/libasn1 _kerberos5_lib_libhdb= kerberos5/lib/libhdb _kerberos5_lib_libheimbase= kerberos5/lib/libheimbase _kerberos5_lib_libkrb5= kerberos5/lib/libkrb5 _kerberos5_lib_libhx509= kerberos5/lib/libhx509 _kerberos5_lib_libroken= kerberos5/lib/libroken _kerberos5_lib_libheimntlm= kerberos5/lib/libheimntlm _libsqlite3= lib/libsqlite3 _kerberos5_lib_libheimipcc= kerberos5/lib/libheimipcc _kerberos5_lib_libwind= kerberos5/lib/libwind _libcom_err= lib/libcom_err .endif .if ${MK_NIS} != "no" _lib_libypclnt= lib/libypclnt .endif .if ${MK_OPENSSL} == "no" lib/libradius__L: lib/libmd__L .endif lib/libproc__L: \ ${_cddl_lib_libctf:D${_cddl_lib_libctf}__L} lib/libelf__L lib/librtld_db__L lib/libutil__L .if ${MK_CXX} != "no" .if ${MK_LIBCPLUSPLUS} != "no" lib/libproc__L: lib/libcxxrt__L .else # This implies MK_GNUCXX != "no"; see lib/libproc lib/libproc__L: gnu/lib/libsupc++__L .endif .endif .for _lib in 
${_prereq_libs} ${_lib}__PL: .PHONY .MAKE .if !defined(_MKSHOWCONFIG) && exists(${.CURDIR}/${_lib}) ${_+_}@${ECHODIR} "===> ${_lib} (obj,all,install)"; \ cd ${.CURDIR}/${_lib}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ obj; fi; \ ${MAKE} MK_TESTS=no MK_PROFILE=no -DNO_PIC \ DIRPRFX=${_lib}/ all; \ ${MAKE} MK_TESTS=no MK_PROFILE=no -DNO_PIC \ DIRPRFX=${_lib}/ install .endif .endfor .for _lib in ${_startup_libs} ${_prebuild_libs} ${_generic_libs} ${_lib}__L: .PHONY .MAKE .if !defined(_MKSHOWCONFIG) && exists(${.CURDIR}/${_lib}) ${_+_}@${ECHODIR} "===> ${_lib} (obj,all,install)"; \ cd ${.CURDIR}/${_lib}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ obj; fi; \ ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ all; \ ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ install .endif .endfor _prereq_libs: ${_prereq_libs:S/$/__PL/} _startup_libs: ${_startup_libs:S/$/__L/} _prebuild_libs: ${_prebuild_libs:S/$/__L/} _generic_libs: ${_generic_libs:S/$/__L/} # Enable SUBDIR_PARALLEL when not calling 'make all', unless called from # 'everything' with _PARALLEL_SUBDIR_OK set. This is because it is unlikely # that running 'make all' from the top-level, especially with a SUBDIR_OVERRIDE # or LOCAL_DIRS set, will have a reliable build if SUBDIRs are built in # parallel. This is safe for the world stage of buildworld though since it has # already built libraries in a proper order and installed includes into # WORLDTMP. Special handling is done for SUBDIR ordering for 'install*' to # avoid trashing a system if it crashes mid-install. .if !make(all) || defined(_PARALLEL_SUBDIR_OK) SUBDIR_PARALLEL= .endif .include .if make(check-old) || make(check-old-dirs) || \ make(check-old-files) || make(check-old-libs) || \ make(delete-old) || make(delete-old-dirs) || \ make(delete-old-files) || make(delete-old-libs) # # check for / delete old files section # .include "ObsoleteFiles.inc" OLD_LIBS_MESSAGE="Please be sure no application still uses those libraries, \ else you can not start such an application. Consult UPDATING for more \ information regarding how to cope with the removal/revision bump of a \ specific library." .if !defined(BATCH_DELETE_OLD_FILES) RM_I=-i .else RM_I=-v .endif delete-old-files: .PHONY @echo ">>> Removing old files (only deletes safe to delete libs)" # Ask for every old file if the user really wants to remove it. # It's annoying, but better safe than sorry. # NB: We cannot pass the list of OLD_FILES as a parameter because the # argument list will get too long. Using .for/.endfor make "loops" will make # the Makefile parser segfault. @exec 3<&0; \ cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_FILES -V "OLD_FILES:Musr/share/*.gz:R" | xargs -n1 | sort | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ chflags noschg "${DESTDIR}/$${file}" 2>/dev/null || true; \ rm ${RM_I} "${DESTDIR}/$${file}" <&3; \ fi; \ for ext in debug symbols; do \ if ! [ -e "${DESTDIR}/$${file}" ] && [ -f \ "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ rm ${RM_I} "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" \ <&3; \ fi; \ done; \ done # Remove catpages without corresponding manpages. @exec 3<&0; \ find ${DESTDIR}/usr/share/man/cat* ! -type d 2>/dev/null | sort | \ sed -ep -e's:${DESTDIR}/usr/share/man/cat:${DESTDIR}/usr/share/man/man:' | \ while read catpage; do \ read manpage; \ if [ ! 
-e "$${manpage}" ]; then \ rm ${RM_I} $${catpage} <&3; \ fi; \ done @echo ">>> Old files removed" check-old-files: .PHONY @echo ">>> Checking for old files" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_FILES -V "OLD_FILES:Musr/share/*.gz:R" | xargs -n1 | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ echo "${DESTDIR}/$${file}"; \ fi; \ for ext in debug symbols; do \ if [ -f "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}"; \ fi; \ done; \ done | sort # Check for catpages without corresponding manpages. @find ${DESTDIR}/usr/share/man/cat* ! -type d 2>/dev/null | \ sed -ep -e's:${DESTDIR}/usr/share/man/cat:${DESTDIR}/usr/share/man/man:' | \ while read catpage; do \ read manpage; \ if [ ! -e "$${manpage}" ]; then \ echo $${catpage}; \ fi; \ done | sort delete-old-libs: .PHONY @echo ">>> Removing old libraries" @echo "${OLD_LIBS_MESSAGE}" | fmt @exec 3<&0; \ cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_LIBS | xargs -n1 | sort | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ chflags noschg "${DESTDIR}/$${file}" 2>/dev/null || true; \ rm ${RM_I} "${DESTDIR}/$${file}" <&3; \ fi; \ for ext in debug symbols; do \ if ! [ -e "${DESTDIR}/$${file}" ] && [ -f \ "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ rm ${RM_I} "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" \ <&3; \ fi; \ done; \ done @echo ">>> Old libraries removed" check-old-libs: .PHONY @echo ">>> Checking for old libraries" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_LIBS | xargs -n1 | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ echo "${DESTDIR}/$${file}"; \ fi; \ for ext in debug symbols; do \ if [ -f "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}"; \ fi; \ done; \ done | sort delete-old-dirs: .PHONY @echo ">>> Removing old directories" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_DIRS | xargs -n1 | sort -r | \ while read dir; do \ if [ -d "${DESTDIR}/$${dir}" ]; then \ rmdir -v "${DESTDIR}/$${dir}" || true; \ elif [ -L "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ if [ -d "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ rmdir -v "${DESTDIR}${DEBUGDIR}/$${dir}" || true; \ elif [ -L "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ done @echo ">>> Old directories removed" check-old-dirs: .PHONY @echo ">>> Checking for old directories" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_DIRS | xargs -n1 | sort -r | \ while read dir; do \ if [ -d "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir}"; \ elif [ -L "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ if [ -d "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir}"; \ elif [ -L "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ done delete-old: delete-old-files delete-old-dirs .PHONY @echo "To remove old libraries run '${MAKE_CMD} delete-old-libs'." 
check-old: check-old-files check-old-libs check-old-dirs .PHONY @echo "To remove old files and directories run '${MAKE_CMD} delete-old'." @echo "To remove old libraries run '${MAKE_CMD} delete-old-libs'." .endif # # showconfig - show build configuration. # showconfig: .PHONY @(${MAKE} -n -f ${.CURDIR}/sys/conf/kern.opts.mk -V dummy -dg1 UPDATE_DEPENDFILE=no NO_OBJ=yes; \ ${MAKE} -n -f ${.CURDIR}/share/mk/src.opts.mk -V dummy -dg1 UPDATE_DEPENDFILE=no NO_OBJ=yes) 2>&1 | grep ^MK_ | sort -u .if !empty(KRNLOBJDIR) && !empty(KERNCONF) DTBOUTPUTPATH= ${KRNLOBJDIR}/${KERNCONF}/ .if !defined(FDT_DTS_FILE) || empty(FDT_DTS_FILE) .if !defined(_MKSHOWCONFIG) && exists(${KERNCONFDIR}/${KERNCONF}) FDT_DTS_FILE!= awk 'BEGIN {FS="="} /^makeoptions[[:space:]]+FDT_DTS_FILE/ {print $$2}' \ '${KERNCONFDIR}/${KERNCONF}' ; echo .endif .endif .endif .if !defined(DTBOUTPUTPATH) || !exists(${DTBOUTPUTPATH}) DTBOUTPUTPATH= ${.CURDIR} .endif # # Build 'standalone' Device Tree Blob # builddtb: .PHONY @PATH=${TMPPATH} MACHINE=${TARGET} \ ${.CURDIR}/sys/tools/fdt/make_dtb.sh ${.CURDIR}/sys \ "${FDT_DTS_FILE}" ${DTBOUTPUTPATH} ############### # cleanworld # In the following, the first 'rm' in a series will usually remove all # files and directories. If it does not, then there are probably some # files with file flags set, so this unsets them and tries the 'rm' a # second time. There are situations where this target will be cleaning # some directories via more than one method, but that duplication is # needed to correctly handle all the possible situations. Removing all # files without file flags set in the first 'rm' instance saves time, # because 'chflags' will need to operate on fewer files afterwards. # # It is expected that BW_CANONICALOBJDIR == the CANONICALOBJDIR as would be # created by bsd.obj.mk, except that we don't want to .include that file # in this makefile. We don't do a cleandir walk if MK_AUTO_OBJ is yes # since it is not possible for files to land in the wrong place. # .if make(cleanworld) BW_CANONICALOBJDIR:=${OBJTOP}/ .elif make(cleanuniverse) BW_CANONICALOBJDIR:=${OBJROOT} .if ${MK_UNIFIED_OBJDIR} == "no" .error ${.TARGETS} only supported with WITH_UNIFIED_OBJDIR enabled. 
.endif .endif cleanworld cleanuniverse: .PHONY .if !empty(BW_CANONICALOBJDIR) && exists(${BW_CANONICALOBJDIR}) && \ ${.CURDIR:tA} != ${BW_CANONICALOBJDIR:tA} -rm -rf ${BW_CANONICALOBJDIR}* -chflags -R 0 ${BW_CANONICALOBJDIR} rm -rf ${BW_CANONICALOBJDIR}* .endif .if make(cleanworld) && ${MK_AUTO_OBJ} == "no" && \ (empty(BW_CANONICALOBJDIR) || ${.CURDIR:tA} == ${BW_CANONICALOBJDIR:tA}) .if ${.CURDIR} == ${.OBJDIR} || ${.CURDIR}/obj == ${.OBJDIR} # To be safe in this case, fall back to a 'make cleandir' ${_+_}@cd ${.CURDIR}; ${MAKE} cleandir .endif .endif .if ${TARGET} == ${MACHINE} && ${TARGET_ARCH} == ${MACHINE_ARCH} XDEV_CPUTYPE?=${CPUTYPE} .else XDEV_CPUTYPE?=${TARGET_CPUTYPE} .endif NOFUN=-DNO_FSCHG MK_HTML=no -DNO_LINT \ MK_MAN=no MK_NLS=no MK_PROFILE=no \ MK_KERBEROS=no MK_RESCUE=no MK_TESTS=no MK_WARNS=no \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ CPUTYPE=${XDEV_CPUTYPE} XDDIR=${TARGET_ARCH}-freebsd XDTP?=/usr/${XDDIR} .if ${XDTP:N/*} .error XDTP variable should be an absolute path .endif CDBOBJROOT= ${OBJROOT}${MACHINE}.${MACHINE_ARCH}/xdev/ CDBOBJTOP= ${CDBOBJROOT}${XDDIR} CDBENV= \ INSTALL="sh ${.CURDIR}/tools/install.sh" CDENV= ${CDBENV} \ TOOLS_PREFIX=${XDTP} CDMAKEARGS= \ OBJTOP=${CDBOBJTOP:Q} \ OBJROOT=${CDBOBJROOT:Q} CD2MAKEARGS= ${CDMAKEARGS} .if ${WANT_COMPILER_TYPE} == gcc || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == gcc) # GCC requires -isystem and -L when using a cross-compiler. --sysroot # won't set header path and -L is used to ensure the base library path # is added before the port PREFIX library path. CD2CFLAGS+= -isystem ${XDDESTDIR}/usr/include -L${XDDESTDIR}/usr/lib # GCC requires -B to find /usr/lib/crti.o when using a cross-compiler # combined with --sysroot. CD2CFLAGS+= -B${XDDESTDIR}/usr/lib # Force using libc++ for external GCC. .if defined(X_COMPILER_TYPE) && \ ${X_COMPILER_TYPE} == gcc && ${X_COMPILER_VERSION} >= 40800 CD2CXXFLAGS+= -isystem ${XDDESTDIR}/usr/include/c++/v1 -std=c++11 \ -nostdinc++ .endif .endif CD2CFLAGS+= --sysroot=${XDDESTDIR}/ CD2ENV=${CDENV} CC="${CC} ${CD2CFLAGS}" CXX="${CXX} ${CD2CXXFLAGS} ${CD2CFLAGS}" \ CPP="${CPP} ${CD2CFLAGS}" \ MACHINE=${TARGET} MACHINE_ARCH=${TARGET_ARCH} CDTMP= ${OBJTOP}/${XDDIR}/tmp CDMAKE=${CDENV} PATH=${CDTMP}/usr/bin:${PATH} ${MAKE} ${CDMAKEARGS} ${NOFUN} CD2MAKE=${CD2ENV} PATH=${CDTMP}/usr/bin:${XDDESTDIR}/usr/bin:${PATH} \ ${MAKE} ${CD2MAKEARGS} ${NOFUN} .if ${MK_META_MODE} != "no" # Don't rebuild build-tools targets during normal build. 
CD2MAKE+= BUILD_TOOLS_META=.NOMETA .endif XDDESTDIR=${DESTDIR}${XDTP} .ORDER: xdev-build xdev-install xdev-links xdev: xdev-build xdev-install .PHONY .ORDER: _xb-worldtmp _xb-bootstrap-tools _xb-build-tools _xb-cross-tools xdev-build: _xb-worldtmp _xb-bootstrap-tools _xb-build-tools _xb-cross-tools .PHONY _xb-worldtmp: .PHONY mkdir -p ${CDTMP}/usr ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${CDTMP}/usr >/dev/null _xb-bootstrap-tools: .PHONY .for _tool in \ ${_clang_tblgen} \ ${_gperf} \ ${_yacc} ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${CDMAKE} DIRPRFX=${_tool}/ obj; fi; \ ${CDMAKE} DIRPRFX=${_tool}/ all; \ ${CDMAKE} DIRPRFX=${_tool}/ DESTDIR=${CDTMP} install .endfor _xb-build-tools: .PHONY ${_+_}@cd ${.CURDIR}; \ ${CDBENV} ${MAKE} ${CDMAKEARGS} -f Makefile.inc1 ${NOFUN} build-tools XDEVDIRS= \ ${_clang_libs} \ ${_lld} \ ${_binutils} \ ${_elftctools} \ usr.bin/ar \ ${_clang} \ ${_gcc} _xb-cross-tools: .PHONY .for _tool in ${XDEVDIRS} ${_+_}@${ECHODIR} "===> xdev ${_tool} (obj,all)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${CDMAKE} DIRPRFX=${_tool}/ obj; fi; \ ${CDMAKE} DIRPRFX=${_tool}/ all .endfor _xi-mtree: .PHONY ${_+_}@${ECHODIR} "mtree populating ${XDDESTDIR}" mkdir -p ${XDDESTDIR} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.root.dist \ -p ${XDDESTDIR} >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${XDDESTDIR}/usr >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${XDDESTDIR}/usr/include >/dev/null .if defined(LIBCOMPAT) ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${XDDESTDIR}/usr >/dev/null .endif .if ${MK_TESTS} != "no" mkdir -p ${XDDESTDIR}${TESTSBASE} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${XDDESTDIR}${TESTSBASE} >/dev/null .endif .ORDER: xdev-build _xi-mtree _xi-cross-tools _xi-includes _xi-libraries xdev-install: xdev-build _xi-mtree _xi-cross-tools _xi-includes _xi-libraries .PHONY _xi-cross-tools: .PHONY @echo "_xi-cross-tools" .for _tool in ${XDEVDIRS} ${_+_}@${ECHODIR} "===> xdev ${_tool} (install)"; \ cd ${.CURDIR}/${_tool}; \ ${CDMAKE} DIRPRFX=${_tool}/ install DESTDIR=${XDDESTDIR} .endfor _xi-includes: .PHONY .if !defined(NO_OBJWALK) ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 _obj \ DESTDIR=${XDDESTDIR} .endif ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 includes \ DESTDIR=${XDDESTDIR} _xi-libraries: .PHONY ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 libraries \ DESTDIR=${XDDESTDIR} xdev-links: .PHONY ${_+_}cd ${XDDESTDIR}/usr/bin; \ mkdir -p ../../../../usr/bin; \ for i in *; do \ ln -sf ../../${XDTP}/usr/bin/$$i \ ../../../../usr/bin/${XDDIR}-$$i; \ ln -sf ../../${XDTP}/usr/bin/$$i \ ../../../../usr/bin/${XDDIR}${_REVISION}-$$i; \ done Index: projects/clang900-import/cddl/contrib/opensolaris/cmd/zfs/zfs.8 =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris/cmd/zfs/zfs.8 (revision 352536) +++ projects/clang900-import/cddl/contrib/opensolaris/cmd/zfs/zfs.8 (revision 352537) @@ -1,3918 +1,3937 @@ '\" te .\" Copyright (c) 2013, Martin Matuska . .\" All Rights Reserved. .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. 
.\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" Copyright (c) 2010, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2011, 2014 by Delphix. All rights reserved. .\" Copyright (c) 2011, Pawel Jakub Dawidek .\" Copyright (c) 2012, Glen Barber .\" Copyright (c) 2012, Bryan Drewery .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2013, Steven Hartland .\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright (c) 2014, Xin LI .\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. .\" Copyright 2018 Joyent, Inc. .\" Copyright (c) 2018 Datto Inc. .\" .\" $FreeBSD$ .\" .Dd February 15, 2018 .Dt ZFS 8 .Os .Sh NAME .Nm zfs .Nd configures ZFS file systems .Sh SYNOPSIS .Nm .Op Fl \&? .Nm .Cm create .Op Fl pu .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... Ar filesystem .Nm .Cm create .Op Fl ps .Op Fl b Ar blocksize .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Fl V .Ar size volume .Nm .Cm destroy .Op Fl fnpRrv .Ar filesystem Ns | Ns Ar volume .Nm .Cm destroy .Op Fl dnpRrv .Sm off .Ar filesystem Ns | Ns volume .Ns @snap .Op % Ns Ar snap .Op , Ns Ar snap Op % Ns Ar snap .Op , Ns ... .Sm on .Nm .Cm destroy .Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark .Nm .Cm snapshot Ns | Ns Cm snap .Op Fl r .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem@snapname Ns | Ns Ar volume@snapname .Ar filesystem@snapname Ns | Ns Ar volume@snapname Ns ... .Nm .Cm rollback .Op Fl rRf .Ar snapshot .Nm .Cm clone .Op Fl p .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar snapshot filesystem Ns | Ns Ar volume .Nm .Cm promote .Ar clone-filesystem .Nm .Cm rename .Op Fl f .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm .Cm rename .Op Fl f .Fl p .Ar filesystem Ns | Ns Ar volume .Ar filesystem Ns | Ns Ar volume .Nm .Cm rename .Fl r .Ar snapshot snapshot .Nm .Cm rename .Fl u .Op Fl p .Ar filesystem filesystem .Nm .Cm list .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl o Ar property Ns Oo , Ns property Ns Oc Ns ... .Op Fl t Ar type Ns Oo , Ns type Ns Oc Ns ... .Oo Fl s Ar property Oc Ns ... .Oo Fl S Ar property Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot | Ns Ar bookmark Ns ... .Nm .Cm remap .Ar filesystem Ns | Ns Ar volume .Nm .Cm set .Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... .Nm .Cm get .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl o Ar all | field Ns Oo , Ns Ar field Oc Ns ... .Op Fl t Ar type Ns Oo Ns , Ar type Oc Ns ... .Op Fl s Ar source Ns Oo Ns , Ns Ar source Oc Ns ... .Ar all | property Ns Oo Ns , Ns Ar property Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... .Nm .Cm inherit .Op Fl rS .Ar property .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... 
.Nm .Cm upgrade .Op Fl v .Nm .Cm upgrade .Op Fl r .Op Fl V Ar version .Fl a | Ar filesystem .Nm .Cm userspace .Op Fl Hinp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... .Oo Fl s Ar field Oc Ns ... .Oo Fl S Ar field Oc Ns ... .Op Fl t Ar type Ns Oo Ns , Ns Ar type Oc Ns ... .Ar filesystem Ns | Ns Ar snapshot .Nm .Cm groupspace .Op Fl Hinp .Op Fl o Ar field Ns Oo , Ns field Oc Ns ... .Oo Fl s Ar field Oc Ns ... .Oo Fl S Ar field Oc Ns ... .Op Fl t Ar type Ns Oo Ns , Ns Ar type Oc Ns ... .Ar filesystem Ns | Ns Ar snapshot .Nm .Cm mount .Nm .Cm mount .Op Fl vO .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Fl a | Ar filesystem .Nm .Cm unmount Ns | Ns Cm umount .Op Fl f .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Nm .Cm share .Fl a | Ar filesystem .Nm .Cm unshare .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Nm .Cm bookmark .Ar snapshot .Ar bookmark .Nm .Cm send .Op Fl DLPRVcenpv .Op Fl i Ar snapshot | Fl I Ar snapshot .Ar snapshot .Nm .Cm send -.Op Fl Lce -.Op Fl i Ar snapshot Ns | Ns bookmark +.Op Fl LPcenv +.Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm .Cm send .Op Fl PVenv .Fl t Ar receive_resume_token .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnsFu .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnsFu .Op Fl d | e .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem .Nm .Cm receive Ns | Ns Cm recv .Fl A .Ar filesystem Ns | Ns Ar volume .Nm .Cm allow .Ar filesystem Ns | Ns Ar volume .Nm .Cm allow .Op Fl ldug .Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... .Ar perm Ns | Ns Ar @setname Ns .Oo Ns , Ns Ar perm Ns | Ns Ar @setname Oc Ns ... .Ar filesystem Ns | Ns Ar volume .Nm .Cm allow .Op Fl ld .Fl e Ns | Ns Cm everyone .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Nm .Cm allow .Fl c .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Nm .Cm allow .Fl s .Ar @setname .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Nm .Cm unallow .Op Fl rldug .Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Nm .Cm unallow .Op Fl rld .Fl e Ns | Ns Cm everyone .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Nm .Cm unallow .Op Fl r .Fl c .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Nm .Cm unallow .Op Fl r .Fl s .Ar @setname .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Nm .Cm hold .Op Fl r .Ar tag snapshot Ns ... .Nm .Cm holds .Op Fl Hp .Op Fl r Ns | Ns Fl d Ar depth .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns .Ns ... .Nm .Cm release .Op Fl r .Ar tag snapshot Ns ... .Nm .Cm diff .Op Fl FHt .Ar snapshot .Op Ar snapshot Ns | Ns Ar filesystem .Nm .Cm program .Op Fl jn .Op Fl t Ar timeout .Op Fl m Ar memory_limit .Ar pool script .Op Ar arg1 No ... .Nm .Cm jail .Ar jailid Ns | Ns Ar jailname filesystem .Nm .Cm unjail .Ar jailid Ns | Ns Ar jailname filesystem .Sh DESCRIPTION The .Nm command configures .Tn ZFS datasets within a .Tn ZFS storage pool, as described in .Xr zpool 8 . 
A dataset is identified by a unique path within the .Tn ZFS namespace. For example: .Bd -ragged -offset 4n .No pool/ Ns Brq filesystem,volume,snapshot .Ed .Pp where the maximum length of a dataset name is .Dv MAXNAMELEN (256 bytes) and the maximum amount of nesting allowed in a path is 50 levels deep. .Pp A dataset can be one of the following: .Bl -hang -width 12n .It Sy file system A .Tn ZFS dataset of type .Em filesystem can be mounted within the standard system namespace and behaves like other file systems. While .Tn ZFS file systems are designed to be .Tn POSIX compliant, known issues exist that prevent compliance in some cases. Applications that depend on standards conformance might fail due to nonstandard behavior when checking file system free space. .It Sy volume A logical volume exported as a raw or block device. This type of dataset should only be used under special circumstances. File systems are typically used in most environments. .It Sy snapshot A read-only version of a file system or volume at a given point in time. It is specified as .Em filesystem@name or .Em volume@name . .El .Ss ZFS File System Hierarchy A .Tn ZFS storage pool is a logical collection of devices that provide space for datasets. A storage pool is also the root of the .Tn ZFS file system hierarchy. .Pp The root of the pool can be accessed as a file system, such as mounting and unmounting, taking snapshots, and setting properties. The physical storage characteristics, however, are managed by the .Xr zpool 8 command. .Pp See .Xr zpool 8 for more information on creating and administering pools. .Ss Snapshots A snapshot is a read-only copy of a file system or volume. Snapshots can be created extremely quickly, and initially consume no additional space within the pool. As data within the active dataset changes, the snapshot consumes more data than would otherwise be shared with the active dataset. .Pp Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, but cannot be accessed independently. .Pp File system snapshots can be accessed under the .Pa \&.zfs/snapshot directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the .Pa \&.zfs directory can be controlled by the .Sy snapdir property. .Ss Clones A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space. .Pp Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The .Sy origin property exposes this dependency, and the .Cm destroy command lists any such dependencies, if they exist. .Pp The clone parent-child dependency relationship can be reversed by using the .Cm promote subcommand. This causes the "origin" file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone was created from. .Ss Mount Points Creating a .Tn ZFS file system is a simple operation, so the number of file systems per system is likely to be numerous. To cope with this, .Tn ZFS automatically manages mounting and unmounting file systems without the need to edit the .Pa /etc/fstab file. 
All automatically managed file systems are mounted by .Tn ZFS at boot time. .Pp By default, file systems are mounted under .Pa /path , where .Ar path is the name of the file system in the .Tn ZFS namespace. Directories are created and destroyed as needed. .Pp A file system can also have a mount point set in the .Sy mountpoint property. This directory is created as needed, and .Tn ZFS automatically mounts the file system when the .Qq Nm Cm mount Fl a command is invoked (without editing .Pa /etc/fstab ) . The .Sy mountpoint property can be inherited, so if .Em pool/home has a mount point of .Pa /home , then .Em pool/home/user automatically inherits a mount point of .Pa /home/user . .Pp A file system .Sy mountpoint property of .Cm none prevents the file system from being mounted. .Pp If needed, .Tn ZFS file systems can also be managed with traditional tools .Pq Xr mount 8 , Xr umount 8 , Xr fstab 5 . If a file system's mount point is set to .Cm legacy , .Tn ZFS makes no attempt to manage the file system, and the administrator is responsible for mounting and unmounting the file system. .Ss Jails .No A Tn ZFS dataset can be attached to a jail by using the .Qq Nm Cm jail subcommand. You cannot attach a dataset to one jail and the children of the same dataset to another jail. You can also not attach the root file system of the jail or any dataset which needs to be mounted before the zfs rc script is run inside the jail, as it would be attached unmounted until it is mounted from the rc script inside the jail. To allow management of the dataset from within a jail, the .Sy jailed property has to be set and the jail needs access to the .Pa /dev/zfs device. The .Sy quota property cannot be changed from within a jail. See .Xr jail 8 for information on how to allow mounting .Tn ZFS datasets from within a jail. .Pp .No A Tn ZFS dataset can be detached from a jail using the .Qq Nm Cm unjail subcommand. .Pp After a dataset is attached to a jail and the jailed property is set, a jailed file system cannot be mounted outside the jail, since the jail administrator might have set the mount point to an unacceptable value. .Ss Deduplication Deduplication is the process for removing redundant data at the block-level, reducing the total amount of data stored. If a file system has the .Cm dedup property enabled, duplicate data blocks are removed synchronously. The result is that only unique data is stored and common components are shared among files. .Ss Native Properties Properties are divided into two types, native properties and user-defined (or "user") properties. Native properties either export internal statistics or control .Tn ZFS behavior. In addition, native properties are either editable or read-only. User properties have no effect on .Tn ZFS behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the .Qq Sx User Properties section, below. .Pp Every dataset has a set of properties that export statistics about the dataset as well as control various behaviors. Properties are inherited from the parent unless overridden by the child. Some properties apply only to certain types of datasets (file systems, volumes, or snapshots). .Pp The values of numeric properties can be specified using human-readable suffixes (for example, .Sy k , KB , M , Gb , and so forth, up to .Sy Z for zettabyte). 
The following are all valid (and equal) specifications: .Bd -ragged -offset 4n 1536M, 1.5g, 1.50GB .Ed .Pp The values of non-numeric properties are case sensitive and must be lowercase, except for .Sy mountpoint , sharenfs , No and Sy sharesmb . .Pp The following native properties consist of read-only statistics about the dataset. These properties can be neither set, nor inherited. Native properties apply to all dataset types unless otherwise noted. .Bl -tag -width 2n .It Sy available The amount of space available to the dataset and all its children, assuming that there is no other activity in the pool. Because space is shared within a pool, availability can be limited by any number of factors, including physical pool size, quotas, reservations, or other datasets within the pool. .Pp This property can also be referred to by its shortened column name, .Sy avail . .It Sy compressratio For non-snapshots, the compression ratio achieved for the .Sy used space of this dataset, expressed as a multiplier. The .Sy used property includes descendant datasets, and, for clones, does not include the space shared with the origin snapshot. For snapshots, the .Sy compressratio is the same as the .Sy refcompressratio property. Compression can be turned on by running: .Qq Nm Cm set compression=on Ar dataset The default value is .Cm off . .It Sy createtxg The transaction group (txg) in which the dataset was created. Bookmarks have the same .Sy createtxg as the snapshot they are initially tied to. This property is suitable for ordering a list of snapshots, e.g. for incremental send and receive. .It Sy creation The time this dataset was created. .It Sy clones For snapshots, this property is a comma-separated list of filesystems or volumes which are clones of this snapshot. The clones' .Sy origin property is this snapshot. If the .Sy clones property is not empty, then this snapshot can not be destroyed (even with the .Fl r or .Fl f options). .It Sy defer_destroy This property is .Cm on if the snapshot has been marked for deferred destroy by using the .Qq Nm Cm destroy -d command. Otherwise, the property is .Cm off . .It Sy filesystem_count The total number of filesystems and volumes that exist under this location in the dataset tree. This value is only available when a .Sy filesystem_limit has been set somewhere in the tree under which the dataset resides. .It Sy guid The 64 bit GUID of this dataset or bookmark which does not change over its entire lifetime. When a snapshot is sent to another pool, the received snapshot has the same GUID. Thus, the .Sy guid is suitable to identify a snapshot across pools. .It Sy logicalreferenced The amount of space that is .Qq logically accessible by this dataset. See the .Sy referenced property. The logical space ignores the effect of the .Sy compression and .Sy copies properties, giving a quantity closer to the amount of data that applications see. However, it does include space consumed by metadata. .Pp This property can also be referred to by its shortened column name, .Sy lrefer . .It Sy logicalused The amount of space that is .Qq logically consumed by this dataset and all its descendents. See the .Sy used property. The logical space ignores the effect of the .Sy compression and .Sy copies properties, giving a quantity closer to the amount of data that applications see. .Pp This property can also be referred to by its shortened column name, .Sy lused . .It Sy mounted For file systems, indicates whether the file system is currently mounted. 
This property can be either .Cm yes or .Cm no . .It Sy origin For cloned file systems or volumes, the snapshot from which the clone was created. See also the .Sy clones property. .It Sy receive_resume_token For filesystems or volumes which have saved partially-completed state from .Sy zfs receive -s , this opaque token can be provided to .Sy zfs send -t to resume and complete the .Sy zfs receive . .It Sy referenced The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. When a snapshot or clone is created, it initially references the same amount of space as the file system or snapshot it was created from, since its contents are identical. .Pp This property can also be referred to by its shortened column name, .Sy refer . .It Sy refcompressratio The compression ratio achieved for the .Sy referenced space of this dataset, expressed as a multiplier. See also the .Sy compressratio property. .It Sy snapshot_count The total number of snapshots that exist under this location in the dataset tree. This value is only available when a .Sy snapshot_limit has been set somewhere in the tree under which the dataset resides. .It Sy type The type of dataset: .Sy filesystem , volume , No or Sy snapshot . .It Sy used The amount of space consumed by this dataset and all its descendents. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendent datasets. The amount of space that a dataset consumes from its parent, as well as the amount of space that are freed if this dataset is recursively destroyed, is the greater of its space used and its reservation. .Pp When snapshots (see the .Qq Sx Snapshots section) are created, their space is initially shared between the snapshot and the file system, and possibly with previous snapshots. As the file system changes, space that was previously shared becomes unique to the snapshot, and counted in the snapshot's space used. Additionally, deleting snapshots can increase the amount of space unique to (and used by) other snapshots. .Pp The amount of space used, available, or referenced does not take into account pending changes. Pending changes are generally accounted for within a few seconds. Committing a change to a disk using .Xr fsync 2 or .Sy O_SYNC does not necessarily guarantee that the space usage information is updated immediately. .It Sy usedby* The .Sy usedby* properties decompose the .Sy used properties into the various reasons that space is used. Specifically, .Sy used No = .Sy usedbysnapshots + usedbydataset + usedbychildren + usedbyrefreservation . These properties are only available for datasets created with .Tn ZFS pool version 13 pools and higher. .It Sy usedbysnapshots The amount of space consumed by snapshots of this dataset. In particular, it is the amount of space that would be freed if all of this dataset's snapshots were destroyed. Note that this is not simply the sum of the snapshots' .Sy used properties because space can be shared by multiple snapshots. .It Sy usedbydataset The amount of space used by this dataset itself, which would be freed if the dataset were destroyed (after first removing any .Sy refreservation and destroying any necessary snapshots or descendents). .It Sy usedbychildren The amount of space used by children of this dataset, which would be freed if all the dataset's children were destroyed. 
.It Sy usedbyrefreservation The amount of space used by a .Sy refreservation set on this dataset, which would be freed if the .Sy refreservation was removed. .It Sy userused@ Ns Ar user The amount of space consumed by the specified user in this dataset. Space is charged to the owner of each file, as displayed by .Qq Nm ls Fl l . The amount of space charged is displayed by .Qq Nm du and .Qq Nm ls Fl s . See the .Qq Nm Cm userspace subcommand for more information. .Pp Unprivileged users can access only their own space usage. The root user, or a user who has been granted the .Sy userused privilege with .Qq Nm Cm allow , can access everyone's usage. .Pp The .Sy userused@ Ns ... properties are not displayed by .Qq Nm Cm get all . The user's name must be appended after the .Sy @ symbol, using one of the following forms: .Bl -bullet -offset 2n .It POSIX name (for example, .Em joe ) .It POSIX numeric ID (for example, .Em 1001 ) .El .It Sy userrefs This property is set to the number of user holds on this snapshot. User holds are set by using the .Qq Nm Cm hold command. .It Sy groupused@ Ns Ar group The amount of space consumed by the specified group in this dataset. Space is charged to the group of each file, as displayed by .Nm ls Fl l . See the .Sy userused@ Ns Ar user property for more information. .Pp Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the .Sy groupused privilege with .Qq Nm Cm allow , can access all groups' usage. .It Sy volblocksize Ns = Ns Ar blocksize For volumes, specifies the block size of the volume. The .Ar blocksize cannot be changed once the volume has been written, so it should be set at volume creation time. The default .Ar blocksize for volumes is 8 Kbytes. Any power of 2 from 512 bytes to 128 Kbytes is valid. .Pp This property can also be referred to by its shortened column name, .Sy volblock . .It Sy written The amount of .Sy referenced space written to this dataset since the previous snapshot. .It Sy written@ Ns Ar snapshot The amount of .Sy referenced space written to this dataset since the specified snapshot. This is the space that is referenced by this dataset but was not referenced by the specified snapshot. .Pp The .Ar snapshot may be specified as a short snapshot name (just the part after the .Sy @ ) , in which case it will be interpreted as a snapshot in the same filesystem as this dataset. The .Ar snapshot may be a full snapshot name .Pq Em filesystem@snapshot , which for clones may be a snapshot in the origin's filesystem (or the origin of the origin's filesystem, etc). .El .Pp The following native properties can be used to change the behavior of a .Tn ZFS dataset. .Bl -tag -width 2n .It Xo .Sy aclinherit Ns = Ns Cm discard | .Cm noallow | .Cm restricted | .Cm passthrough | .Cm passthrough-x .Xc Controls how .Tn ACL entries are inherited when files and directories are created. A file system with an .Sy aclinherit property of .Cm discard does not inherit any .Tn ACL entries. A file system with an .Sy aclinherit property value of .Cm noallow only inherits inheritable .Tn ACL entries that specify "deny" permissions. The property value .Cm restricted (the default) removes the .Em write_acl and .Em write_owner permissions when the .Tn ACL entry is inherited. A file system with an .Sy aclinherit property value of .Cm passthrough inherits all inheritable .Tn ACL entries without any modifications made to the .Tn ACL entries when they are inherited. 
A file system with an .Sy aclinherit property value of .Cm passthrough-x has the same meaning as .Cm passthrough , except that the .Em owner@ , group@ , No and Em everyone@ Tn ACE Ns s inherit the execute permission only if the file creation mode also requests the execute bit. .Pp When the property value is set to .Cm passthrough , files are created with a mode determined by the inheritable .Tn ACE Ns s. If no inheritable .Tn ACE Ns s exist that affect the mode, then the mode is set in accordance to the requested mode from the application. .It Sy aclmode Ns = Ns Cm discard | groupmask | passthrough | restricted Controls how an .Tn ACL is modified during .Xr chmod 2 . A file system with an .Sy aclmode property of .Cm discard (the default) deletes all .Tn ACL entries that do not represent the mode of the file. An .Sy aclmode property of .Cm groupmask reduces permissions granted in all .Em ALLOW entries found in the .Tn ACL such that they are no greater than the group permissions specified by .Xr chmod 2 . A file system with an .Sy aclmode property of .Cm passthrough indicates that no changes are made to the .Tn ACL other than creating or updating the necessary .Tn ACL entries to represent the new mode of the file or directory. An .Sy aclmode property of .Cm restricted will cause the .Xr chmod 2 operation to return an error when used on any file or directory which has a non-trivial .Tn ACL whose entries can not be represented by a mode. .Xr chmod 2 is required to change the set user ID, set group ID, or sticky bits on a file or directory, as they do not have equivalent .Tn ACL entries. In order to use .Xr chmod 2 on a file or directory with a non-trivial .Tn ACL when .Sy aclmode is set to .Cm restricted , you must first remove all .Tn ACL entries which do not represent the current mode. .It Sy atime Ns = Ns Cm on | off Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value is .Cm on . .It Sy canmount Ns = Ns Cm on | off | noauto If this property is set to .Cm off , the file system cannot be mounted, and is ignored by .Qq Nm Cm mount Fl a . Setting this property to .Cm off is similar to setting the .Sy mountpoint property to .Cm none , except that the dataset still has a normal .Sy mountpoint property, which can be inherited. Setting this property to .Cm off allows datasets to be used solely as a mechanism to inherit properties. One example of setting .Sy canmount Ns = Ns Cm off is to have two datasets with the same .Sy mountpoint , so that the children of both datasets appear in the same directory, but might have different inherited characteristics. .Pp When the .Cm noauto value is set, a dataset can only be mounted and unmounted explicitly. The dataset is not mounted automatically when the dataset is created or imported, nor is it mounted by the .Qq Nm Cm mount Fl a command or unmounted by the .Qq Nm Cm umount Fl a command. .Pp This property is not inherited. .It Sy checksum Ns = Ns Cm on | off | fletcher2 | fletcher4 | sha256 | noparity | sha512 | skein Controls the checksum used to verify data integrity. The default value is .Cm on , which automatically selects an appropriate algorithm (currently, .Cm fletcher4 , but this may change in future releases). The value .Cm off disables integrity checking on user data. 
The value .Cm noparity not only disables integrity but also disables maintaining parity for user data. This setting is used internally by a dump device residing on a RAID-Z pool and should not be used by any other dataset. Disabling checksums is .Em NOT a recommended practice. The .Sy sha512 and .Sy skein checksum algorithms require enabling the appropriate features on the pool. Please see .Xr zpool-features 7 for more information on these algorithms. .Pp Changing this property affects only newly-written data. .Pp Salted checksum algorithms .Pq Cm edonr , skein are currently not supported for any filesystem on the boot pools. .It Sy compression Ns = Ns Cm on | off | lzjb | gzip | gzip- Ns Ar N | Cm zle | Cm lz4 Controls the compression algorithm used for this dataset. Setting compression to .Cm on indicates that the current default compression algorithm should be used. The default balances compression and decompression speed with compression ratio, and is expected to work well on a wide variety of workloads. Unlike all other settings for this property, .Cm on does not select a fixed compression type. As new compression algorithms are added to ZFS and enabled on a pool, the default compression algorithm may change. The current default compression algorithm is either .Cm lzjb or, if the .Sy lz4_compress feature is enabled, .Cm lz4 . The .Cm lzjb compression algorithm is optimized for performance while providing decent data compression. Setting compression to .Cm on uses the .Cm lzjb compression algorithm. The .Cm gzip compression algorithm uses the same compression as the .Xr gzip 1 command. You can specify the .Cm gzip level by using the value .Cm gzip- Ns Ar N where .Ar N is an integer from 1 (fastest) to 9 (best compression ratio). Currently, .Cm gzip is equivalent to .Cm gzip-6 (which is also the default for .Xr gzip 1 ) . The .Cm zle compression algorithm compresses runs of zeros. .Pp The .Sy lz4 compression algorithm is a high-performance replacement for the .Sy lzjb algorithm. It features significantly faster compression and decompression, as well as a moderately higher compression ratio than .Sy lzjb , but can only be used on pools with the .Sy lz4_compress feature set to .Sy enabled . See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy lz4_compress feature. .Pp This property can also be referred to by its shortened column name .Cm compress . Changing this property affects only newly-written data. .It Sy copies Ns = Ns Cm 1 | 2 | 3 Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool, for example, mirroring or RAID-Z. The copies are stored on different disks, if possible. The space used by multiple copies is charged to the associated file and dataset, changing the .Sy used property and counting against quotas and reservations. .Pp Changing this property only affects newly-written data. Therefore, set this property at file system creation time by using the .Fl o Cm copies= Ns Ar N option. .It Sy dedup Ns = Ns Cm on | off | verify | sha256 Ns Oo Cm ,verify Oc | Sy sha512 Ns Oo Cm ,verify Oc | Sy skein Ns Oo Cm ,verify Oc Configures deduplication for a dataset. The default value is .Cm off . The default deduplication checksum is .Cm sha256 (this may change in the future). When .Sy dedup is enabled, the checksum defined here overrides the .Sy checksum property. Setting the value to .Cm verify has the same effect as the setting .Cm sha256,verify .
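.Pp
As a brief illustration (the dataset name
.Em tank/data
below is only a placeholder), deduplication with explicit verification might be enabled with:
.Bd -literal -offset 4n
# zfs set dedup=sha512,verify tank/data
.Ed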
.Pp If set to .Cm verify , .Tn ZFS will do a byte-by-byte comparison when two blocks have the same signature, to make sure the block contents are identical. .It Sy devices Ns = Ns Cm on | off The .Sy devices property is currently not supported on .Fx . .It Sy exec Ns = Ns Cm on | off Controls whether processes can be executed from within this file system. The default value is .Cm on . .It Sy mlslabel Ns = Ns Ar label | Cm none The .Sy mlslabel property is currently not supported on .Fx . .It Sy filesystem_limit Ns = Ns Ar count | Cm none Limits the number of filesystems and volumes that can exist under this point in the dataset tree. The limit is not enforced if the user is allowed to change the limit. Setting a .Sy filesystem_limit on a descendent of a filesystem that already has a .Sy filesystem_limit does not override the ancestor's .Sy filesystem_limit , but rather imposes an additional limit. This feature must be enabled to be used .Po see .Xr zpool-features 7 .Pc . .It Sy mountpoint Ns = Ns Ar path | Cm none | legacy Controls the mount point used for this file system. See the .Qq Sx Mount Points section for more information on how this property is used. .Pp When the .Sy mountpoint property is changed for a file system, the file system and any children that inherit the mount point are unmounted. If the new value is .Cm legacy , then they remain unmounted. Otherwise, they are automatically remounted in the new location if the property was previously .Cm legacy or .Cm none , or if they were mounted before the property was changed. In addition, any shared file systems are unshared and shared in the new location. .It Sy nbmand Ns = Ns Cm on | off The .Sy nbmand property is currently not supported on .Fx . .It Sy primarycache Ns = Ns Cm all | none | metadata Controls what is cached in the primary cache (ARC). If this property is set to .Cm all , then both user data and metadata is cached. If this property is set to .Cm none , then neither user data nor metadata is cached. If this property is set to .Cm metadata , then only metadata is cached. The default value is .Cm all . .It Sy quota Ns = Ns Ar size | Cm none Limits the amount of space a dataset and its descendents can consume. This property enforces a hard limit on the amount of space used. This includes all space consumed by descendents, including file systems and snapshots. Setting a quota on a descendent of a dataset that already has a quota does not override the ancestor's quota, but rather imposes an additional limit. .Pp Quotas cannot be set on volumes, as the .Sy volsize property acts as an implicit quota. .It Sy snapshot_limit Ns = Ns Ar count | Cm none Limits the number of snapshots that can be created on a dataset and its descendents. Setting a .Sy snapshot_limit on a descendent of a dataset that already has a .Sy snapshot_limit does not override the ancestor's .Sy snapshot_limit , but rather imposes an additional limit. The limit is not enforced if the user is allowed to change the limit. For example, this means that recursive snapshots taken from the global zone are counted against each delegated dataset within a jail. This feature must be enabled to be used .Po see .Xr zpool-features 7 .Pc . .It Sy userquota@ Ns Ar user Ns = Ns Ar size | Cm none Limits the amount of space consumed by the specified user. Similar to the .Sy refquota property, the .Sy userquota space calculation does not include space that is used by descendent datasets, such as snapshots and clones.
User space consumption is identified by the .Sy userspace@ Ns Ar user property. .Pp Enforcement of user quotas may be delayed by several seconds. This delay means that a user might exceed their quota before the system notices that they are over quota and begins to refuse additional writes with the .Em EDQUOT error message. See the .Cm userspace subcommand for more information. .Pp Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the .Sy userquota privilege with .Qq Nm Cm allow , can get and set everyone's quota. .Pp This property is not available on volumes, on file systems before version 4, or on pools before version 15. The .Sy userquota@ Ns ... properties are not displayed by .Qq Nm Cm get all . The user's name must be appended after the .Sy @ symbol, using one of the following forms: .Bl -bullet -offset 2n .It POSIX name (for example, .Em joe ) .It POSIX numeric ID (for example, .Em 1001 ) .El .It Sy groupquota@ Ns Ar group Ns = Ns Ar size | Cm none Limits the amount of space consumed by the specified group. Group space consumption is identified by the .Sy userquota@ Ns Ar user property. .Pp Unprivileged users can access only their own groups' space usage. The root user, or a user who has been granted the .Sy groupquota privilege with .Qq Nm Cm allow , can get and set all groups' quotas. .It Sy readonly Ns = Ns Cm on | off Controls whether this dataset can be modified. The default value is .Cm off . .It Sy recordsize Ns = Ns Ar size Specifies a suggested block size for files in the file system. This property is designed solely for use with database workloads that access files in fixed-size records. .Tn ZFS automatically tunes block sizes according to internal algorithms optimized for typical access patterns. .Pp For databases that create very large files but access them in small random chunks, these algorithms may be suboptimal. Specifying a .Sy recordsize greater than or equal to the record size of the database can result in significant performance gains. Use of this property for general purpose file systems is strongly discouraged, and may adversely affect performance. .Pp The size specified must be a power of two greater than or equal to 512 and less than or equal to 128 Kbytes. If the .Sy large_blocks feature is enabled on the pool, the size may be up to 1 Mbyte. See .Xr zpool-features 7 for details on ZFS feature flags. .Pp Changing the file system's .Sy recordsize affects only files created afterward; existing files are unaffected. .Pp This property can also be referred to by its shortened column name, .Sy recsize . .It Sy redundant_metadata Ns = Ns Cm all | most Controls what types of metadata are stored redundantly. ZFS stores an extra copy of metadata, so that if a single block is corrupted, the amount of user data lost is limited. This extra copy is in addition to any redundancy provided at the pool level .Pq e.g. by mirroring or RAID-Z , and is in addition to an extra copy specified by the .Sy copies property .Pq up to a total of 3 copies . For example if the pool is mirrored, .Cm copies Ns = Ns Ar 2 , and .Cm redundant_metadata Ns = Ns Ar most , then ZFS stores 6 copies of most metadata, and 4 copies of data and some metadata. .Pp When set to .Cm all , ZFS stores an extra copy of all metadata. If a single on-disk block is corrupt, at worst a single block of user data .Po which is .Cm recordsize bytes long can be lost. .Pc .Pp When set to .Cm most , ZFS stores an extra copy of most types of metadata. 
This can improve performance of random writes, because less metadata must be written. In practice, at worst about 100 blocks .Po of .Cm recordsize bytes each .Pc of user data can be lost if a single on-disk block is corrupt. The exact behavior of which metadata blocks are stored redundantly may change in future releases. .Pp The default value is .Cm all . .It Sy refquota Ns = Ns Ar size | Cm none Limits the amount of space a dataset can consume. This property enforces a hard limit on the amount of space used. This hard limit does not include space used by descendents, including file systems and snapshots. .It Sy refreservation Ns = Ns Ar size | Cm none | Cm auto The minimum amount of space guaranteed to a dataset, not including its descendents. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by .Sy refreservation . The .Sy refreservation reservation is accounted for in the parent datasets' space used, and counts against the parent datasets' quotas and reservations. .Pp If .Sy refreservation is set, a snapshot is only allowed if there is enough free pool space outside of this reservation to accommodate the current number of "referenced" bytes in the dataset. .Pp If .Sy refreservation is set to .Sy auto , a volume is thick provisioned or not sparse. .Sy refreservation Ns = Cm auto is only supported on volumes. See .Sy volsize in the Native Properties section for more information about sparse volumes. .Pp This property can also be referred to by its shortened column name, .Sy refreserv . .It Sy reservation Ns = Ns Ar size | Cm none The minimum amount of space guaranteed to a dataset and its descendents. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by its reservation. Reservations are accounted for in the parent datasets' space used, and count against the parent datasets' quotas and reservations. .Pp This property can also be referred to by its shortened column name, .Sy reserv . .It Sy secondarycache Ns = Ns Cm all | none | metadata Controls what is cached in the secondary cache (L2ARC). If this property is set to .Cm all , then both user data and metadata is cached. If this property is set to .Cm none , then neither user data nor metadata is cached. If this property is set to .Cm metadata , then only metadata is cached. The default value is .Cm all . .It Sy setuid Ns = Ns Cm on | off Controls whether the .No set- Ns Tn UID bit is respected for the file system. The default value is .Cm on . .It Sy sharesmb Ns = Ns Cm on | off | Ar opts The .Sy sharesmb property currently has no effect on .Fx . .It Sy sharenfs Ns = Ns Cm on | off | Ar opts Controls whether the file system is shared via .Tn NFS , and what options are used. A file system with a .Sy sharenfs property of .Cm off is managed the traditional way via .Xr exports 5 . Otherwise, the file system is automatically shared and unshared with the .Qq Nm Cm share and .Qq Nm Cm unshare commands. If the property is set to .Cm on no .Tn NFS export options are used. Otherwise, .Tn NFS export options are equivalent to the contents of this property. The export options may be comma-separated. See .Xr exports 5 for a list of valid options. .Pp When the .Sy sharenfs property is changed for a dataset, the .Xr mountd 8 daemon is reloaded. .It Sy logbias Ns = Ns Cm latency | throughput Provide a hint to .Tn ZFS about handling of synchronous requests in this dataset. 
If .Sy logbias is set to .Cm latency (the default), .Tn ZFS will use pool log devices (if configured) to handle the requests at low latency. If .Sy logbias is set to .Cm throughput , .Tn ZFS will not use configured pool log devices. .Tn ZFS will instead optimize synchronous operations for global pool throughput and efficient use of resources. .It Sy snapdir Ns = Ns Cm hidden | visible Controls whether the .Pa \&.zfs directory is hidden or visible in the root of the file system as discussed in the .Qq Sx Snapshots section. The default value is .Cm hidden . .It Sy sync Ns = Ns Cm standard | always | disabled Controls the behavior of synchronous requests (e.g. .Xr fsync 2 , O_DSYNC). This property accepts the following values: .Bl -tag -offset 4n -width 8n .It Sy standard This is the POSIX specified behavior of ensuring all synchronous requests are written to stable storage and all devices are flushed to ensure data is not cached by device controllers (this is the default). .It Sy always All file system transactions are written and flushed before their system calls return. This has a large performance penalty. .It Sy disabled Disables synchronous requests. File system transactions are only committed to stable storage periodically. This option will give the highest performance. However, it is very dangerous as .Tn ZFS would be ignoring the synchronous transaction demands of applications such as databases or .Tn NFS . Administrators should only use this option when the risks are understood. .El .It Sy volsize Ns = Ns Ar size For volumes, specifies the logical size of the volume. By default, creating a volume establishes a reservation of equal size. For storage pools with a version number of 9 or higher, a .Sy refreservation is set instead. Any changes to .Sy volsize are reflected in an equivalent change to the reservation (or .Sy refreservation ) . The .Sy volsize can only be set to a multiple of .Cm volblocksize , and cannot be zero. .Pp The reservation is kept equal to the volume's logical size to prevent unexpected behavior for consumers. Without the reservation, the volume could run out of space, resulting in undefined behavior or data corruption, depending on how the volume is used. These effects can also occur when the volume size is changed while it is in use (particularly when shrinking the size). Extreme care should be used when adjusting the volume size. .Pp Though not recommended, a "sparse volume" (also known as "thin provisioned") can be created by specifying the .Fl s option to the .Qq Nm Cm create Fl V command, or by changing the value of the .Sy refreservation property, or .Sy reservation property on a pool .Po version 8 or earlier .Pc after the volume has been created. A "sparse volume" is a volume where the value of .Sy refreservation is less than the size of the volume plus the space required to store its metadata. Consequently, writes to a sparse volume can fail with .Sy ENOSPC when the pool is low on space. For a sparse volume, changes to .Sy volsize are not reflected in the .Sy refreservation . A volume that is not sparse is said to be "thick provisioned". A sparse volume can become thick provisioned by setting .Sy refreservation to .Sy auto . .It Sy volmode Ns = Ns Cm default | geom | dev | none This property specifies how volumes should be exposed to the OS. Setting it to .Sy geom exposes volumes as .Xr geom 4 providers, providing maximal functionality. Setting it to .Sy dev exposes volumes only as a cdev device in devfs.
Such volumes can be accessed only as raw disk device files, i.e. they cannot be partitioned, mounted, or used in RAID configurations, etc., but they are faster, and in some use scenarios with untrusted consumers, such as NAS or VM storage, can be safer. Volumes with the property set to .Sy none are not exposed outside ZFS, but can be snapshotted, cloned, replicated, etc., which can be suitable for backup purposes. The value .Sy default means that volume exposure is controlled by the system-wide sysctl/tunable .Va vfs.zfs.vol.mode , where .Sy geom , .Sy dev and .Sy none are encoded as 1, 2 and 3 respectively. The default value is .Sy geom . This property can be changed at any time, but so far it is processed only during volume creation and pool import. .It Sy vscan Ns = Ns Cm off | on The .Sy vscan property is currently not supported on .Fx . .It Sy xattr Ns = Ns Cm off | on The .Sy xattr property is currently not supported on .Fx . .It Sy jailed Ns = Ns Cm off | on Controls whether the dataset is managed from a jail. See the .Qq Sx Jails section for more information. The default value is .Cm off . .El .Pp The following three properties cannot be changed after the file system is created, and therefore should be set when the file system is created. If the properties are not set with the .Qq Nm Cm create or .Nm zpool Cm create commands, these properties are inherited from the parent dataset. If the parent dataset lacks these properties due to having been created prior to these features being supported, the new file system will have the default values for these properties. .Bl -tag -width 4n .It Sy casesensitivity Ns = Ns Cm sensitive | insensitive | mixed Indicates whether the file name matching algorithm used by the file system should be case-sensitive, case-insensitive, or allow a combination of both styles of matching. The default value for the .Sy casesensitivity property is .Cm sensitive . Traditionally, UNIX and POSIX file systems have case-sensitive file names. .Pp The .Cm mixed value for the .Sy casesensitivity property indicates that the file system can support requests for both case-sensitive and case-insensitive matching behavior. .It Sy normalization Ns = Ns Cm none | formC | formD | formKC | formKD Indicates whether the file system should perform a .Sy unicode normalization of file names whenever two file names are compared, and which normalization algorithm should be used. File names are always stored unmodified; names are normalized as part of any comparison process. If this property is set to a legal value other than .Cm none , and the .Sy utf8only property was left unspecified, the .Sy utf8only property is automatically set to .Cm on . The default value of the .Sy normalization property is .Cm none . This property cannot be changed after the file system is created. .It Sy utf8only Ns = Ns Cm on | off Indicates whether the file system should reject file names that include characters that are not present in the .Sy UTF-8 character code set. If this property is explicitly set to .Cm off , the normalization property must either not be explicitly set or be set to .Cm none . The default value for the .Sy utf8only property is .Cm off . This property cannot be changed after the file system is created. .El .Pp The .Sy casesensitivity , normalization , No and Sy utf8only properties are also new permissions that can be assigned to non-privileged users by using the .Tn ZFS delegated administration feature.
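.Pp
As a brief illustration (the dataset name
.Em tank/share
below is only a placeholder), these creation-time properties would typically be supplied when the file system is created, for example:
.Bd -literal -offset 4n
# zfs create -o casesensitivity=mixed -o normalization=formD -o utf8only=on tank/share
.Ed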
.Ss Temporary Mount Point Properties When a file system is mounted, either through .Xr mount 8 for legacy mounts or the .Qq Nm Cm mount command for normal file systems, its mount options are set according to its properties. The correlation between properties and mount options is as follows: .Bl -column -offset 4n "PROPERTY" "MOUNT OPTION" .It "PROPERTY MOUNT OPTION" .It "atime atime/noatime" .It "exec exec/noexec" .It "readonly ro/rw" .It "setuid suid/nosuid" .El .Pp In addition, these options can be set on a per-mount basis using the .Fl o option, without affecting the property that is stored on disk. The values specified on the command line override the values stored in the dataset. These properties are reported as "temporary" by the .Qq Nm Cm get command. If the properties are changed while the dataset is mounted, the new setting overrides any temporary settings. .Ss User Properties In addition to the standard native properties, .Tn ZFS supports arbitrary user properties. User properties have no effect on .Tn ZFS behavior, but applications or administrators can use them to annotate datasets (file systems, volumes, and snapshots). .Pp User property names must contain a colon .Pq Sy \&: character to distinguish them from native properties. They may contain lowercase letters, numbers, and the following punctuation characters: colon .Pq Sy \&: , dash .Pq Sy \&- , period .Pq Sy \&. and underscore .Pq Sy \&_ . The expected convention is that the property name is divided into two portions such as .Em module Ns Sy \&: Ns Em property , but this namespace is not enforced by .Tn ZFS . User property names can be at most 256 characters, and cannot begin with a dash .Pq Sy \&- . .Pp When making programmatic use of user properties, it is strongly suggested to use a reversed .Tn DNS domain name for the .Ar module component of property names to reduce the chance that two independently-developed packages use the same property name for different purposes. Property names beginning with .Em com.sun are reserved for use by Sun Microsystems. .Pp The values of user properties are arbitrary strings, are always inherited, and are never validated. All of the commands that operate on properties .Po .Qq Nm Cm list , .Qq Nm Cm get , .Qq Nm Cm set and so forth .Pc can be used to manipulate both native properties and user properties. Use the .Qq Nm Cm inherit command to clear a user property. If the property is not defined in any parent dataset, it is removed entirely. Property values are limited to 1024 characters. .Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. .Bl -tag -width 2n .It Xo .Nm .Op Fl \&? .Xc .Pp Displays a help message. .It Xo .Nm .Cm create .Op Fl pu .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem .Xc .Pp Creates a new .Tn ZFS file system. The file system is automatically mounted according to the .Sy mountpoint property inherited from the parent. .Bl -tag -width indent .It Fl p Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. Any property specified on the command line using the .Fl o option is ignored. If the target filesystem already exists, the operation completes successfully. .It Fl u Newly created file system is not mounted. 
.It Fl o Ar property Ns = Ns Ar value Sets the specified property as if the command .Qq Nm Cm set Ar property Ns = Ns Ar value was invoked at the same time the dataset was created. Any editable .Tn ZFS property can also be set at creation time. Multiple .Fl o options can be specified. An error results if the same property is specified in multiple .Fl o options. .El .It Xo .Nm .Cm create .Op Fl ps .Op Fl b Ar blocksize .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Fl V .Ar size volume .Xc .Pp Creates a volume of the given size. The volume is exported as a block device in .Pa /dev/zvol/path , where .Ar path is the name of the volume in the .Tn ZFS namespace. The size represents the logical size as exported by the device. By default, a reservation of equal size is created. .Pp .Ar size is automatically rounded up to the nearest 128 Kbytes to ensure that the volume has an integral number of blocks regardless of .Ar blocksize . .Bl -tag -width indent .It Fl p Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. Any property specified on the command line using the .Fl o option is ignored. If the target filesystem already exists, the operation completes successfully. .It Fl s Creates a sparse volume with no reservation. See .Sy volsize in the .Qq Sx Native Properties section for more information about sparse volumes. .It Fl b Ar blocksize Equivalent to .Fl o Cm volblocksize Ns = Ns Ar blocksize . If this option is specified in conjunction with .Fl o Cm volblocksize , the resulting behavior is undefined. .It Fl o Ar property Ns = Ns Ar value Sets the specified property as if the .Qq Nm Cm set Ar property Ns = Ns Ar value command was invoked at the same time the dataset was created. Any editable .Tn ZFS property can also be set at creation time. Multiple .Fl o options can be specified. An error results if the same property is specified in multiple .Fl o options. .El .It Xo .Nm .Cm destroy .Op Fl fnpRrv .Ar filesystem Ns | Ns Ar volume .Xc .Pp Destroys the given dataset. By default, the command unshares any file systems that are currently shared, unmounts any file systems that are currently mounted, and refuses to destroy a dataset that has active dependents (children or clones). .Bl -tag -width indent .It Fl r Recursively destroy all children. .It Fl R Recursively destroy all dependents, including cloned file systems outside the target hierarchy. .It Fl f Force an unmount of any file systems using the .Qq Nm Cm unmount Fl f command. This option has no effect on non-file systems or unmounted file systems. .It Fl n Do a dry-run ("No-op") deletion. No data will be deleted. This is useful in conjunction with the .Fl v or .Fl p flags to determine what data would be deleted. .It Fl p Print machine-parsable verbose information about the deleted data. .It Fl v Print verbose information about the deleted data. .El .Pp Extreme care should be taken when applying either the .Fl r or the .Fl R options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .It Xo .Nm .Cm destroy .Op Fl dnpRrv .Sm off .Ar snapshot .Op % Ns Ar snapname .Op , Ns ... .Sm on .Xc .Pp The given snapshots are destroyed immediately if and only if the .Qq Nm Cm destroy command without the .Fl d option would have destroyed it. Such immediate destruction would occur, for example, if the snapshot had no clones and the user-initiated reference count were zero. 
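.Pp
As a brief illustration (the snapshot name
.Em tank/home@tuesday
below is only a placeholder), such a snapshot would simply be removed right away:
.Bd -literal -offset 4n
# zfs destroy tank/home@tuesday
.Ed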
.Pp If a snapshot does not qualify for immediate destruction, it is marked for deferred deletion. In this state, it exists as a usable, visible snapshot until both of the preconditions listed above are met, at which point it is destroyed. .Pp An inclusive range of snapshots may be specified by separating the first and last snapshots with a percent sign .Pq Sy % . The first and/or last snapshots may be left blank, in which case the filesystem's oldest or newest snapshot will be implied. .Pp Multiple snapshots (or ranges of snapshots) of the same filesystem or volume may be specified in a comma-separated list of snapshots. Only the snapshot's short name (the part after the .Sy @ ) should be specified when using a range or comma-separated list to identify multiple snapshots. .Bl -tag -width indent .It Fl r Destroy (or mark for deferred deletion) all snapshots with this name in descendent file systems. .It Fl R Recursively destroy all clones of these snapshots, including the clones, snapshots, and children. If this flag is specified, the .Fl d flag will have no effect. .It Fl n Do a dry-run ("No-op") deletion. No data will be deleted. This is useful in conjunction with the .Fl v or .Fl p flags to determine what data would be deleted. .It Fl p Print machine-parsable verbose information about the deleted data. .It Fl v Print verbose information about the deleted data. .It Fl d Defer snapshot deletion. .El .Pp Extreme care should be taken when applying either the .Fl r or the .Fl R options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .It Xo .Nm .Cm destroy .Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark .Xc .Pp The given bookmark is destroyed. .It Xo .Nm .Cm snapshot Ns | Ns Cm snap .Op Fl r .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem@snapname Ns | Ns volume@snapname .Ar filesystem@snapname Ns | Ns volume@snapname Ns ... .Xc .Pp Creates snapshots with the given names. All previous modifications by successful system calls to the file system are part of the snapshots. Snapshots are taken atomically, so that all snapshots correspond to the same moment in time. See the .Qq Sx Snapshots section for details. .Bl -tag -width indent .It Fl r Recursively create snapshots of all descendent datasets .It Fl o Ar property Ns = Ns Ar value Sets the specified property; see .Qq Nm Cm create for details. .El .It Xo .Nm .Cm rollback .Op Fl rRf .Ar snapshot .Xc .Pp Roll back the given dataset to a previous snapshot. When a dataset is rolled back, all data that has changed since the snapshot is discarded, and the dataset reverts to the state at the time of the snapshot. By default, the command refuses to roll back to a snapshot other than the most recent one. In order to do so, all intermediate snapshots and bookmarks must be destroyed by specifying the .Fl r option. .Pp The .Fl rR options do not recursively destroy the child snapshots of a recursive snapshot. Only direct snapshots of the specified filesystem are destroyed by either of these options. To completely roll back a recursive snapshot, you must rollback the individual child snapshots. .Bl -tag -width indent .It Fl r Destroy any snapshots and bookmarks more recent than the one specified. .It Fl R Destroy any more recent snapshots and bookmarks, as well as any clones of those snapshots. .It Fl f Used with the .Fl R option to force an unmount of any clone file systems that are to be destroyed. 
.El .It Xo .Nm .Cm clone .Op Fl p .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar snapshot filesystem Ns | Ns Ar volume .Xc .Pp Creates a clone of the given snapshot. See the .Qq Sx Clones section for details. The target dataset can be located anywhere in the .Tn ZFS hierarchy, and is created as the same type as the original. .Bl -tag -width indent .It Fl p Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. If the target filesystem or volume already exists, the operation completes successfully. .It Fl o Ar property Ns = Ns Ar value Sets the specified property; see .Qq Nm Cm create for details. .El .It Xo .Nm .Cm promote .Ar clone-filesystem .Xc .Pp Promotes a clone file system to no longer be dependent on its "origin" snapshot. This makes it possible to destroy the file system that the clone was created from. The clone parent-child dependency relationship is reversed, so that the origin file system becomes a clone of the specified file system. .Pp The snapshot that was cloned, and any snapshots previous to this snapshot, are now owned by the promoted clone. The space they use moves from the origin file system to the promoted clone, so enough space must be available to accommodate these snapshots. No new space is consumed by this operation, but the space accounting is adjusted. The promoted clone must not have any conflicting snapshot names of its own. The .Cm rename subcommand can be used to rename any conflicting snapshots. .It Xo .Nm .Cm rename .Op Fl f .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo .Nm .Cm rename .Op Fl f .Fl p .Ar filesystem Ns | Ns Ar volume .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm .Cm rename .Fl u .Op Fl p .Ar filesystem filesystem .Xc .Pp Renames the given dataset. The new target can be located anywhere in the .Tn ZFS hierarchy, with the exception of snapshots. Snapshots can only be renamed within the parent file system or volume. When renaming a snapshot, the parent file system of the snapshot does not need to be specified as part of the second argument. Renamed file systems can inherit new mount points, in which case they are unmounted and remounted at the new mount point. .Bl -tag -width indent .It Fl p Creates all the nonexistent parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. .It Fl u Do not remount file systems during rename. If a file system's .Sy mountpoint property is set to .Cm legacy or .Cm none , the file system is not unmounted even if this option is not given. .It Fl f Force unmount any filesystems that need to be unmounted in the process. This flag has no effect if used together with the .Fl u flag. .El .It Xo .Nm .Cm rename .Fl r .Ar snapshot snapshot .Xc .Pp Recursively rename the snapshots of all descendent datasets. Snapshots are the only type of dataset that can be renamed recursively. .It Xo .Nm .Cm list .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... .Oo Fl s Ar property Oc Ns ... .Oo Fl S Ar property Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... .Xc .Pp Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname.
By default, all file systems and volumes are displayed. Snapshots are displayed if the .Sy listsnaps property is .Cm on (the default is .Cm off ) . The following fields are displayed, .Sy name , used , available , referenced , mountpoint . .Bl -tag -width indent .It Fl r Recursively display any children of the dataset on the command line. .It Fl d Ar depth Recursively display any children of the dataset, limiting the recursion to .Ar depth . A depth of .Sy 1 will display only the dataset and its direct children. .It Fl H Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary white space. .It Fl p Display numbers in parsable (exact) values. .It Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... A comma-separated list of properties to display. The property must be: .Bl -bullet -offset 2n .It One of the properties described in the .Qq Sx Native Properties section .It A user property .It The value .Cm name to display the dataset name .It The value .Cm space to display space usage properties on file systems and volumes. This is a shortcut for specifying .Fl o .Sy name,avail,used,usedsnap,usedds,usedrefreserv,usedchild .Fl t .Sy filesystem,volume syntax. .El .It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... A comma-separated list of types to display, where .Ar type is one of .Sy filesystem , snapshot , snap , volume , bookmark , No or Sy all . For example, specifying .Fl t Cm snapshot displays only snapshots. .It Fl s Ar property A property for sorting the output by column in ascending order based on the value of the property. The property must be one of the properties described in the .Qq Sx Properties section, or the special value .Cm name to sort by the dataset name. Multiple properties can be specified at one time using multiple .Fl s property options. Multiple .Fl s options are evaluated from left to right in decreasing order of importance. .Pp The following is a list of sorting criteria: .Bl -bullet -offset 2n .It Numeric types sort in numeric order. .It String types sort in alphabetical order. .It Types inappropriate for a row sort that row to the literal bottom, regardless of the specified ordering. .It If no sorting options are specified the existing behavior of .Qq Nm Cm list is preserved. .El .It Fl S Ar property Same as the .Fl s option, but sorts by property in descending order. .El .It Xo .Nm .Cm set .Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .Pp Sets the property or list of properties to the given value(s) for each dataset. Only some properties can be edited. See the "Properties" section for more information on what properties can be set and acceptable values. Numeric values can be specified as exact values, or in a human-readable form with a suffix of .Sy B , K , M , G , T , P , E , Z (for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, or zettabytes, respectively). User properties can be set on snapshots. For more information, see the .Qq Sx User Properties section. .It Xo .Nm .Cm get .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl o Ar all | field Ns Oo , Ns Ar field Oc Ns ... .Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... .Op Fl s Ar source Ns Oo , Ns Ar source Oc Ns ... .Ar all | property Ns Oo , Ns Ar property Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Ns ... .Xc .Pp Displays properties for the given datasets. 
If no datasets are specified, then the command displays properties for all datasets on the system. For each property, the following columns are displayed: .Pp .Bl -hang -width "property" -offset indent -compact .It name Dataset name .It property Property name .It value Property value .It source Property source. Can be local, default, temporary, inherited, received, or none (\&-). .El .Pp All columns except the .Sy RECEIVED column are displayed by default. The columns to display can be specified by using the .Fl o option. This command takes a comma-separated list of properties as described in the .Qq Sx Native Properties and .Qq Sx User Properties sections. .Pp The special value .Cm all can be used to display all properties that apply to the given dataset's type (filesystem, volume, snapshot, or bookmark). .Bl -tag -width indent .It Fl r Recursively display properties for any children. .It Fl d Ar depth Recursively display any children of the dataset, limiting the recursion to .Ar depth . A depth of .Sy 1 will display only the dataset and its direct children. .It Fl H Display output in a form more easily parsed by scripts. Any headers are omitted, and fields are explicitly separated by a single tab instead of an arbitrary amount of space. .It Fl p Display numbers in parsable (exact) values. .It Fl o Cm all | Ar field Ns Oo , Ns Ar field Oc Ns ... A comma-separated list of columns to display. Supported values are .Sy name,property,value,received,source . Default values are .Sy name,property,value,source . The keyword .Cm all specifies all columns. .It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... A comma-separated list of types to display, where .Ar type is one of .Sy filesystem , snapshot , volume , No or Sy all . For example, specifying .Fl t Cm snapshot displays only snapshots. .It Fl s Ar source Ns Oo , Ns Ar source Oc Ns ... A comma-separated list of sources to display. Those properties coming from a source other than those in this list are ignored. Each source must be one of the following: .Sy local,default,inherited,temporary,received,none . The default value is all sources. .El .It Xo .Nm .Cm inherit .Op Fl rS .Ar property .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... .Xc .Pp Clears the specified property, causing it to be inherited from an ancestor, restored to default if no ancestor has the property set, or with the .Fl S option reverted to the received value if one exists. See the .Qq Sx Properties section for a listing of default values, and details on which properties can be inherited. .Bl -tag -width indent .It Fl r Recursively inherit the given property for all children. .It Fl S Revert the property to the received value if one exists; otherwise operate as if the .Fl S option was not specified. .El .It Xo .Nm .Cm remap .Ar filesystem Ns | Ns Ar volume .Xc .Pp Remap the indirect blocks in the given filesystem or volume so that they no longer reference blocks on previously removed vdevs, allowing the size of the indirect mapping objects for those vdevs to eventually shrink. Note that remapping all blocks might not be possible and that references from snapshots will still exist and cannot be remapped. .It Xo .Nm .Cm upgrade .Op Fl v .Xc .Pp Displays a list of file systems that are not the most recent version. .Bl -tag -width indent .It Fl v Displays .Tn ZFS filesystem versions supported by the current software.
The current .Tn ZFS filesystem version and all previous supported versions are displayed, along with an explanation of the features provided with each version. .El .It Xo .Nm .Cm upgrade .Op Fl r .Op Fl V Ar version .Fl a | Ar filesystem .Xc .Pp Upgrades file systems to a new on-disk version. Once this is done, the file systems will no longer be accessible on systems running older versions of the software. .Qq Nm Cm send streams generated from new snapshots of these file systems cannot be accessed on systems running older versions of the software. .Pp In general, the file system version is independent of the pool version. See .Xr zpool 8 for information on the .Nm zpool Cm upgrade command. .Pp In some cases, the file system version and the pool version are interrelated and the pool version must be upgraded before the file system version can be upgraded. .Bl -tag -width indent .It Fl r Upgrade the specified file system and all descendent file systems. .It Fl V Ar version Upgrade to the specified .Ar version . If the .Fl V flag is not specified, this command upgrades to the most recent version. This option can only be used to increase the version number, and only up to the most recent version supported by this software. .It Fl a Upgrade all file systems on all imported pools. .It Ar filesystem Upgrade the specified file system. .El .It Xo .Nm .Cm userspace .Op Fl Hinp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... .Oo Fl s Ar field Oc Ns ... .Oo Fl S Ar field Oc Ns ... .Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... .Ar filesystem Ns | Ns Ar snapshot .Xc .Pp Displays space consumed by, and quotas on, each user in the specified filesystem or snapshot. This corresponds to the .Sy userused@ Ns Ar user and .Sy userquota@ Ns Ar user properties. .Bl -tag -width indent .It Fl n Print numeric ID instead of user/group name. .It Fl H Do not print headers, use tab-delimited output. .It Fl p Use exact (parsable) numeric output. .It Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Display only the specified fields from the following set: .Sy type,name,used,quota . The default is to display all fields. .It Fl s Ar field Sort output by this field. The .Fl s and .Fl S flags may be specified multiple times to sort first by one field, then by another. The default is .Fl s Cm type Fl s Cm name . .It Fl S Ar field Sort by this field in reverse order. See .Fl s . .It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Print only the specified types from the following set: .Sy all,posixuser,smbuser,posixgroup,smbgroup . .Pp The default is .Fl t Cm posixuser,smbuser . .Pp The default can be changed to include group types. .It Fl i Translate SID to POSIX ID. This flag currently has no effect on .Fx . .El .It Xo .Nm .Cm groupspace .Op Fl Hinp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... .Oo Fl s Ar field Oc Ns ... .Oo Fl S Ar field Oc Ns ... .Op Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... .Ar filesystem Ns | Ns Ar snapshot .Xc .Pp Displays space consumed by, and quotas on, each group in the specified filesystem or snapshot. This subcommand is identical to .Qq Nm Cm userspace , except that the default types to display are .Fl t Sy posixgroup,smbgroup . .It Xo .Nm .Cm mount .Xc .Pp Displays all .Tn ZFS file systems currently mounted. .Bl -tag -width indent .It Fl f .El .It Xo .Nm .Cm mount .Op Fl vO .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... .Fl a | Ar filesystem .Xc .Pp Mounts .Tn ZFS file systems. .Bl -tag -width indent .It Fl v Report mount progress. .It Fl O Perform an overlay mount. 
Overlay mounts are not supported on .Fx . .It Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... An optional, comma-separated list of mount options to use temporarily for the duration of the mount. See the .Qq Sx Temporary Mount Point Properties section for details. .It Fl a Mount all available .Tn ZFS file systems. This command may be executed on .Fx system startup by .Pa /etc/rc.d/zfs . For more information, see variable .Va zfs_enable in .Xr rc.conf 5 . .It Ar filesystem Mount the specified filesystem. .El .It Xo .Nm .Cm unmount Ns | Ns Cm umount .Op Fl f .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Xc .Pp Unmounts currently mounted .Tn ZFS file systems. .Bl -tag -width indent .It Fl f Forcefully unmount the file system, even if it is currently in use. .It Fl a Unmount all available .Tn ZFS file systems. .It Ar filesystem | mountpoint Unmount the specified filesystem. The command can also be given a path to a .Tn ZFS file system mount point on the system. .El .It Xo .Nm .Cm share .Fl a | Ar filesystem .Xc .Pp Shares .Tn ZFS file systems that have the .Sy sharenfs property set. .Bl -tag -width indent .It Fl a Share all .Tn ZFS file systems that have the .Sy sharenfs property set. This command may be executed on .Fx system startup by .Pa /etc/rc.d/zfs . For more information, see variable .Va zfs_enable in .Xr rc.conf 5 . .It Ar filesystem Share the specified filesystem according to the .Tn sharenfs property. File systems are shared when the .Tn sharenfs property is set. .El .It Xo .Nm .Cm unshare .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Xc .Pp Unshares .Tn ZFS file systems that have the .Tn sharenfs property set. .Bl -tag -width indent .It Fl a Unshares .Tn ZFS file systems that have the .Sy sharenfs property set. This command may be executed on .Fx system shutdown by .Pa /etc/rc.d/zfs . For more information, see variable .Va zfs_enable in .Xr rc.conf 5 . .It Ar filesystem | mountpoint Unshare the specified filesystem. The command can also be given a path to a .Tn ZFS file system shared on the system. .El .It Xo .Nm .Cm bookmark .Ar snapshot .Ar bookmark .Xc .Pp Creates a bookmark of the given snapshot. Bookmarks mark the point in time when the snapshot was created, and can be used as the incremental source for a .Qq Nm Cm send command. .Pp This feature must be enabled to be used. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy bookmark feature. .It Xo .Nm .Cm send .Op Fl DLPRVcenpv .Op Fl i Ar snapshot | Fl I Ar snapshot .Ar snapshot .Xc .Pp Creates a stream representation of the last .Ar snapshot argument (not part of .Fl i or .Fl I ) which is written to standard output. The output can be redirected to a file or to a different system (for example, using .Xr ssh 1 ) . By default, a full stream is generated. .Bl -tag -width indent .It Fl i Ar snapshot Generate an incremental stream from the first .Ar snapshot Pq the incremental source to the second .Ar snapshot Pq the incremental target . The incremental source can be specified as the last component of the snapshot name .Pq the Em @ No character and following and it is assumed to be from the same file system as the incremental target. .Pp If the destination is a clone, the source may be the origin snapshot, which must be fully specified (for example, .Cm pool/fs@origin , not just .Cm @origin ) . .It Fl I Ar snapshot Generate a stream package that sends all intermediary snapshots from the first .Ar snapshot to the second .Ar snapshot . 
For example, .Ic -I @a fs@d is similar to .Ic -i @a fs@b; -i @b fs@c; -i @c fs@d . The incremental source may be specified as with the .Fl i option. .It Fl R, -replicate Generate a replication stream package, which will replicate the specified filesystem, and all descendent file systems, up to the named snapshot. When received, all properties, snapshots, descendent file systems, and clones are preserved. .Pp If the .Fl i or .Fl I flags are used in conjunction with the .Fl R flag, an incremental replication stream is generated. The current values of properties, and current snapshot and file system names are set when the stream is received. If the .Fl F flag is specified when this stream is received, snapshots and file systems that do not exist on the sending side are destroyed. .It Fl D, -dedup Generate a deduplicated stream. Blocks which would have been sent multiple times in the send stream will only be sent once. The receiving system must also support this feature to receive a deduplicated stream. This flag can be used regardless of the dataset's .Sy dedup property, but performance will be much better if the filesystem uses a dedup-capable checksum (eg. .Sy sha256 ) . .It Fl L, -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128KB. The receiving system must have the .Sy large_blocks pool feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl e, -embed Generate a more compact stream by using WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy embedded_data feature. .It Fl c, -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory (see the .Sy compression property for details). If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c then the data will be decompressed before sending so it can be split into smaller block sizes. .It Fl p, -props Include the dataset's properties in the stream. This flag is implicit when .Fl R is specified. The receiving system must also support this feature. .It Fl n, -dryrun Do a dry-run ("No-op") send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output (contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error). .It Fl P, -parsable Print machine-parsable verbose information about the stream package generated. .It Fl v, -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. 
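.Pp As an illustration (the dataset and snapshot names here are hypothetical), combining .Fl n with .Fl v reports what an incremental send would transfer without generating any stream data: .Bd -literal -offset 2n .Li # Ic zfs send -nv -i pool/home/bob@monday pool/home/bob@tuesday .Ed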
.It Fl V Set the process title to a per-second report of how much data has been sent. .El .Pp The format of the stream is committed. You will be able to receive your streams on future versions of .Tn ZFS . .It Xo .Nm .Cm send -.Op Fl Lce +.Op Fl LPcenv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .Pp Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. If the destination is a filesystem or volume, the pool must be read-only, or the filesystem must not be mounted. When the stream generated from a filesystem or volume is received, the default snapshot name will be .Pq --head-- . .Bl -tag -width indent -.It Fl i Ar snapshot Ns | Ns bookmark +.It Fl i Ar snapshot Ns | Ns Ar bookmark Generate an incremental send stream. The incremental source must be an earlier snapshot in the destination's history. It will commonly be an earlier snapshot in the destination's filesystem, in which case it can be specified as the last component of the name .Pq the Em # No or Em @ No character and following . .Pp If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc. +.It Fl n, -dryrun +Do a dry-run +.Pq Qq No-op +send. +Do not generate any actual send data. +This is useful in conjunction with the +.Fl v +or +.Fl P +flags to determine what data will be sent. +In this case, the verbose output will be written to standard output +.Po contrast with a non-dry-run, where the stream is written to standard output +and the verbose output goes to standard error +.Pc . +.It Fl v, -verbose +Print verbose information about the stream package generated. +This information includes a per-second report of how much data has been sent. .It Fl L, -large-block Generate a stream which may contain blocks larger than 128KB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128KB. The receiving system must have the .Sy large_blocks pool feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy large_blocks feature. +.It Fl P, -parsable +Print machine-parsable verbose information about the stream package generated. .It Fl c, -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory (see the .Sy compression property for details). If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c then the data will be decompressed before sending so it can be split into smaller block sizes. .It Fl e, -embed Generate a more compact stream by using WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress feature is active on the sending system, then the receiving system must have that feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy embedded_data feature. .El .It Xo .Nm .Cm send .Op Fl Penv .Fl t .Ar receive_resume_token .Xc Creates a send stream which resumes an interrupted receive. 
The .Ar receive_resume_token is the value of this property on the filesystem or volume that was being received into. See the documentation for .Sy zfs receive -s for more details. .It Xo .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnsFu .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo .Nm .Cm receive Ns | Ns Cm recv .Op Fl vnsFu .Op Fl d | e .Op Fl o Sy origin Ns = Ns Ar snapshot .Ar filesystem .Xc .Pp Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. Streams are created using the .Qq Nm Cm send subcommand, which by default creates a full stream. .Qq Nm Cm recv can be used as an alias for .Qq Nm Cm receive . .Pp If an incremental stream is received, then the destination file system must already exist, and its most recent snapshot must match the incremental stream's source. For .Sy zvol Ns s, the destination device link is destroyed and recreated, which means the .Sy zvol cannot be accessed during the .Sy receive operation. .Pp When a snapshot replication package stream that is generated by using the .Qq Nm Cm send Fl R command is received, any snapshots that do not exist on the sending location are destroyed by using the .Qq Nm Cm destroy Fl d command. .Pp The name of the snapshot (and file system, if a full stream is received) that this subcommand creates depends on the argument type and the .Fl d or .Fl e option. .Pp If the argument is a snapshot name, the specified .Ar snapshot is created. If the argument is a file system or volume name, a snapshot with the same name as the sent snapshot is created within the specified .Ar filesystem or .Ar volume . If the .Fl d or .Fl e option is specified, the snapshot name is determined by appending the sent snapshot's name to the specified .Ar filesystem . If the .Fl d option is specified, all but the pool name of the sent snapshot path is appended (for example, .Sy b/c@1 appended from sent snapshot .Sy a/b/c@1 ) , and if the .Fl e option is specified, only the tail of the sent snapshot path is appended (for example, .Sy c@1 appended from sent snapshot .Sy a/b/c@1 ) . In the case of .Fl d , any file systems needed to replicate the path of the sent snapshot are created within the specified file system. .Bl -tag -width indent .It Fl d Use the full sent snapshot path without the first element (without pool name) to determine the name of the new snapshot as described in the paragraph above. .It Fl e Use only the last element of the sent snapshot path to determine the name of the new snapshot as described in the paragraph above. .It Fl u File system that is associated with the received stream is not mounted. .It Fl v Print verbose information about the stream and the time required to perform the receive operation. .It Fl n Do not actually receive the stream. This can be useful in conjunction with the .Fl v option to verify the name the receive operation would use. .It Fl o Sy origin Ns = Ns Ar snapshot Forces the stream to be received as a clone of the given snapshot. If the stream is a full send stream, this will create the filesystem described by the stream as a clone of the specified snapshot. Which snapshot was specified will not affect the success or failure of the receive, as long as the snapshot does exist. If the stream is an incremental send stream, all the normal verification will be performed. 
.It Fl F Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by .Qq Nm Cm send Fl R Bro Fl i | Fl I Brc ) , destroy snapshots and file systems that do not exist on the sending side. .It Fl s If the receive is interrupted, save the partially received state, rather than deleting it. Interruption may be due to premature termination of the stream .Po e.g. due to network failure or failure of the remote system if the stream is being read over a network connection .Pc , a checksum error in the stream, termination of the .Nm zfs Cm receive process, or unclean shutdown of the system. .Pp The receive can be resumed with a stream generated by .Nm zfs Cm send Fl t Ar token , where the .Ar token is the value of the .Sy receive_resume_token property of the filesystem or volume which is received into. .Pp To use this flag, the storage pool must have the .Sy extensible_dataset feature enabled. See .Xr zpool-features 7 for details on ZFS feature flags. .El .It Xo .Nm .Cm receive Ns | Ns Cm recv .Fl A .Ar filesystem Ns | Ns Ar volume .Xc Abort an interrupted .Nm zfs Cm receive Fl s , deleting its saved partially received state. .It Xo .Nm .Cm allow .Ar filesystem Ns | Ns Ar volume .Xc .Pp Displays permissions that have been delegated on the specified filesystem or volume. See the other forms of .Qq Nm Cm allow for more information. .It Xo .Nm .Cm allow .Op Fl ldug .Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... .Ar perm Ns | Ns Ar @setname Ns .Oo Ns , Ns Ar perm Ns | Ns Ar @setname Oc Ns ... .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm .Cm allow .Op Fl ld .Fl e Ns | Ns Cm everyone .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Xc .Pp Delegates .Tn ZFS administration permission for the file systems to non-privileged users. .Bl -tag -width indent .It Xo .Op Fl ug .Ar user Ns | Ns Ar group Ns Oo , Ar user Ns | Ns Ar group Oc Ns ... .Xc Specifies to whom the permissions are delegated. Multiple entities can be specified as a comma-separated list. If neither of the .Fl ug options are specified, then the argument is interpreted preferentially as the keyword .Cm everyone , then as a user name, and lastly as a group name. To specify a user or group named .Qq everyone , use the .Fl u or .Fl g options. To specify a group with the same name as a user, use the .Fl g option. .It Op Fl e Ns | Ns Cm everyone Specifies that the permissions be delegated to .Qq everyone . .It Xo .Ar perm Ns | Ns Ar @setname Ns Oo , Ns Ar perm Ns | Ns Ar @setname Oc Ns ... .Xc The permissions to delegate. Multiple permissions may be specified as a comma-separated list. Permission names are the same as .Tn ZFS subcommand and property names. See the property list below. Property set names, which begin with an at sign .Pq Sy @ , may be specified. See the .Fl s form below for details. .It Xo .Op Fl ld .Ar filesystem Ns | Ns Ar volume .Xc Specifies where the permissions are delegated. If neither of the .Fl ld options are specified, or both are, then the permissions are allowed for the file system or volume, and all of its descendents. If only the .Fl l option is used, then the permissions are allowed "locally" only for the specified file system. If only the .Fl d option is used, then the permissions are allowed only for the descendent file systems. .El .Pp Permissions are generally the ability to use a .Tn ZFS subcommand or change a .Tn ZFS property.
The following permissions are available: .Bl -column -offset 4n "secondarycache" "subcommand" .It NAME Ta TYPE Ta NOTES .It allow Ta subcommand Ta Must Xo also have the permission that is being allowed .Xc .It clone Ta subcommand Ta Must Xo also have the 'create' ability and 'mount' ability in the origin file system .Xc .It create Ta subcommand Ta Must also have the 'mount' ability .It destroy Ta subcommand Ta Must also have the 'mount' ability .It diff Ta subcommand Ta Allows lookup of paths within a dataset given an object number, and the ability to create snapshots necessary to 'zfs diff' .It hold Ta subcommand Ta Allows adding a user hold to a snapshot .It mount Ta subcommand Ta Allows mount/umount of Tn ZFS No datasets .It promote Ta subcommand Ta Must Xo also have the 'mount' and 'promote' ability in the origin file system .Xc .It receive Ta subcommand Ta Must also have the 'mount' and 'create' ability .It release Ta subcommand Ta Allows Xo releasing a user hold which might destroy the snapshot .Xc .It rename Ta subcommand Ta Must Xo also have the 'mount' and 'create' ability in the new parent .Xc .It rollback Ta subcommand Ta Must also have the 'mount' ability .It send Ta subcommand .It share Ta subcommand Ta Allows Xo sharing file systems over the .Tn NFS protocol .Xc .It snapshot Ta subcommand Ta Must also have the 'mount' ability .It groupquota Ta other Ta Allows accessing any groupquota@... property .It groupused Ta other Ta Allows reading any groupused@... property .It userprop Ta other Ta Allows changing any user property .It userquota Ta other Ta Allows accessing any userquota@... property .It userused Ta other Ta Allows reading any userused@... property .It aclinherit Ta property .It aclmode Ta property .It atime Ta property .It canmount Ta property .It casesensitivity Ta property .It checksum Ta property .It compression Ta property .It copies Ta property .It dedup Ta property .It devices Ta property .It exec Ta property .It filesystem_limit Ta property .It logbias Ta property .It jailed Ta property .It mlslabel Ta property .It mountpoint Ta property .It nbmand Ta property .It normalization Ta property .It primarycache Ta property .It quota Ta property .It readonly Ta property .It recordsize Ta property .It refquota Ta property .It refreservation Ta property .It reservation Ta property .It secondarycache Ta property .It setuid Ta property .It sharenfs Ta property .It sharesmb Ta property .It snapdir Ta property .It snapshot_limit Ta property .It sync Ta property .It utf8only Ta property .It version Ta property .It volblocksize Ta property .It volsize Ta property .It vscan Ta property .It xattr Ta property .El .It Xo .Nm .Cm allow .Fl c .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Xc .Pp Sets "create time" permissions. These permissions are granted (locally) to the creator of any newly-created descendent file system. .It Xo .Nm .Cm allow .Fl s .Ar @setname .Ar perm Ns | Ns Ar @setname Ns Op Ns , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... .Ar filesystem Ns | Ns Ar volume .Xc .Pp Defines or adds permissions to a permission set. The set can be used by other .Qq Nm Cm allow commands for the specified file system and its descendents. Sets are evaluated dynamically, so changes to a set are immediately reflected. Permission sets follow the same naming restrictions as ZFS file systems, but the name must begin with an "at sign" .Pq Sy @ , and can be no more than 64 characters long. 
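.Pp For example, the following commands (the set and dataset names are purely illustrative) define a permission set and delegate it to a group: .Bd -literal -offset 2n .Li # Ic zfs allow -s @backup send,snapshot,hold tank/users .Li # Ic zfs allow -g staff @backup tank/users .Ed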
.It Xo .Nm .Cm unallow .Op Fl rldug .Ar user Ns | Ns Ar group Ns Oo Ns , Ns Ar user Ns | Ns Ar group Oc Ns ... .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm .Cm unallow .Op Fl rld .Fl e Ns | Ns Cm everyone .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm .Cm unallow .Op Fl r .Fl c .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Xc .Pp Removes permissions that were granted with the .Qq Nm Cm allow command. No permissions are explicitly denied, so other permissions granted are still in effect (for example, if the permission is also granted by an ancestor). If no permissions are specified, then all permissions for the specified .Ar user , group , No or everyone are removed. Specifying .Cm everyone .Po or using the Fl e option .Pc only removes the permissions that were granted to everyone , not all permissions for every user and group. See the .Qq Nm Cm allow command for a description of the .Fl ldugec options. .Bl -tag -width indent .It Fl r Recursively remove the permissions from this file system and all descendents. .El .It Xo .Nm .Cm unallow .Op Fl r .Fl s .Ar @setname .Oo Ar perm Ns | Ns Ar @setname Ns Op , Ns Ar perm Ns | Ns Ar @setname Ns .Ns ... Oc .Ar filesystem Ns | Ns Ar volume .Xc .Pp Removes permissions from a permission set. If no permissions are specified, then all permissions are removed, thus removing the set entirely. .It Xo .Nm .Cm hold .Op Fl r .Ar tag snapshot Ns ... .Xc .Pp Adds a single reference, named with the .Ar tag argument, to the specified snapshot or snapshots. Each snapshot has its own tag namespace, and tags must be unique within that space. .Pp If a hold exists on a snapshot, attempts to destroy that snapshot by using the .Qq Nm Cm destroy command return .Em EBUSY . .Bl -tag -width indent .It Fl r Specifies that a hold with the given tag is applied recursively to the snapshots of all descendent file systems. .El .It Xo .Nm .Cm holds .Op Fl Hp .Op Fl r Ns | Ns Fl d Ar depth .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns .Ns ... .Xc .Pp Lists all existing user references for the given dataset or datasets. .Bl -tag -width indent .It Fl H Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary white space. .It Fl p Display numbers in parsable (exact) values. .It Fl r Lists the holds that are set on the descendent snapshots of the named datasets or snapshots, in addition to listing the holds on the named snapshots, if any. .It Fl d Ar depth Recursively display any holds on the named snapshots, or descendent snapshots of the named datasets or snapshots, limiting the recursion to .Ar depth . .El .It Xo .Nm .Cm release .Op Fl r .Ar tag snapshot Ns ... .Xc .Pp Removes a single reference, named with the .Ar tag argument, from the specified snapshot or snapshots. The tag must already exist for each snapshot. .Bl -tag -width indent .It Fl r Recursively releases a hold with the given tag on the snapshots of all descendent file systems. .El .It Xo .Nm .Cm diff .Op Fl FHt .Ar snapshot .Op Ar snapshot Ns | Ns Ar filesystem .Xc .Pp Display the difference between a snapshot of a given filesystem and another snapshot of that filesystem from a later time or the current contents of the filesystem.
The first column is a character indicating the type of change, the other columns indicate pathname, new pathname .Pq in case of rename , change in link count, and optionally file type and/or change time. .Pp The types of change are: .Bl -column -offset 2n indent .It \&- Ta path was removed .It \&+ Ta path was added .It \&M Ta path was modified .It \&R Ta path was renamed .El .Bl -tag -width indent .It Fl F Display an indication of the type of file, in a manner similar to the .Fl F option of .Xr ls 1 . .Bl -column -offset 2n indent .It \&B Ta block device .It \&C Ta character device .It \&F Ta regular file .It \&/ Ta directory .It \&@ Ta symbolic link .It \&= Ta socket .It \&> Ta door (not supported on Fx ) .It \&| Ta named pipe (not supported on Fx ) .It \&P Ta event port (not supported on Fx ) .El .It Fl H Give more parsable tab-separated output, without header lines and without arrows. .It Fl t Display the path's inode change time as the first column of output. .El .It Xo .Nm .Cm program .Op Fl jn .Op Fl t Ar timeout .Op Fl m Ar memory_limit .Ar pool script .Op Ar arg1 No ... .Xc .Pp Executes .Ar script as a ZFS channel program on .Ar pool . The ZFS channel program interface allows ZFS administrative operations to be run programmatically via a Lua script. The entire script is executed atomically, with no other administrative operations taking effect concurrently. A library of ZFS calls is made available to channel program scripts. Channel programs may only be run with root privileges. .Pp For full documentation of the ZFS channel program interface, see the manual page for .Xr zfs-program 8 . .Bl -tag -width indent .It Fl j Display channel program output in JSON format. When this flag is specified and standard output is empty, the channel program has encountered an error. The details of such an error will be printed to standard error in plain text. .It Fl n Executes a read-only channel program, which runs faster. The program cannot change on-disk state by calling functions from the zfs.sync submodule. The program can be used to gather information such as properties and to determine whether changes would succeed (zfs.check.*). Without this flag, all pending changes must be synced to disk before a channel program can complete. .It Fl t Ar timeout Execution time limit, in milliseconds. If a channel program executes for longer than the provided timeout, it will be stopped and an error will be returned. The default timeout is 1000 ms, and can be set to a maximum of 10000 ms. .It Fl m Ar memory_limit Memory limit, in bytes. If a channel program attempts to allocate more memory than the given limit, it will be stopped and an error returned. The default memory limit is 10 MB, and can be set to a maximum of 100 MB. .Pp All remaining argument strings are passed directly to the channel program as arguments. See .Xr zfs-program 8 for more information. .El .It Xo .Nm .Cm jail .Ar jailid filesystem .Xc .Pp Attaches the specified .Ar filesystem to the jail identified by JID .Ar jailid . From now on, this file system tree can be managed from within a jail if the .Sy jailed property has been set. To use this functionality, the jail needs the .Va allow.mount and .Va allow.mount.zfs parameters set to 1 and the .Va enforce_statfs parameter set to a value lower than 2. .Pp See .Xr jail 8 for more information on managing jails and configuring the parameters above. .It Xo .Nm .Cm unjail .Ar jailid filesystem .Xc .Pp Detaches the specified .Ar filesystem from the jail identified by JID .Ar jailid .
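.Pp For example, the following sequence (the dataset name and the jail ID of 1 are only illustrative) marks a dataset as manageable from within a jail, attaches it to that jail, and later detaches it again: .Bd -literal -offset 2n .Li # Ic zfs set jailed=on tank/users .Li # Ic zfs jail 1 tank/users .Li # Ic zfs unjail 1 tank/users .Ed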
.El .Sh EXIT STATUS The following exit values are returned: .Bl -tag -offset 2n -width 2n .It 0 Successful completion. .It 1 An error occurred. .It 2 Invalid command line options were specified. .El .Sh EXAMPLES .Bl -tag -width 0n .It Sy Example 1 No Creating a Tn ZFS No File System Hierarchy .Pp The following commands create a file system named .Em pool/home and a file system named .Em pool/home/bob . The mount point .Pa /home is set for the parent file system, and is automatically inherited by the child file system. .Bd -literal -offset 2n .Li # Ic zfs create pool/home .Li # Ic zfs set mountpoint=/home pool/home .Li # Ic zfs create pool/home/bob .Ed .It Sy Example 2 No Creating a Tn ZFS No Snapshot .Pp The following command creates a snapshot named .Sy yesterday . This snapshot is mounted on demand in the .Pa \&.zfs/snapshot directory at the root of the .Em pool/home/bob file system. .Bd -literal -offset 2n .Li # Ic zfs snapshot pool/home/bob@yesterday .Ed .It Sy Example 3 No Creating and Destroying Multiple Snapshots .Pp The following command creates snapshots named .Em yesterday of .Em pool/home and all of its descendent file systems. Each snapshot is mounted on demand in the .Pa \&.zfs/snapshot directory at the root of its file system. The second command destroys the newly created snapshots. .Bd -literal -offset 2n .Li # Ic zfs snapshot -r pool/home@yesterday .Li # Ic zfs destroy -r pool/home@yesterday .Ed .It Sy Example 4 No Disabling and Enabling File System Compression .Pp The following command disables the .Sy compression property for all file systems under .Em pool/home . The next command explicitly enables .Sy compression for .Em pool/home/anne . .Bd -literal -offset 2n .Li # Ic zfs set compression=off pool/home .Li # Ic zfs set compression=on pool/home/anne .Ed .It Sy Example 5 No Listing Tn ZFS No Datasets .Pp The following command lists all active file systems and volumes in the system. Snapshots are displayed if the .Sy listsnaps property is .Cm on . The default is .Cm off . See .Xr zpool 8 for more information on pool properties. .Bd -literal -offset 2n .Li # Ic zfs list NAME USED AVAIL REFER MOUNTPOINT pool 450K 457G 18K /pool pool/home 315K 457G 21K /home pool/home/anne 18K 457G 18K /home/anne pool/home/bob 276K 457G 276K /home/bob .Ed .It Sy Example 6 No Setting a Quota on a Tn ZFS No File System .Pp The following command sets a quota of 50 Gbytes for .Em pool/home/bob . .Bd -literal -offset 2n .Li # Ic zfs set quota=50G pool/home/bob .Ed .It Sy Example 7 No Listing Tn ZFS No Properties .Pp The following command lists all properties for .Em pool/home/bob . 
.Bd -literal -offset 2n .Li # Ic zfs get all pool/home/bob NAME PROPERTY VALUE SOURCE pool/home/bob type filesystem - pool/home/bob creation Tue Jul 21 15:53 2009 - pool/home/bob used 21K - pool/home/bob available 20.0G - pool/home/bob referenced 21K - pool/home/bob compressratio 1.00x - pool/home/bob mounted yes - pool/home/bob quota 20G local pool/home/bob reservation none default pool/home/bob recordsize 128K default pool/home/bob mountpoint /home/bob default pool/home/bob sharenfs off default pool/home/bob checksum on default pool/home/bob compression on local pool/home/bob atime on default pool/home/bob devices on default pool/home/bob exec on default pool/home/bob filesystem_limit none default pool/home/bob setuid on default pool/home/bob readonly off default pool/home/bob jailed off default pool/home/bob snapdir hidden default pool/home/bob snapshot_limit none default pool/home/bob aclmode discard default pool/home/bob aclinherit restricted default pool/home/bob canmount on default pool/home/bob xattr on default pool/home/bob copies 1 default pool/home/bob version 5 - pool/home/bob utf8only off - pool/home/bob normalization none - pool/home/bob casesensitivity sensitive - pool/home/bob vscan off default pool/home/bob nbmand off default pool/home/bob sharesmb off default pool/home/bob refquota none default pool/home/bob refreservation none default pool/home/bob primarycache all default pool/home/bob secondarycache all default pool/home/bob usedbysnapshots 0 - pool/home/bob usedbydataset 21K - pool/home/bob usedbychildren 0 - pool/home/bob usedbyrefreservation 0 - pool/home/bob logbias latency default pool/home/bob dedup off default pool/home/bob mlslabel - pool/home/bob sync standard default pool/home/bob refcompressratio 1.00x - .Ed .Pp The following command gets a single property value. .Bd -literal -offset 2n .Li # Ic zfs get -H -o value compression pool/home/bob on .Ed .Pp The following command lists all properties with local settings for .Em pool/home/bob . .Bd -literal -offset 2n .Li # Ic zfs get -s local -o name,property,value all pool/home/bob NAME PROPERTY VALUE pool/home/bob quota 20G pool/home/bob compression on .Ed .It Sy Example 8 No Rolling Back a Tn ZFS No File System .Pp The following command reverts the contents of .Em pool/home/anne to the snapshot named .Em yesterday , deleting all intermediate snapshots. .Bd -literal -offset 2n .Li # Ic zfs rollback -r pool/home/anne@yesterday .Ed .It Sy Example 9 No Creating a Tn ZFS No Clone .Pp The following command creates a writable file system whose initial contents are the same as .Em pool/home/bob@yesterday . 
.Bd -literal -offset 2n .Li # Ic zfs clone pool/home/bob@yesterday pool/clone .Ed .It Sy Example 10 No Promoting a Tn ZFS No Clone .Pp The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal -offset 2n .Li # Ic zfs create pool/project/production .Ed .Pp Populate .Pa /pool/project/production with data and continue with the following commands: .Bd -literal -offset 2n .Li # Ic zfs snapshot pool/project/production@today .Li # Ic zfs clone pool/project/production@today pool/project/beta .Ed .Pp Now make changes to .Pa /pool/project/beta and continue with the following commands: .Bd -literal -offset 2n .Li # Ic zfs promote pool/project/beta .Li # Ic zfs rename pool/project/production pool/project/legacy .Li # Ic zfs rename pool/project/beta pool/project/production .Ed .Pp Once the legacy version is no longer needed, it can be destroyed. .Bd -literal -offset 2n .Li # Ic zfs destroy pool/project/legacy .Ed .It Sy Example 11 No Inheriting Tn ZFS No Properties .Pp The following command causes .Em pool/home/bob and .Em pool/home/anne to inherit the .Sy checksum property from their parent. .Bd -literal -offset 2n .Li # Ic zfs inherit checksum pool/home/bob pool/home/anne .Ed .It Sy Example 12 No Remotely Replicating Tn ZFS No Data .Pp The following commands send a full stream and then an incremental stream to a remote machine, restoring them into .Sy poolB/received/fs@a and .Sy poolB/received/fs@b , respectively. .Sy poolB must contain the file system .Sy poolB/received , and must not initially contain .Sy poolB/received/fs . .Bd -literal -offset 2n .Li # Ic zfs send pool/fs@a | ssh host zfs receive poolB/received/fs@a .Li # Ic zfs send -i a pool/fs@b | ssh host zfs receive poolB/received/fs .Ed .It Xo .Sy Example 13 Using the .Qq zfs receive -d Option .Xc .Pp The following command sends a full stream of .Sy poolA/fsA/fsB@snap to a remote machine, receiving it into .Sy poolB/received/fsA/fsB@snap . The .Sy fsA/fsB@snap portion of the received snapshot's name is determined from the name of the sent snapshot. .Sy poolB must contain the file system .Sy poolB/received . If .Sy poolB/received/fsA does not exist, it is created as an empty file system. .Bd -literal -offset 2n .Li # Ic zfs send poolA/fsA/fsB@snap | ssh host zfs receive -d poolB/received .Ed .It Sy Example 14 No Setting User Properties .Pp The following example sets the user-defined .Sy com.example:department property for a dataset. .Bd -literal -offset 2n .Li # Ic zfs set com.example:department=12345 tank/accounting .Ed .It Sy Example 15 No Performing a Rolling Snapshot .Pp The following example shows how to maintain a history of snapshots with a consistent naming scheme. 
To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows: .Bd -literal -offset 2n .Li # Ic zfs destroy -r pool/users@7daysago .Li # Ic zfs rename -r pool/users@6daysago @7daysago .Li # Ic zfs rename -r pool/users@5daysago @6daysago .Li # Ic zfs rename -r pool/users@4daysago @5daysago .Li # Ic zfs rename -r pool/users@3daysago @4daysago .Li # Ic zfs rename -r pool/users@2daysago @3daysago .Li # Ic zfs rename -r pool/users@yesterday @2daysago .Li # Ic zfs rename -r pool/users@today @yesterday .Li # Ic zfs snapshot -r pool/users@today .Ed .It Xo .Sy Example 16 Setting .Qq sharenfs Property Options on a ZFS File System .Xc .Pp The following command shows how to set .Sy sharenfs property options to enable root access for a specific network on the .Em tank/home file system. The contents of the .Sy sharenfs property are valid .Xr exports 5 options. .Bd -literal -offset 2n .Li # Ic zfs set sharenfs="maproot=root,network 192.168.0.0/24" tank/home .Ed .Pp Another way to write this command with the same result is: .Bd -literal -offset 2n .Li # Ic zfs set sharenfs="-maproot=root -network 192.168.0.0/24" tank/home .Ed .It Xo .Sy Example 17 Delegating .Tn ZFS Administration Permissions on a .Tn ZFS Dataset .Xc .Pp The following example shows how to set permissions so that user .Em cindys can create, destroy, mount, and take snapshots on .Em tank/cindys . The permissions on .Em tank/cindys are also displayed. .Bd -literal -offset 2n .Li # Ic zfs allow cindys create,destroy,mount,snapshot tank/cindys .Li # Ic zfs allow tank/cindys ---- Permissions on tank/cindys -------------------------------------- Local+Descendent permissions: user cindys create,destroy,mount,snapshot .Ed .It Sy Example 18 No Delegating Create Time Permissions on a Tn ZFS No Dataset .Pp The following example shows how to grant anyone in the group .Em staff permission to create file systems in .Em tank/users . This syntax also allows staff members to destroy their own file systems, but not destroy anyone else's file system. The permissions on .Em tank/users are also displayed. .Bd -literal -offset 2n .Li # Ic zfs allow staff create,mount tank/users .Li # Ic zfs allow -c destroy tank/users .Li # Ic zfs allow tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: destroy Local+Descendent permissions: group staff create,mount .Ed .It Xo .Sy Example 19 Defining and Granting a Permission Set on a .Tn ZFS Dataset .Xc .Pp The following example shows how to define and grant a permission set on the .Em tank/users file system. The permissions on .Em tank/users are also displayed. .Bd -literal -offset 2n .Li # Ic zfs allow -s @pset create,destroy,snapshot,mount tank/users .Li # Ic zfs allow staff @pset tank/users .Li # Ic zfs allow tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed .It Sy Example 20 No Delegating Property Permissions on a Tn ZFS No Dataset .Pp The following example shows how to grant the ability to set quotas and reservations on the .Sy users/home file system. The permissions on .Sy users/home are also displayed.
.Bd -literal -offset 2n .Li # Ic zfs allow cindys quota,reservation users/home .Li # Ic zfs allow users/home ---- Permissions on users/home --------------------------------------- Local+Descendent permissions: user cindys quota,reservation .Li # Ic su - cindys .Li cindys% Ic zfs set quota=10G users/home/marks .Li cindys% Ic zfs get quota users/home/marks NAME PROPERTY VALUE SOURCE users/home/marks quota 10G local .Ed .It Sy Example 21 No Removing ZFS Delegated Permissions on a Tn ZFS No Dataset .Pp The following example shows how to remove the snapshot permission from the .Em staff group on the .Em tank/users file system. The permissions on .Em tank/users are also displayed. .Bd -literal -offset 2n .Li # Ic zfs unallow staff snapshot tank/users .Li # Ic zfs allow tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed .It Sy Example 22 Showing the differences between a snapshot and a ZFS Dataset .Pp The following example shows how to see what has changed between a prior snapshot of a ZFS Dataset and its current state. The .Fl F option is used to indicate type information for the files affected. .Bd -literal -offset 2n .Li # Ic zfs diff tank/test@before tank/test M / /tank/test/ M F /tank/test/linked (+1) R F /tank/test/oldname -> /tank/test/newname - F /tank/test/deleted + F /tank/test/created M F /tank/test/modified .Ed .El .Sh SEE ALSO .Xr chmod 2 , .Xr fsync 2 , .Xr exports 5 , .Xr fstab 5 , .Xr rc.conf 5 , .Xr jail 8 , .Xr mount 8 , .Xr umount 8 , .Xr zpool 8 .Sh AUTHORS This manual page is a .Xr mdoc 7 reimplementation of the .Tn OpenSolaris manual page .Em zfs(1M) , modified and customized for .Fx and licensed under the Common Development and Distribution License .Pq Tn CDDL . .Pp The .Xr mdoc 7 implementation of this manual page was initially written by .An Martin Matuska Aq mm@FreeBSD.org . Index: projects/clang900-import/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c (revision 352536) +++ projects/clang900-import/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c (revision 352537) @@ -1,7546 +1,7537 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. * Copyright (c) 2012 Martin Matuska . All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Igor Kozhukhov . * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2018 Datto Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef illumos #include #include #include #include #endif #include "zfs_iter.h" #include "zfs_util.h" #include "zfs_comutil.h" libzfs_handle_t *g_zfs; static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); static int zfs_do_destroy(int argc, char **argv); static int zfs_do_get(int argc, char **argv); static int zfs_do_inherit(int argc, char **argv); static int zfs_do_list(int argc, char **argv); static int zfs_do_mount(int argc, char **argv); static int zfs_do_rename(int argc, char **argv); static int zfs_do_rollback(int argc, char **argv); static int zfs_do_set(int argc, char **argv); static int zfs_do_upgrade(int argc, char **argv); static int zfs_do_snapshot(int argc, char **argv); static int zfs_do_unmount(int argc, char **argv); static int zfs_do_share(int argc, char **argv); static int zfs_do_unshare(int argc, char **argv); static int zfs_do_send(int argc, char **argv); static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); static int zfs_do_userspace(int argc, char **argv); static int zfs_do_allow(int argc, char **argv); static int zfs_do_unallow(int argc, char **argv); static int zfs_do_hold(int argc, char **argv); static int zfs_do_holds(int argc, char **argv); static int zfs_do_release(int argc, char **argv); static int zfs_do_diff(int argc, char **argv); static int zfs_do_jail(int argc, char **argv); static int zfs_do_unjail(int argc, char **argv); static int zfs_do_bookmark(int argc, char **argv); static int zfs_do_remap(int argc, char **argv); static int zfs_do_channel_program(int argc, char **argv); /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_CLONE, HELP_CREATE, HELP_DESTROY, HELP_GET, HELP_INHERIT, HELP_UPGRADE, HELP_JAIL, HELP_UNJAIL, HELP_LIST, HELP_MOUNT, HELP_PROMOTE, HELP_RECEIVE, HELP_RENAME, HELP_ROLLBACK, HELP_SEND, HELP_SET, HELP_SHARE, HELP_SNAPSHOT, HELP_UNMOUNT, HELP_UNSHARE, HELP_ALLOW, HELP_UNALLOW, HELP_USERSPACE, HELP_GROUPSPACE, HELP_HOLD, HELP_HOLDS, HELP_RELEASE, HELP_DIFF, HELP_REMAP, HELP_BOOKMARK, HELP_CHANNEL_PROGRAM, } zfs_help_t; typedef struct zfs_command { const char *name; int (*func)(int argc, char **argv); zfs_help_t usage; } zfs_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. 
*/ static zfs_command_t command_table[] = { { "create", zfs_do_create, HELP_CREATE }, { "destroy", zfs_do_destroy, HELP_DESTROY }, { NULL }, { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT }, { "rollback", zfs_do_rollback, HELP_ROLLBACK }, { "clone", zfs_do_clone, HELP_CLONE }, { "promote", zfs_do_promote, HELP_PROMOTE }, { "rename", zfs_do_rename, HELP_RENAME }, { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, { NULL }, { "list", zfs_do_list, HELP_LIST }, { NULL }, { "set", zfs_do_set, HELP_SET }, { "get", zfs_do_get, HELP_GET }, { "inherit", zfs_do_inherit, HELP_INHERIT }, { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, { "userspace", zfs_do_userspace, HELP_USERSPACE }, { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, { NULL }, { "mount", zfs_do_mount, HELP_MOUNT }, { "unmount", zfs_do_unmount, HELP_UNMOUNT }, { "share", zfs_do_share, HELP_SHARE }, { "unshare", zfs_do_unshare, HELP_UNSHARE }, { NULL }, { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, { "allow", zfs_do_allow, HELP_ALLOW }, { NULL }, { "unallow", zfs_do_unallow, HELP_UNALLOW }, { NULL }, { "hold", zfs_do_hold, HELP_HOLD }, { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, { "diff", zfs_do_diff, HELP_DIFF }, { NULL }, { "jail", zfs_do_jail, HELP_JAIL }, { "unjail", zfs_do_unjail, HELP_UNJAIL }, { "remap", zfs_do_remap, HELP_REMAP }, }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) zfs_command_t *current_command; static const char * get_usage(zfs_help_t idx) { switch (idx) { case HELP_CLONE: return (gettext("\tclone [-p] [-o property=value] ... " " \n")); case HELP_CREATE: return (gettext("\tcreate [-pu] [-o property=value] ... " "\n" "\tcreate [-ps] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: return (gettext("\tdestroy [-fnpRrv] \n" "\tdestroy [-dnpRrv] " "@[%][,...]\n" "\tdestroy #\n")); case HELP_GET: return (gettext("\tget [-rHp] [-d max] " "[-o \"all\" | field[,...]]\n" "\t [-t type[,...]] [-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot|bookmark] ...\n")); case HELP_INHERIT: return (gettext("\tinherit [-rS] " " ...\n")); case HELP_UPGRADE: return (gettext("\tupgrade [-v]\n" "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); case HELP_JAIL: return (gettext("\tjail \n")); case HELP_UNJAIL: return (gettext("\tunjail \n")); case HELP_LIST: return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " "[-s property]...\n\t [-S property]... [-t type[,...]] " "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" "\tmount [-vO] [-o opts] <-a | filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: return (gettext("\treceive|recv [-vnsFu] \n" "\treceive|recv [-vnsFu] [-o origin=] [-d | -e] " "\n" "\treceive|recv -A \n")); case HELP_RENAME: return (gettext("\trename [-f] " "\n" "\trename [-f] -p \n" "\trename -r \n" "\trename -u [-p] ")); case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: return (gettext("\tsend [-DnPpRvLec] [-[iI] snapshot] " "\n" - "\tsend [-Le] [-i snapshot|bookmark] " + "\tsend [-LPcenv] [-i snapshot|bookmark] " "\n" "\tsend [-nvPe] -t \n")); case HELP_SET: return (gettext("\tset ... " " ...\n")); case HELP_SHARE: return (gettext("\tshare <-a | filesystem>\n")); case HELP_SNAPSHOT: return (gettext("\tsnapshot|snap [-r] [-o property=value] ... 
" "@ ...\n")); case HELP_UNMOUNT: return (gettext("\tunmount|umount [-f] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: return (gettext("\tunshare " "<-a | filesystem|mountpoint>\n")); case HELP_ALLOW: return (gettext("\tallow \n" "\tallow [-ldug] " "<\"everyone\"|user|group>[,...] [,...]\n" "\t \n" "\tallow [-ld] -e [,...] " "\n" "\tallow -c [,...] \n" "\tallow -s @setname [,...] " "\n")); case HELP_UNALLOW: return (gettext("\tunallow [-rldug] " "<\"everyone\"|user|group>[,...]\n" "\t [[,...]] \n" "\tunallow [-rld] -e [[,...]] " "\n" "\tunallow [-r] -c [[,...]] " "\n" "\tunallow [-r] -s @setname [[,...]] " "\n")); case HELP_USERSPACE: return (gettext("\tuserspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_GROUPSPACE: return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_HOLD: return (gettext("\thold [-r] ...\n")); case HELP_HOLDS: return (gettext("\tholds [-Hp] [-r|-d depth] " " ...\n")); case HELP_RELEASE: return (gettext("\trelease [-r] ...\n")); case HELP_DIFF: return (gettext("\tdiff [-FHt] " "[snapshot|filesystem]\n")); case HELP_REMAP: return (gettext("\tremap \n")); case HELP_BOOKMARK: return (gettext("\tbookmark \n")); case HELP_CHANNEL_PROGRAM: return (gettext("\tprogram [-jn] [-t ] " "[-m ] " "[lua args...]\n")); } abort(); /* NOTREACHED */ } void nomem(void) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); exit(1); } /* * Utility function to guarantee malloc() success. */ void * safe_malloc(size_t size) { void *data; if ((data = calloc(1, size)) == NULL) nomem(); return (data); } void * safe_realloc(void *data, size_t size) { void *newp; if ((newp = realloc(data, size)) == NULL) { free(data); nomem(); } return (newp); } static char * safe_strdup(char *str) { char *dupstr = strdup(str); if (dupstr == NULL) nomem(); return (dupstr); } /* * Callback routine that will print out information for each of * the properties. */ static int usage_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); if (zfs_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, "YES "); if (zfs_prop_inheritable(prop)) (void) fprintf(fp, " YES "); else (void) fprintf(fp, " NO "); if (zfs_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zfs_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static void usage(boolean_t requested) { int i; boolean_t show_properties = B_FALSE; FILE *fp = requested ? 
stdout : stderr; if (current_command == NULL) { (void) fprintf(fp, gettext("usage: zfs command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } (void) fprintf(fp, gettext("\nEach dataset is of the form: " "pool/[dataset/]*dataset[@name]\n")); } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && (strcmp(current_command->name, "set") == 0 || strcmp(current_command->name, "get") == 0 || strcmp(current_command->name, "inherit") == 0 || strcmp(current_command->name, "list") == 0)) show_properties = B_TRUE; if (show_properties) { (void) fprintf(fp, gettext("\nThe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", "PROPERTY", "EDIT", "INHERIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); (void) fprintf(fp, "\t%-15s ", "userused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "groupused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "userquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "written@"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); (void) fprintf(fp, gettext("\nUser-defined properties can " "be specified by using a name containing a colon (:).\n")); (void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ " "properties must be appended with\n" "a user or group specifier of one of these forms:\n" " POSIX name (eg: \"matt\")\n" " POSIX id (eg: \"126829\")\n" " SMB name@domain (eg: \"matt@sun\")\n" " SMB SID (eg: \"S-1-234-567-89\")\n")); } else { (void) fprintf(fp, gettext("\nFor the property list, run: %s\n"), "zfs set|get"); (void) fprintf(fp, gettext("\nFor the delegated permission list, run: %s\n"), "zfs allow|unallow"); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * Take a property=value argument string and add it to the given nvlist. * Modifies the argument inplace. 
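 *
 * For example, an (illustrative) argument string "compression=lz4" has its
 * '=' overwritten with '\0', leaving "compression" as the nvlist name and
 * "lz4" as its string value; a property that is already present in the
 * nvlist is rejected as specified multiple times.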
*/ static int parseprop(nvlist_t *props, char *propname) { char *propval, *strval; if ((propval = strchr(propname, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for property=value argument\n")); return (-1); } *propval = '\0'; propval++; if (nvlist_lookup_string(props, propname, &strval) == 0) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (-1); } if (nvlist_add_string(props, propname, propval) != 0) nomem(); return (0); } static int parse_depth(char *opt, int *flags) { char *tmp; int depth; depth = (int)strtol(opt, &tmp, 0); if (*tmp) { (void) fprintf(stderr, gettext("%s is not an integer\n"), opt); usage(B_FALSE); } if (depth < 0) { (void) fprintf(stderr, gettext("Depth can not be negative.\n")); usage(B_FALSE); } *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); return (depth); } #define PROGRESS_DELAY 2 /* seconds */ static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; static time_t pt_begin; static char *pt_header = NULL; static boolean_t pt_shown; static void start_progress_timer(void) { pt_begin = time(NULL) + PROGRESS_DELAY; pt_shown = B_FALSE; } static void set_progress_header(char *header) { assert(pt_header == NULL); pt_header = safe_strdup(header); if (pt_shown) { (void) printf("%s: ", header); (void) fflush(stdout); } } static void update_progress(char *update) { if (!pt_shown && time(NULL) > pt_begin) { int len = strlen(update); (void) printf("%s: %s%*.*s", pt_header, update, len, len, pt_reverse); (void) fflush(stdout); pt_shown = B_TRUE; } else if (pt_shown) { int len = strlen(update); (void) printf("%s%*.*s", update, len, len, pt_reverse); (void) fflush(stdout); } } static void finish_progress(char *done) { if (pt_shown) { (void) printf("%s\n", done); (void) fflush(stdout); } free(pt_header); pt_header = NULL; } /* * Check if the dataset is mountable and should be automatically mounted. */ static boolean_t should_auto_mount(zfs_handle_t *zhp) { if (!zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, zfs_get_type(zhp))) return (B_FALSE); return (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON); } /* * zfs clone [-p] [-o prop=value] ... * * Given an existing dataset, create a writable copy whose initial contents * are the same as the source. The newly created dataset maintains a * dependency on the original; the original cannot be destroyed so long as * the clone exists. * * The '-p' flag creates all the non-existing ancestors of the target first. 
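 *
 * Illustrative usage (dataset and snapshot names are hypothetical):
 *
 *	# zfs clone -p -o mountpoint=/export/web tank/data@monday tank/copies/web
 *
 * This creates any missing ancestors of tank/copies/web before cloning, and
 * the new clone is then mounted and shared unless its canmount setting
 * prevents automatic mounting.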
*/ static int zfs_do_clone(int argc, char **argv) { zfs_handle_t *zhp = NULL; boolean_t parents = B_FALSE; nvlist_t *props; int ret = 0; int c; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "o:p")) != -1) { switch (c) { case 'o': if (parseprop(props, optarg) != 0) return (1); break; case 'p': parents = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); goto usage; } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); goto usage; } /* open the source dataset */ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) return (1); if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { /* * Now create the ancestors of the target dataset. If the * target already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) return (0); if (zfs_create_ancestors(g_zfs, argv[1]) != 0) return (1); } /* pass to libzfs */ ret = zfs_clone(zhp, argv[1], props); /* create the mountpoint if necessary */ if (ret == 0) { zfs_handle_t *clone; clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET); if (clone != NULL) { /* * If the user doesn't want the dataset * automatically mounted, then skip the mount/share * step. */ if (should_auto_mount(clone)) { if ((ret = zfs_mount(clone, NULL, 0)) != 0) { (void) fprintf(stderr, gettext("clone " "successfully created, " "but not mounted\n")); } else if ((ret = zfs_share(clone)) != 0) { (void) fprintf(stderr, gettext("clone " "successfully created, " "but not shared\n")); } } zfs_close(clone); } } zfs_close(zhp); nvlist_free(props); return (!!ret); usage: if (zhp) zfs_close(zhp); nvlist_free(props); usage(B_FALSE); return (-1); } /* * zfs create [-pu] [-o prop=value] ... fs * zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size * * Create a new dataset. This command can be used to create filesystems * and volumes. Snapshot creation is handled by 'zfs snapshot'. * For volumes, the user must specify a size to be used. * * The '-s' flag applies only to volumes, and indicates that we should not try * to set the reservation for this volume. By default we set a reservation * equal to the size for any volume. For pools with SPA_VERSION >= * SPA_VERSION_REFRESERVATION, we set a refreservation instead. * * The '-p' flag creates all the non-existing ancestors of the target first. * * The '-u' flag prevents mounting of newly created file system. 
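 *
 * Illustrative usage (names and sizes are hypothetical):
 *
 *	# zfs create -p -o compression=on tank/projects/src
 *	# zfs create -s -V 10G tank/vols/scratch
 *
 * The first creates a filesystem together with its missing ancestors; the
 * second creates a sparse 10G volume, skipping the (ref)reservation it
 * would otherwise be given.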
*/ static int zfs_do_create(int argc, char **argv) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; zfs_handle_t *zhp = NULL; uint64_t volsize = 0; int c; boolean_t noreserve = B_FALSE; boolean_t bflag = B_FALSE; boolean_t parents = B_FALSE; boolean_t nomount = B_FALSE; int ret = 1; nvlist_t *props; uint64_t intval; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":V:b:so:pu")) != -1) { switch (c) { case 'V': type = ZFS_TYPE_VOLUME; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) nomem(); volsize = intval; break; case 'p': parents = B_TRUE; break; case 'b': bflag = B_TRUE; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "block size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), intval) != 0) nomem(); break; case 'o': if (parseprop(props, optarg) != 0) goto error; break; case 's': noreserve = B_TRUE; break; case 'u': nomount = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing size " "argument\n")); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) { (void) fprintf(stderr, gettext("'-s' and '-b' can only be " "used when creating a volume\n")); goto badusage; } if (nomount && type != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("'-u' can only be " "used when creating a file system\n")); goto badusage; } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing %s argument\n"), zfs_type_to_name(type)); goto badusage; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); goto badusage; } if (type == ZFS_TYPE_VOLUME && !noreserve) { zpool_handle_t *zpool_handle; nvlist_t *real_props = NULL; uint64_t spa_version; char *p; zfs_prop_t resv_prop; char *strval; char msg[1024]; if ((p = strchr(argv[0], '/')) != NULL) *p = '\0'; zpool_handle = zpool_open(g_zfs, argv[0]); if (p != NULL) *p = '/'; if (zpool_handle == NULL) goto error; spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); if (spa_version >= SPA_VERSION_REFRESERVATION) resv_prop = ZFS_PROP_REFRESERVATION; else resv_prop = ZFS_PROP_RESERVATION; (void) snprintf(msg, sizeof (msg), gettext("cannot create '%s'"), argv[0]); if (props && (real_props = zfs_valid_proplist(g_zfs, type, props, 0, NULL, zpool_handle, msg)) == NULL) { zpool_close(zpool_handle); goto error; } zpool_close(zpool_handle); volsize = zvol_volsize_to_reservation(volsize, real_props); nvlist_free(real_props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { if (nvlist_add_uint64(props, zfs_prop_to_name(resv_prop), volsize) != 0) { nvlist_free(props); nomem(); } } } if (parents && zfs_name_valid(argv[0], type)) { /* * Now create the ancestors of target dataset. If the target * already exists and '-p' option was used we should not * complain. 
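 * For instance, a (hypothetical) 'zfs create -p tank/a/b/c' simply returns
 * success when tank/a/b/c already exists.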
*/ if (zfs_dataset_exists(g_zfs, argv[0], type)) { ret = 0; goto error; } if (zfs_create_ancestors(g_zfs, argv[0]) != 0) goto error; } /* pass to libzfs */ if (zfs_create(g_zfs, argv[0], type, props) != 0) goto error; if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) goto error; ret = 0; /* * Mount and/or share the new filesystem as appropriate. We provide a * verbose error message to let the user know that their filesystem was * in fact created, even if we failed to mount or share it. * If the user doesn't want the dataset automatically mounted, * then skip the mount/share step altogether. */ if (!nomount && should_auto_mount(zhp)) { if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not mounted\n")); ret = 1; } else if (zfs_share(zhp) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not shared\n")); ret = 1; } } error: if (zhp) zfs_close(zhp); nvlist_free(props); return (ret); badusage: nvlist_free(props); usage(B_FALSE); return (2); } /* * zfs destroy [-rRf] * zfs destroy [-rRd] * * -r Recursively destroy all children * -R Recursively destroy all dependents, including clones * -f Force unmounting of any dependents * -d If we can't destroy now, mark for deferred destruction * * Destroys the given dataset. By default, it will unmount any filesystems, * and refuse to destroy a dataset that has any dependents. A dependent can * either be a child, or a clone of a child. */ typedef struct destroy_cbdata { boolean_t cb_first; boolean_t cb_force; boolean_t cb_recurse; boolean_t cb_error; boolean_t cb_doclones; zfs_handle_t *cb_target; boolean_t cb_defer_destroy; boolean_t cb_verbose; boolean_t cb_parsable; boolean_t cb_dryrun; nvlist_t *cb_nvl; nvlist_t *cb_batchedsnaps; /* first snap in contiguous run */ char *cb_firstsnap; /* previous snap in contiguous run */ char *cb_prevsnap; int64_t cb_snapused; char *cb_snapspec; char *cb_bookmark; } destroy_cbdata_t; /* * Check for any dependents based on the '-r' or '-R' flags. */ static int destroy_check_dependent(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cbp = data; const char *tname = zfs_get_name(cbp->cb_target); const char *name = zfs_get_name(zhp); if (strncmp(tname, name, strlen(tname)) == 0 && (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) { /* * This is a direct descendant, not a clone somewhere else in * the hierarchy. */ if (cbp->cb_recurse) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has children\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-r' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } else { /* * This is a clone. We only want to report this if the '-r' * wasn't specified, or the target is a snapshot. 
*/ if (!cbp->cb_recurse && zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has dependent clones\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-R' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; cbp->cb_dryrun = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } out: zfs_close(zhp); return (0); } static int destroy_callback(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cb = data; const char *name = zfs_get_name(zhp); if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } /* * Ignore pools (which we've already flagged as an error before getting * here). */ if (strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { zfs_close(zhp); return (0); } if (cb->cb_dryrun) { zfs_close(zhp); return (0); } /* * We batch up all contiguous snapshots (even of different * filesystems) and destroy them with one ioctl. We can't * simply do all snap deletions and then all fs deletions, * because we must delete a clone before its origin. */ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { fnvlist_add_boolean(cb->cb_batchedsnaps, name); } else { int error = zfs_destroy_snaps_nvl(g_zfs, cb->cb_batchedsnaps, B_FALSE); fnvlist_free(cb->cb_batchedsnaps); cb->cb_batchedsnaps = fnvlist_alloc(); if (error != 0 || zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 || zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { zfs_close(zhp); return (-1); } } zfs_close(zhp); return (0); } static int destroy_print_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; const char *name = zfs_get_name(zhp); int err = 0; if (nvlist_exists(cb->cb_nvl, name)) { if (cb->cb_firstsnap == NULL) cb->cb_firstsnap = strdup(name); if (cb->cb_prevsnap != NULL) free(cb->cb_prevsnap); /* this snap continues the current range */ cb->cb_prevsnap = strdup(name); if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) nomem(); if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } } else if (cb->cb_firstsnap != NULL) { /* end of this range */ uint64_t used = 0; err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } zfs_close(zhp); return (err); } static int destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) { int err = 0; assert(cb->cb_firstsnap == NULL); assert(cb->cb_prevsnap == NULL); err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb); if (cb->cb_firstsnap != NULL) { uint64_t used = 0; if (err == 0) { err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); } cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } return (err); } static int snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; /* Check for clones. 
*/ if (!cb->cb_doclones && !cb->cb_defer_destroy) { cb->cb_target = zhp; cb->cb_first = B_TRUE; err = zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, cb); } if (err == 0) { if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) nomem(); } zfs_close(zhp); return (err); } static int gather_snapshots(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb); if (err == ENOENT) err = 0; if (err != 0) goto out; if (cb->cb_verbose) { err = destroy_print_snapshots(zhp, cb); if (err != 0) goto out; } if (cb->cb_recurse) err = zfs_iter_filesystems(zhp, gather_snapshots, cb); out: zfs_close(zhp); return (err); } static int destroy_clones(destroy_cbdata_t *cb) { nvpair_t *pair; for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); pair != NULL; pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { boolean_t defer = cb->cb_defer_destroy; int err = 0; /* * We can't defer destroy non-snapshots, so set it to * false while destroying the clones. */ cb->cb_defer_destroy = B_FALSE; err = zfs_iter_dependents(zhp, B_FALSE, destroy_callback, cb); cb->cb_defer_destroy = defer; zfs_close(zhp); if (err != 0) return (err); } } return (0); } static int zfs_do_destroy(int argc, char **argv) { destroy_cbdata_t cb = { 0 }; int rv = 0; int err = 0; int c; zfs_handle_t *zhp = NULL; char *at, *pound; zfs_type_t type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, "vpndfrR")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; break; case 'p': cb.cb_verbose = B_TRUE; cb.cb_parsable = B_TRUE; break; case 'n': cb.cb_dryrun = B_TRUE; break; case 'd': cb.cb_defer_destroy = B_TRUE; type = ZFS_TYPE_SNAPSHOT; break; case 'f': cb.cb_force = B_TRUE; break; case 'r': cb.cb_recurse = B_TRUE; break; case 'R': cb.cb_recurse = B_TRUE; cb.cb_doclones = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } at = strchr(argv[0], '@'); pound = strchr(argv[0], '#'); if (at != NULL) { /* Build the list of snaps to destroy in cb_nvl. 
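 * The snapshot spec after '@' may name several snapshots at once, e.g. a
 * (hypothetical) 'zfs destroy tank/fs@a,b' or a range such as
 * 'tank/fs@a%d'; gather_snapshots() expands it via zfs_iter_snapspec().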
*/ cb.cb_nvl = fnvlist_alloc(); *at = '\0'; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); cb.cb_snapspec = at + 1; if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || cb.cb_error) { rv = 1; goto out; } if (nvlist_empty(cb.cb_nvl)) { (void) fprintf(stderr, gettext("could not find any " "snapshots to destroy; check snapshot names.\n")); rv = 1; goto out; } if (cb.cb_verbose) { char buf[16]; zfs_nicenum(cb.cb_snapused, buf, sizeof (buf)); if (cb.cb_parsable) { (void) printf("reclaim\t%llu\n", cb.cb_snapused); } else if (cb.cb_dryrun) { (void) printf(gettext("would reclaim %s\n"), buf); } else { (void) printf(gettext("will reclaim %s\n"), buf); } } if (!cb.cb_dryrun) { if (cb.cb_doclones) { cb.cb_batchedsnaps = fnvlist_alloc(); err = destroy_clones(&cb); if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, B_FALSE); } if (err != 0) { rv = 1; goto out; } } if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, cb.cb_defer_destroy); } } if (err != 0) rv = 1; } else if (pound != NULL) { int err; nvlist_t *nvl; if (cb.cb_dryrun) { (void) fprintf(stderr, "dryrun is not supported with bookmark\n"); return (-1); } if (cb.cb_defer_destroy) { (void) fprintf(stderr, "defer destroy is not supported with bookmark\n"); return (-1); } if (cb.cb_recurse) { (void) fprintf(stderr, "recursive is not supported with bookmark\n"); return (-1); } if (!zfs_bookmark_exists(argv[0])) { (void) fprintf(stderr, gettext("bookmark '%s' " "does not exist.\n"), argv[0]); return (1); } nvl = fnvlist_alloc(); fnvlist_add_boolean(nvl, argv[0]); err = lzc_destroy_bookmarks(nvl, NULL); if (err != 0) { (void) zfs_standard_error(g_zfs, err, "cannot destroy bookmark"); } nvlist_free(cb.cb_nvl); return (err); } else { /* Open the given dataset */ if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) return (1); cb.cb_target = zhp; /* * Perform an explicit check for pools before going any further. */ if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "operation does not apply to pools\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zfs destroy -r " "%s' to destroy all datasets in the pool\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zpool destroy %s' " "to destroy the pool itself\n"), zfs_get_name(zhp)); rv = 1; goto out; } /* * Check for any dependents and/or clones. */ cb.cb_first = B_TRUE; if (!cb.cb_doclones && zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, &cb) != 0) { rv = 1; goto out; } if (cb.cb_error) { rv = 1; goto out; } cb.cb_batchedsnaps = fnvlist_alloc(); if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) { rv = 1; goto out; } /* * Do the real thing. The callback will close the * handle regardless of whether it succeeds or not. */ err = destroy_callback(zhp, &cb); zhp = NULL; if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, cb.cb_defer_destroy); } if (err != 0) rv = 1; } out: fnvlist_free(cb.cb_batchedsnaps); fnvlist_free(cb.cb_nvl); if (zhp != NULL) zfs_close(zhp); return (rv); } static boolean_t is_recvd_column(zprop_get_cbdata_t *cbp) { int i; zfs_get_column_t col; for (i = 0; i < ZFS_GET_NCOLS && (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) if (col == GET_COL_RECVD) return (B_TRUE); return (B_FALSE); } /* * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] * < all | property[,property]... > < fs | snap | vol > ... 
* * -r recurse over any child datasets * -H scripted mode. Headers are stripped, and fields are separated * by tabs instead of spaces. * -o Set of fields to display. One of "name,property,value, * received,source". Default is "name,property,value,source". * "all" is an alias for all five. * -s Set of sources to allow. One of * "local,default,inherited,received,temporary,none". Default is * all six. * -p Display values in parsable (literal) format. * * Prints properties for the given datasets. The user can control which * columns to display as well as which property types to allow. */ /* * Invoked to display the properties for a single dataset. */ static int get_callback(zfs_handle_t *zhp, void *data) { char buf[ZFS_MAXPROPLEN]; char rbuf[ZFS_MAXPROPLEN]; zprop_source_t sourcetype; char source[ZFS_MAX_DATASET_NAME_LEN]; zprop_get_cbdata_t *cbp = data; nvlist_t *user_props = zfs_get_user_props(zhp); zprop_list_t *pl = cbp->cb_proplist; nvlist_t *propval; char *strval; char *sourceval; boolean_t received = is_recvd_column(cbp); for (; pl != NULL; pl = pl->pl_next) { char *recvdval = NULL; /* * Skip the special fake placeholder. This will also skip over * the name property when 'all' is specified. */ if (pl->pl_prop == ZFS_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, pl->pl_prop, buf, sizeof (buf), &sourcetype, source, sizeof (source), cbp->cb_literal) != 0) { if (pl->pl_all) continue; if (!zfs_prop_valid_for_type(pl->pl_prop, ZFS_TYPE_DATASET)) { (void) fprintf(stderr, gettext("No such property '%s'\n"), zfs_prop_to_name(pl->pl_prop)); continue; } sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } if (received && (zfs_prop_get_recvd(zhp, zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; zprop_print_one_property(zfs_get_name(zhp), cbp, zfs_prop_to_name(pl->pl_prop), buf, sourcetype, source, recvdval); } else if (zfs_prop_userquota(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); } else if (zfs_prop_written(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_written(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); } else { if (nvlist_lookup_nvlist(user_props, pl->pl_user_prop, &propval) != 0) { if (pl->pl_all) continue; sourcetype = ZPROP_SRC_NONE; strval = "-"; } else { verify(nvlist_lookup_string(propval, ZPROP_VALUE, &strval) == 0); verify(nvlist_lookup_string(propval, ZPROP_SOURCE, &sourceval) == 0); if (strcmp(sourceval, zfs_get_name(zhp)) == 0) { sourcetype = ZPROP_SRC_LOCAL; } else if (strcmp(sourceval, ZPROP_SOURCE_VAL_RECVD) == 0) { sourcetype = ZPROP_SRC_RECEIVED; } else { sourcetype = ZPROP_SRC_INHERITED; (void) strlcpy(source, sourceval, sizeof (source)); } } if (received && (zfs_prop_get_recvd(zhp, pl->pl_user_prop, rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, strval, sourcetype, source, recvdval); } } return (0); } static int zfs_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; int i, c, flags = 
ZFS_ITER_ARGS_CAN_BE_PATHS; int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; char *value, *fields; int ret = 0; int limit = 0; zprop_list_t fake_name = { 0 }; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'o': /* * Process the set of columns to display. We zero out * the structure to give us a blank slate. */ bzero(&cb.cb_columns, sizeof (cb.cb_columns)); i = 0; while (*optarg != '\0') { static char *col_subopts[] = { "name", "property", "value", "received", "source", "all", NULL }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } switch (getsubopt(&optarg, col_subopts, &value)) { case 0: cb.cb_columns[i++] = GET_COL_NAME; break; case 1: cb.cb_columns[i++] = GET_COL_PROPERTY; break; case 2: cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: cb.cb_columns[i++] = GET_COL_RECVD; flags |= ZFS_ITER_RECVD_PROPS; break; case 4: cb.cb_columns[i++] = GET_COL_SOURCE; break; case 5: if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_RECVD; cb.cb_columns[4] = GET_COL_SOURCE; flags |= ZFS_ITER_RECVD_PROPS; i = ZFS_GET_NCOLS; break; default: (void) fprintf(stderr, gettext("invalid column name " "'%s'\n"), suboptarg); usage(B_FALSE); } } break; case 's': cb.cb_sources = 0; while (*optarg != '\0') { static char *source_subopts[] = { "local", "default", "inherited", "received", "temporary", "none", NULL }; switch (getsubopt(&optarg, source_subopts, &value)) { case 0: cb.cb_sources |= ZPROP_SRC_LOCAL; break; case 1: cb.cb_sources |= ZPROP_SRC_DEFAULT; break; case 2: cb.cb_sources |= ZPROP_SRC_INHERITED; break; case 3: cb.cb_sources |= ZPROP_SRC_RECEIVED; break; case 4: cb.cb_sources |= ZPROP_SRC_TEMPORARY; break; case 5: cb.cb_sources |= ZPROP_SRC_NONE; break; default: (void) fprintf(stderr, gettext("invalid source " "'%s'\n"), suboptarg); usage(B_FALSE); } } break; case 't': types = 0; flags &= ~ZFS_ITER_PROP_LISTSNAPS; while (*optarg != '\0') { static char *type_subopts[] = { "filesystem", "volume", "snapshot", "bookmark", "all", NULL }; switch (getsubopt(&optarg, type_subopts, &value)) { case 0: types |= ZFS_TYPE_FILESYSTEM; break; case 1: types |= ZFS_TYPE_VOLUME; break; case 2: types |= ZFS_TYPE_SNAPSHOT; break; case 3: types |= ZFS_TYPE_BOOKMARK; break; case 4: types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; break; default: (void) fprintf(stderr, gettext("invalid type '%s'\n"), suboptarg); usage(B_FALSE); } } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } fields = argv[0]; if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) 
usage(B_FALSE); argc--; argv++; /* * As part of zfs_expand_proplist(), we keep track of the maximum column * width for each property. For the 'NAME' (and 'SOURCE') columns, we * need to know the maximum name length. However, the user likely did * not specify 'name' as one of the properties to fetch, so we need to * make sure we always include at least this property for * print_get_headers() to work properly. */ if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZFS_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } cb.cb_first = B_TRUE; /* run for each object */ ret = zfs_for_each(argc, argv, flags, types, NULL, &cb.cb_proplist, limit, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } /* * inherit [-rS] ... * * -r Recurse over all children * -S Revert to received value, if any * * For each dataset specified on the command line, inherit the given property * from its parent. Inheriting a property at the pool level will cause it to * use the default value. The '-r' flag will recurse over all children, and is * useful for setting a property on a hierarchy-wide basis, regardless of any * local modifications for each dataset. */ typedef struct inherit_cbdata { const char *cb_propname; boolean_t cb_received; } inherit_cbdata_t; static int inherit_recurse_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); /* * If we're doing it recursively, then ignore properties that * are not valid for this type of dataset. */ if (prop != ZPROP_INVAL && !zfs_prop_valid_for_type(prop, zfs_get_type(zhp))) return (0); return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int inherit_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int zfs_do_inherit(int argc, char **argv) { int c; zfs_prop_t prop; inherit_cbdata_t cb = { 0 }; char *propname; int ret = 0; int flags = 0; boolean_t received = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "rS")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'S': received = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing property argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } propname = argv[0]; argc--; argv++; if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { if (zfs_prop_readonly(prop)) { (void) fprintf(stderr, gettext( "%s property is read-only\n"), propname); return (1); } if (!zfs_prop_inheritable(prop) && !received) { (void) fprintf(stderr, gettext("'%s' property cannot " "be inherited\n"), propname); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { (void) fprintf(stderr, gettext("use 'zfs set " "%s=none' to clear\n"), propname); (void) fprintf(stderr, gettext("use 'zfs " "inherit -S %s' to revert to received " "value\n"), propname); } return (1); } if (received && (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION)) { (void) fprintf(stderr, gettext("'%s' property cannot " "be reverted to a received value\n"), propname); return (1); } } 
else if (!zfs_prop_user(propname)) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), propname); usage(B_FALSE); } cb.cb_propname = propname; cb.cb_received = received; if (flags & ZFS_ITER_RECURSE) { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_recurse_cb, &cb); } else { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_cb, &cb); } return (ret); } typedef struct upgrade_cbdata { uint64_t cb_numupgraded; uint64_t cb_numsamegraded; uint64_t cb_numfailed; uint64_t cb_version; boolean_t cb_newer; boolean_t cb_foundone; char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN]; } upgrade_cbdata_t; static int same_pool(zfs_handle_t *zhp, const char *name) { int len1 = strcspn(name, "/@"); const char *zhname = zfs_get_name(zhp); int len2 = strcspn(zhname, "/@"); if (len1 != len2) return (B_FALSE); return (strncmp(name, zhname, len1) == 0); } static int upgrade_list_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); /* list if it's old/new */ if ((!cb->cb_newer && version < ZPL_VERSION) || (cb->cb_newer && version > ZPL_VERSION)) { char *str; if (cb->cb_newer) { str = gettext("The following filesystems are " "formatted using a newer software version and\n" "cannot be accessed on the current system.\n\n"); } else { str = gettext("The following filesystems are " "out of date, and can be upgraded. After being\n" "upgraded, these filesystems (and any 'zfs send' " "streams generated from\n" "subsequent snapshots) will no longer be " "accessible by older software versions.\n\n"); } if (!cb->cb_foundone) { (void) puts(str); (void) printf(gettext("VER FILESYSTEM\n")); (void) printf(gettext("--- ------------\n")); cb->cb_foundone = B_TRUE; } (void) printf("%2u %s\n", version, zfs_get_name(zhp)); } return (0); } static int upgrade_set_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int needed_spa_version; int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); needed_spa_version = zfs_spa_version_map(cb->cb_version); if (needed_spa_version < 0) return (-1); if (spa_version < needed_spa_version) { /* can't upgrade */ (void) printf(gettext("%s: can not be " "upgraded; the pool version needs to first " "be upgraded\nto version %d\n\n"), zfs_get_name(zhp), needed_spa_version); cb->cb_numfailed++; return (0); } /* upgrade */ if (version < cb->cb_version) { char verstr[16]; (void) snprintf(verstr, sizeof (verstr), "%llu", cb->cb_version); if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) { /* * If they did "zfs upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). 
*/ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (zfs_prop_set(zhp, "version", verstr) == 0) cb->cb_numupgraded++; else cb->cb_numfailed++; (void) strcpy(cb->cb_lastfs, zfs_get_name(zhp)); } else if (version > cb->cb_version) { /* can't downgrade */ (void) printf(gettext("%s: can not be downgraded; " "it is already at version %u\n"), zfs_get_name(zhp), version); cb->cb_numfailed++; } else { cb->cb_numsamegraded++; } return (0); } /* * zfs upgrade * zfs upgrade -v * zfs upgrade [-r] [-V ] <-a | filesystem> */ static int zfs_do_upgrade(int argc, char **argv) { boolean_t all = B_FALSE; boolean_t showversions = B_FALSE; int ret = 0; upgrade_cbdata_t cb = { 0 }; int c; int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "rvV:a")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'v': showversions = B_TRUE; break; case 'V': if (zfs_prop_string_to_index(ZFS_PROP_VERSION, optarg, &cb.cb_version) != 0) { (void) fprintf(stderr, gettext("invalid version %s\n"), optarg); usage(B_FALSE); } break; case 'a': all = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version)) usage(B_FALSE); if (showversions && (flags & ZFS_ITER_RECURSE || all || cb.cb_version || argc)) usage(B_FALSE); if ((all || argc) && (showversions)) usage(B_FALSE); if (all && argc) usage(B_FALSE); if (showversions) { /* Show info on available versions. */ (void) printf(gettext("The following filesystem versions are " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); (void) printf(gettext(" 3 Case insensitive and filesystem " "user identifier (FUID)\n")); (void) printf(gettext(" 4 userquota, groupquota " "properties\n")); (void) printf(gettext(" 5 System attributes\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf("see the ZFS Administration Guide.\n\n"); ret = 0; } else if (argc || all) { /* Upgrade filesystems */ if (cb.cb_version == 0) cb.cb_version = ZPL_VERSION; ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_set_callback, &cb); (void) printf(gettext("%llu filesystems upgraded\n"), cb.cb_numupgraded); if (cb.cb_numsamegraded) { (void) printf(gettext("%llu filesystems already at " "this version\n"), cb.cb_numsamegraded); } if (cb.cb_numfailed != 0) ret = 1; } else { /* List old-version filesystems */ boolean_t found; (void) printf(gettext("This system is currently running " "ZFS filesystem version %llu.\n\n"), ZPL_VERSION); flags |= ZFS_ITER_RECURSE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); found = cb.cb_foundone; cb.cb_foundone = B_FALSE; cb.cb_newer = B_TRUE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); if (!cb.cb_foundone && !found) { (void) printf(gettext("All filesystems are " "formatted with the current version.\n")); } } return (ret); } /* * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...] 
* [-S field [-S field]...] [-t type[,...]] filesystem | snapshot * * -H Scripted mode; elide headers and separate columns by tabs. * -i Translate SID to POSIX ID. * -n Print numeric ID instead of user/group name. * -o Control which fields to display. * -p Use exact (parsable) numeric output. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * Displays space consumed by, and quotas on, each user in the specified * filesystem or snapshot. */ /* us_field_types, us_field_hdr and us_field_names should be kept in sync */ enum us_field_types { USFIELD_TYPE, USFIELD_NAME, USFIELD_USED, USFIELD_QUOTA }; static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" }; static char *us_field_names[] = { "type", "name", "used", "quota" }; #define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *)) #define USTYPE_PSX_GRP (1 << 0) #define USTYPE_PSX_USR (1 << 1) #define USTYPE_SMB_GRP (1 << 2) #define USTYPE_SMB_USR (1 << 3) #define USTYPE_ALL \ (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR) static int us_type_bits[] = { USTYPE_PSX_GRP, USTYPE_PSX_USR, USTYPE_SMB_GRP, USTYPE_SMB_USR, USTYPE_ALL }; static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup", "smbuser", "all" }; typedef struct us_node { nvlist_t *usn_nvl; uu_avl_node_t usn_avlnode; uu_list_node_t usn_listnode; } us_node_t; typedef struct us_cbdata { nvlist_t **cb_nvlp; uu_avl_pool_t *cb_avl_pool; uu_avl_t *cb_avl; boolean_t cb_numname; boolean_t cb_nicenum; boolean_t cb_sid2posix; zfs_userquota_prop_t cb_prop; zfs_sort_column_t *cb_sortcol; size_t cb_width[USFIELD_LAST]; } us_cbdata_t; static boolean_t us_populated = B_FALSE; typedef struct { zfs_sort_column_t *si_sortcol; boolean_t si_numname; } us_sort_info_t; static int us_field_index(char *field) { int i; for (i = 0; i < USFIELD_LAST; i++) { if (strcmp(field, us_field_names[i]) == 0) return (i); } return (-1); } static int us_compare(const void *larg, const void *rarg, void *unused) { const us_node_t *l = larg; const us_node_t *r = rarg; us_sort_info_t *si = (us_sort_info_t *)unused; zfs_sort_column_t *sortcol = si->si_sortcol; boolean_t numname = si->si_numname; nvlist_t *lnvl = l->usn_nvl; nvlist_t *rnvl = r->usn_nvl; int rc = 0; boolean_t lvb, rvb; for (; sortcol != NULL; sortcol = sortcol->sc_next) { char *lvstr = ""; char *rvstr = ""; uint32_t lv32 = 0; uint32_t rv32 = 0; uint64_t lv64 = 0; uint64_t rv64 = 0; zfs_prop_t prop = sortcol->sc_prop; const char *propname = NULL; boolean_t reverse = sortcol->sc_reverse; switch (prop) { case ZFS_PROP_TYPE: propname = "type"; (void) nvlist_lookup_uint32(lnvl, propname, &lv32); (void) nvlist_lookup_uint32(rnvl, propname, &rv32); if (rv32 != lv32) rc = (rv32 < lv32) ? 1 : -1; break; case ZFS_PROP_NAME: propname = "name"; if (numname) { compare_nums: (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 1 : -1; } else { if ((nvlist_lookup_string(lnvl, propname, &lvstr) == ENOENT) || (nvlist_lookup_string(rnvl, propname, &rvstr) == ENOENT)) { goto compare_nums; } rc = strcmp(lvstr, rvstr); } break; case ZFS_PROP_USED: case ZFS_PROP_QUOTA: if (!us_populated) break; if (prop == ZFS_PROP_USED) propname = "used"; else propname = "quota"; (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 
1 : -1; break; default: break; } if (rc != 0) { if (rc < 0) return (reverse ? 1 : -1); else return (reverse ? -1 : 1); } } /* * If entries still seem to be the same, check if they are of the same * type (smbentity is added only if we are doing SID to POSIX ID * translation where we can have duplicate type/name combinations). */ if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 && nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 && lvb != rvb) return (lvb < rvb ? -1 : 1); return (0); } static inline const char * us_type2str(unsigned field_type) { switch (field_type) { case USTYPE_PSX_USR: return ("POSIX User"); case USTYPE_PSX_GRP: return ("POSIX Group"); case USTYPE_SMB_USR: return ("SMB User"); case USTYPE_SMB_GRP: return ("SMB Group"); default: return ("Undefined"); } } static int userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) { us_cbdata_t *cb = (us_cbdata_t *)arg; zfs_userquota_prop_t prop = cb->cb_prop; char *name = NULL; char *propname; char sizebuf[32]; us_node_t *node; uu_avl_pool_t *avl_pool = cb->cb_avl_pool; uu_avl_t *avl = cb->cb_avl; uu_avl_index_t idx; nvlist_t *props; us_node_t *n; zfs_sort_column_t *sortcol = cb->cb_sortcol; unsigned type = 0; const char *typestr; size_t namelen; size_t typelen; size_t sizelen; int typeidx, nameidx, sizeidx; us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; boolean_t smbentity = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); node = safe_malloc(sizeof (us_node_t)); uu_avl_node_init(node, &node->usn_avlnode, avl_pool); node->usn_nvl = props; if (domain != NULL && domain[0] != '\0') { /* SMB */ char sid[MAXNAMELEN + 32]; uid_t id; #ifdef illumos int err; int flag = IDMAP_REQ_FLG_USE_CACHE; #endif smbentity = B_TRUE; (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { type = USTYPE_SMB_GRP; #ifdef illumos err = sid_to_id(sid, B_FALSE, &id); #endif } else { type = USTYPE_SMB_USR; #ifdef illumos err = sid_to_id(sid, B_TRUE, &id); #endif } #ifdef illumos if (err == 0) { rid = id; if (!cb->cb_sid2posix) { if (type == USTYPE_SMB_USR) { (void) idmap_getwinnamebyuid(rid, flag, &name, NULL); } else { (void) idmap_getwinnamebygid(rid, flag, &name, NULL); } if (name == NULL) name = sid; } } #endif } if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') { /* POSIX or -i */ if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { type = USTYPE_PSX_GRP; if (!cb->cb_numname) { struct group *g; if ((g = getgrgid(rid)) != NULL) name = g->gr_name; } } else { type = USTYPE_PSX_USR; if (!cb->cb_numname) { struct passwd *p; if ((p = getpwuid(rid)) != NULL) name = p->pw_name; } } } /* * Make sure that the type/name combination is unique when doing * SID to POSIX ID translation (hence changing the type from SMB to * POSIX). 
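 * The "smbentity" boolean recorded below lets us_compare() keep an SMB
 * entry distinct from the POSIX entry it was translated into when both
 * end up with the same type and name.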
*/ if (cb->cb_sid2posix && nvlist_add_boolean_value(props, "smbentity", smbentity) != 0) nomem(); /* Calculate/update width of TYPE field */ typestr = us_type2str(type); typelen = strlen(gettext(typestr)); typeidx = us_field_index("type"); if (typelen > cb->cb_width[typeidx]) cb->cb_width[typeidx] = typelen; if (nvlist_add_uint32(props, "type", type) != 0) nomem(); /* Calculate/update width of NAME field */ if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) { if (nvlist_add_uint64(props, "name", rid) != 0) nomem(); namelen = snprintf(NULL, 0, "%u", rid); } else { if (nvlist_add_string(props, "name", name) != 0) nomem(); namelen = strlen(name); } nameidx = us_field_index("name"); if (namelen > cb->cb_width[nameidx]) cb->cb_width[nameidx] = namelen; /* * Check if this type/name combination is in the list and update it; * otherwise add new node to the list. */ if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) { uu_avl_insert(avl, node, idx); } else { nvlist_free(props); free(node); node = n; props = node->usn_nvl; } /* Calculate/update width of USED/QUOTA fields */ if (cb->cb_nicenum) zfs_nicenum(space, sizebuf, sizeof (sizebuf)); else (void) snprintf(sizebuf, sizeof (sizebuf), "%llu", space); sizelen = strlen(sizebuf); if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED) { propname = "used"; if (!nvlist_exists(props, "quota")) (void) nvlist_add_uint64(props, "quota", 0); } else { propname = "quota"; if (!nvlist_exists(props, "used")) (void) nvlist_add_uint64(props, "used", 0); } sizeidx = us_field_index(propname); if (sizelen > cb->cb_width[sizeidx]) cb->cb_width[sizeidx] = sizelen; if (nvlist_add_uint64(props, propname, space) != 0) nomem(); return (0); } static void print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, us_node_t *node) { nvlist_t *nvl = node->usn_nvl; char valstr[MAXNAMELEN]; boolean_t first = B_TRUE; int cfield = 0; int field; uint32_t ustype; /* Check type */ (void) nvlist_lookup_uint32(nvl, "type", &ustype); if (!(ustype & types)) return; while ((field = fields[cfield]) != USFIELD_LAST) { nvpair_t *nvp = NULL; data_type_t type; uint32_t val32; uint64_t val64; char *strval = NULL; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { if (strcmp(nvpair_name(nvp), us_field_names[field]) == 0) break; } type = nvpair_type(nvp); switch (type) { case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &val32); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &val64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &strval); break; default: (void) fprintf(stderr, "invalid data type\n"); } switch (field) { case USFIELD_TYPE: strval = (char *)us_type2str(val32); break; case USFIELD_NAME: if (type == DATA_TYPE_UINT64) { (void) sprintf(valstr, "%llu", val64); strval = valstr; } break; case USFIELD_USED: case USFIELD_QUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", val64); } else { zfs_nicenum(val64, valstr, sizeof (valstr)); } if (field == USFIELD_QUOTA && strcmp(valstr, "0") == 0) strval = "none"; else strval = valstr; } break; } if (!first) { if (scripted) (void) printf("\t"); else (void) printf(" "); } if (scripted) (void) printf("%s", strval); else if (field == USFIELD_TYPE || field == USFIELD_NAME) (void) printf("%-*s", width[field], strval); else (void) printf("%*s", width[field], strval); first = B_FALSE; cfield++; } (void) printf("\n"); } static void print_us(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, boolean_t 
rmnode, uu_avl_t *avl) { us_node_t *node; const char *col; int cfield = 0; int field; if (!scripted) { boolean_t first = B_TRUE; while ((field = fields[cfield]) != USFIELD_LAST) { col = gettext(us_field_hdr[field]); if (field == USFIELD_TYPE || field == USFIELD_NAME) { (void) printf(first ? "%-*s" : " %-*s", width[field], col); } else { (void) printf(first ? "%*s" : " %*s", width[field], col); } first = B_FALSE; cfield++; } (void) printf("\n"); } for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) { print_us_node(scripted, parsable, fields, types, width, node); if (rmnode) nvlist_free(node->usn_nvl); } } static int zfs_do_userspace(int argc, char **argv) { zfs_handle_t *zhp; zfs_userquota_prop_t p; uu_avl_pool_t *avl_pool; uu_avl_t *avl_tree; uu_avl_walk_t *walk; char *delim; char deffields[] = "type,name,used,quota"; char *ofield = NULL; char *tfield = NULL; int cfield = 0; int fields[256]; int i; boolean_t scripted = B_FALSE; boolean_t prtnum = B_FALSE; boolean_t parsable = B_FALSE; boolean_t sid2posix = B_FALSE; int ret = 0; int c; zfs_sort_column_t *sortcol = NULL; int types = USTYPE_PSX_USR | USTYPE_SMB_USR; us_cbdata_t cb; us_node_t *node; us_node_t *rmnode; uu_list_pool_t *listpool; uu_list_t *list; uu_avl_index_t idx = 0; uu_list_index_t idx2 = 0; if (argc < 2) usage(B_FALSE); if (strcmp(argv[0], "groupspace") == 0) /* Toggle default group types */ types = USTYPE_PSX_GRP | USTYPE_SMB_GRP; while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { switch (c) { case 'n': prtnum = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'o': ofield = optarg; break; case 's': case 'S': if (zfs_add_sort_column(&sortcol, optarg, c == 's' ? B_FALSE : B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid field '%s'\n"), optarg); usage(B_FALSE); } break; case 't': tfield = optarg; break; case 'i': sid2posix = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* Use default output fields if not specified using -o */ if (ofield == NULL) ofield = deffields; do { if ((delim = strchr(ofield, ',')) != NULL) *delim = '\0'; if ((fields[cfield++] = us_field_index(ofield)) == -1) { (void) fprintf(stderr, gettext("invalid type '%s' " "for -o option\n"), ofield); return (-1); } if (delim != NULL) ofield = delim + 1; } while (delim != NULL); fields[cfield] = USFIELD_LAST; /* Override output types (-t option) */ if (tfield != NULL) { types = 0; do { boolean_t found = B_FALSE; if ((delim = strchr(tfield, ',')) != NULL) *delim = '\0'; for (i = 0; i < sizeof (us_type_bits) / sizeof (int); i++) { if (strcmp(tfield, us_type_names[i]) == 0) { found = B_TRUE; types |= us_type_bits[i]; break; } } if (!found) { (void) fprintf(stderr, gettext("invalid type " "'%s' for -t option\n"), tfield); return (-1); } if (delim != NULL) tfield = delim + 1; } while (delim != NULL); } if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) return (1); if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL) nomem(); if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) nomem(); /* Always add 
default sorting columns */ (void) zfs_add_sort_column(&sortcol, "type", B_FALSE); (void) zfs_add_sort_column(&sortcol, "name", B_FALSE); cb.cb_sortcol = sortcol; cb.cb_numname = prtnum; cb.cb_nicenum = !parsable; cb.cb_avl_pool = avl_pool; cb.cb_avl = avl_tree; cb.cb_sid2posix = sid2posix; for (i = 0; i < USFIELD_LAST; i++) cb.cb_width[i] = strlen(gettext(us_field_hdr[i])); for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { if (((p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA) && !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) || ((p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) && !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP)))) continue; cb.cb_prop = p; if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) return (ret); } /* Sort the list */ if ((node = uu_avl_first(avl_tree)) == NULL) return (0); us_populated = B_TRUE; listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT); list = uu_list_create(listpool, NULL, UU_DEFAULT); uu_list_node_init(node, &node->usn_listnode, listpool); while (node != NULL) { rmnode = node; node = uu_avl_next(avl_tree, node); uu_avl_remove(avl_tree, rmnode); if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) uu_list_insert(list, rmnode, idx2); } for (node = uu_list_first(list); node != NULL; node = uu_list_next(list, node)) { us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL) uu_avl_insert(avl_tree, node, idx); } uu_list_destroy(list); uu_list_pool_destroy(listpool); /* Print and free node nvlist memory */ print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE, cb.cb_avl); zfs_free_sort_columns(sortcol); /* Clean up the AVL tree */ if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(cb.cb_avl, node); free(node); } uu_avl_walk_end(walk); uu_avl_destroy(avl_tree); uu_avl_pool_destroy(avl_pool); return (ret); } /* * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] ... * [-t type[,...]] [filesystem|volume|snapshot] ... * * -H Scripted mode; elide headers and separate columns by tabs. * -p Display values in parsable (literal) format. * -r Recurse over all children. * -d Limit recursion by depth. * -o Control which fields to display. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * When given no arguments, list all filesystems in the system. * Otherwise, list the specified datasets, optionally recursing down them if * '-r' is specified. */ typedef struct list_cbdata { boolean_t cb_first; boolean_t cb_literal; boolean_t cb_scripted; zprop_list_t *cb_proplist; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. 
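 * Right-justified property columns are padded to their pl_width; the final
 * column, when left-justified, is printed without trailing padding.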
*/ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZFS_MAXPROPLEN]; const char *header; int i; boolean_t first = B_TRUE; boolean_t right_justify; for (; pl != NULL; pl = pl->pl_next) { if (!first) { (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { header = zfs_prop_column_name(pl->pl_prop); right_justify = zfs_prop_align_right(pl->pl_prop); } else { for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", pl->pl_width, header); else (void) printf("%-*s", pl->pl_width, header); } (void) printf("\n"); } /* * Given a dataset and a list of fields, print out all the properties according * to the described layout. */ static void print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZFS_MAXPROPLEN]; nvlist_t *userprops = zfs_get_user_props(zhp); nvlist_t *propval; char *propstr; boolean_t right_justify; for (; pl != NULL; pl = pl->pl_next) { if (!first) { if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); } else { first = B_FALSE; } if (pl->pl_prop == ZFS_PROP_NAME) { (void) strlcpy(property, zfs_get_name(zhp), sizeof (property)); propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (pl->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, pl->pl_prop, property, sizeof (property), NULL, NULL, 0, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (zfs_prop_userquota(pl->pl_user_prop)) { if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = B_TRUE; } else if (zfs_prop_written(pl->pl_user_prop)) { if (zfs_prop_get_written(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = B_TRUE; } else { if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop, &propval) != 0) propstr = "-"; else verify(nvlist_lookup_string(propval, ZPROP_VALUE, &propstr) == 0); right_justify = B_FALSE; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", pl->pl_width, propstr); else (void) printf("%-*s", pl->pl_width, propstr); } (void) printf("\n"); } /* * Generic callback function to list a dataset or snapshot. 
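 * The header row is emitted once, before the first dataset, and is skipped
 * entirely in scripted (-H) mode.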
*/ static int list_callback(zfs_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; if (cbp->cb_first) { if (!cbp->cb_scripted) print_header(cbp); cbp->cb_first = B_FALSE; } print_dataset(zhp, cbp); return (0); } static int zfs_do_list(int argc, char **argv) { int c; static char default_fields[] = "name,used,available,referenced,mountpoint"; int types = ZFS_TYPE_DATASET; boolean_t types_specified = B_FALSE; char *fields = NULL; list_cbdata_t cb = { 0 }; char *value; int limit = 0; int ret = 0; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) { switch (c) { case 'o': fields = optarg; break; case 'p': cb.cb_literal = B_TRUE; flags |= ZFS_ITER_LITERAL_PROPS; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 's': if (zfs_add_sort_column(&sortcol, optarg, B_FALSE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 'S': if (zfs_add_sort_column(&sortcol, optarg, B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 't': types = 0; types_specified = B_TRUE; flags &= ~ZFS_ITER_PROP_LISTSNAPS; while (*optarg != '\0') { static char *type_subopts[] = { "filesystem", "volume", "snapshot", "snap", "bookmark", "all", NULL }; switch (getsubopt(&optarg, type_subopts, &value)) { case 0: types |= ZFS_TYPE_FILESYSTEM; break; case 1: types |= ZFS_TYPE_VOLUME; break; case 2: case 3: types |= ZFS_TYPE_SNAPSHOT; break; case 4: types |= ZFS_TYPE_BOOKMARK; break; case 5: types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; break; default: (void) fprintf(stderr, gettext("invalid type '%s'\n"), suboptarg); usage(B_FALSE); } } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (fields == NULL) fields = default_fields; /* * If we are only going to list snapshot names and sort by name, * then we can use faster version. */ if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol)) flags |= ZFS_ITER_SIMPLE; /* * If "-o space" and no types were specified, don't display snapshots. */ if (strcmp(fields, "space") == 0 && types_specified == B_FALSE) types &= ~ZFS_TYPE_SNAPSHOT; /* * If the user specifies '-o all', the zprop_get_list() doesn't * normally include the name of the dataset. For 'zfs list', we always * want this property to be first. */ if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); cb.cb_first = B_TRUE; ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, limit, list_callback, &cb); zprop_free_list(cb.cb_proplist); zfs_free_sort_columns(sortcol); if (ret == 0 && cb.cb_first && !cb.cb_scripted) (void) printf(gettext("no datasets available\n")); return (ret); } /* * zfs rename [-f] * zfs rename [-f] -p * zfs rename -r * zfs rename -u [-p] * * Renames the given dataset to another of the same type. * * The '-p' flag creates all the non-existing ancestors of the target first. 
*/ /* ARGSUSED */ static int zfs_do_rename(int argc, char **argv) { zfs_handle_t *zhp; renameflags_t flags = { 0 }; int c; int ret = 0; int types; boolean_t parents = B_FALSE; char *snapshot = NULL; /* check options */ while ((c = getopt(argc, argv, "fpru")) != -1) { switch (c) { case 'p': parents = B_TRUE; break; case 'r': flags.recurse = B_TRUE; break; case 'u': flags.nounmount = B_TRUE; break; case 'f': flags.forceunmount = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (flags.recurse && parents) { (void) fprintf(stderr, gettext("-p and -r options are mutually " "exclusive\n")); usage(B_FALSE); } if (flags.recurse && strchr(argv[0], '@') == 0) { (void) fprintf(stderr, gettext("source dataset for recursive " "rename must be a snapshot\n")); usage(B_FALSE); } if (flags.nounmount && parents) { (void) fprintf(stderr, gettext("-u and -p options are mutually " "exclusive\n")); usage(B_FALSE); } if (flags.nounmount) types = ZFS_TYPE_FILESYSTEM; else if (parents) types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; else types = ZFS_TYPE_DATASET; if (flags.recurse) { /* * When we do recursive rename we are fine when the given * snapshot for the given dataset doesn't exist - it can * still exists below. */ snapshot = strchr(argv[0], '@'); assert(snapshot != NULL); *snapshot = '\0'; snapshot++; } if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) return (1); /* If we were asked and the name looks good, try to create ancestors. */ if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) && zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); return (1); } ret = (zfs_rename(zhp, snapshot, argv[1], flags) != 0); zfs_close(zhp); return (ret); } /* * zfs promote * * Promotes the given clone fs to be the parent */ /* ARGSUSED */ static int zfs_do_promote(int argc, char **argv) { zfs_handle_t *zhp; int ret = 0; /* check options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing clone filesystem" " argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); ret = (zfs_promote(zhp) != 0); zfs_close(zhp); return (ret); } /* * zfs rollback [-rRf] * * -r Delete any intervening snapshots before doing rollback * -R Delete any snapshots and their clones * -f ignored for backwards compatability * * Given a filesystem, rollback to a specific snapshot, discarding any changes * since then and making it the active dataset. If more recent snapshots exist, * the command will complain unless the '-r' flag is given. 
*/ typedef struct rollback_cbdata { uint64_t cb_create; boolean_t cb_first; int cb_doclones; char *cb_target; int cb_error; boolean_t cb_recurse; } rollback_cbdata_t; static int rollback_check_dependent(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; if (cbp->cb_first && cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot rollback to " "'%s': clones of previous snapshots exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-R' to " "force deletion of the following clones and " "dependents:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); zfs_close(zhp); return (0); } /* * Report any snapshots more recent than the one specified. Used when '-r' is * not specified. We reuse this same callback for the snapshot dependents - if * 'cb_dependent' is set, then this is a dependent and we should report it * without checking the transaction group. */ static int rollback_check(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; if (cbp->cb_doclones) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { if (cbp->cb_first && !cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot " "rollback to '%s': more recent snapshots " "or bookmarks exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-r' to " "force deletion of the following " "snapshots and bookmarks:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } if (cbp->cb_recurse) { if (zfs_iter_dependents(zhp, B_TRUE, rollback_check_dependent, cbp) != 0) { zfs_close(zhp); return (-1); } } else { (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } } zfs_close(zhp); return (0); } static int zfs_do_rollback(int argc, char **argv) { int ret = 0; int c; boolean_t force = B_FALSE; rollback_cbdata_t cb = { 0 }; zfs_handle_t *zhp, *snap; char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *delim; /* check options */ while ((c = getopt(argc, argv, "rRf")) != -1) { switch (c) { case 'r': cb.cb_recurse = 1; break; case 'R': cb.cb_recurse = 1; cb.cb_doclones = 1; break; case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* open the snapshot */ if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) return (1); /* open the parent dataset */ (void) strlcpy(parentname, argv[0], sizeof (parentname)); verify((delim = strrchr(parentname, '@')) != NULL); *delim = '\0'; if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) { zfs_close(snap); return (1); } /* * Check for more recent snapshots and/or clones based on the presence * of '-r' and '-R'. */ cb.cb_target = argv[0]; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); cb.cb_first = B_TRUE; cb.cb_error = 0; if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb)) != 0) goto out; if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0) goto out; if ((ret = cb.cb_error) != 0) goto out; /* * Rollback parent to the given snapshot. */ ret = zfs_rollback(zhp, snap, force); out: zfs_close(snap); zfs_close(zhp); if (ret == 0) return (0); else return (1); } /* * zfs set property=value ... { fs | snap | vol } ... * * Sets the given properties for all datasets specified on the command line. 
*/ static int set_callback(zfs_handle_t *zhp, void *data) { nvlist_t *props = data; if (zfs_prop_set_list(zhp, props) != 0) { switch (libzfs_errno(g_zfs)) { case EZFS_MOUNTFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to remount filesystem\n")); break; case EZFS_SHARENFSFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to reshare filesystem\n")); break; } return (1); } return (0); } static int zfs_do_set(int argc, char **argv) { nvlist_t *props = NULL; int ds_start = -1; /* argv idx of first dataset arg */ int ret = 0; /* check for options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing arguments\n")); usage(B_FALSE); } if (argc < 3) { if (strchr(argv[1], '=') == NULL) { (void) fprintf(stderr, gettext("missing property=value " "argument(s)\n")); } else { (void) fprintf(stderr, gettext("missing dataset " "name(s)\n")); } usage(B_FALSE); } /* validate argument order: prop=val args followed by dataset args */ for (int i = 1; i < argc; i++) { if (strchr(argv[i], '=') != NULL) { if (ds_start > 0) { /* out-of-order prop=val argument */ (void) fprintf(stderr, gettext("invalid " "argument order\n"), i); usage(B_FALSE); } } else if (ds_start < 0) { ds_start = i; } } if (ds_start < 0) { (void) fprintf(stderr, gettext("missing dataset name(s)\n")); usage(B_FALSE); } /* Populate a list of property settings */ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); for (int i = 1; i < ds_start; i++) { if ((ret = parseprop(props, argv[i])) != 0) goto error; } ret = zfs_for_each(argc - ds_start, argv + ds_start, 0, ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props); error: nvlist_free(props); return (ret); } typedef struct snap_cbdata { nvlist_t *sd_nvl; boolean_t sd_recursive; const char *sd_snapname; } snap_cbdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snap_cbdata_t *sd = arg; char *name; int rv = 0; int error; if (sd->sd_recursive && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { zfs_close(zhp); return (0); } error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); if (error == -1) nomem(); fnvlist_add_boolean(sd->sd_nvl, name); free(name); if (sd->sd_recursive) rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); zfs_close(zhp); return (rv); } /* * zfs snapshot [-r] [-o prop=value] ... * * Creates a snapshot with the given name. While functionally equivalent to * 'zfs create', it is a separate command to differentiate intent. 
*/ static int zfs_do_snapshot(int argc, char **argv) { int ret = 0; int c; nvlist_t *props; snap_cbdata_t sd = { 0 }; boolean_t multiple_snaps = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "ro:")) != -1) { switch (c) { case 'o': if (parseprop(props, optarg) != 0) return (1); break; case 'r': sd.sd_recursive = B_TRUE; multiple_snaps = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); goto usage; } if (argc > 1) multiple_snaps = B_TRUE; for (; argc > 0; argc--, argv++) { char *atp; zfs_handle_t *zhp; atp = strchr(argv[0], '@'); if (atp == NULL) goto usage; *atp = '\0'; sd.sd_snapname = atp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) goto usage; if (zfs_snapshot_cb(zhp, &sd) != 0) goto usage; } ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); nvlist_free(sd.sd_nvl); nvlist_free(props); if (ret != 0 && multiple_snaps) (void) fprintf(stderr, gettext("no snapshots were created\n")); return (ret != 0); usage: nvlist_free(sd.sd_nvl); nvlist_free(props); usage(B_FALSE); return (-1); } /* * Send a backup stream to stdout. */ static int zfs_do_send(int argc, char **argv) { char *fromname = NULL; char *toname = NULL; char *resume_token = NULL; char *cp; zfs_handle_t *zhp; sendflags_t flags = { 0 }; int c, err; nvlist_t *dbgnv = NULL; boolean_t extraverbose = B_FALSE; struct option long_options[] = { {"replicate", no_argument, NULL, 'R'}, {"props", no_argument, NULL, 'p'}, {"parsable", no_argument, NULL, 'P'}, {"dedup", no_argument, NULL, 'D'}, {"verbose", no_argument, NULL, 'v'}, {"dryrun", no_argument, NULL, 'n'}, {"large-block", no_argument, NULL, 'L'}, {"embed", no_argument, NULL, 'e'}, {"resume", required_argument, NULL, 't'}, {"compressed", no_argument, NULL, 'c'}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":i:I:RbDpVvnPLet:c", long_options, NULL)) != -1) { switch (c) { case 'i': if (fromname) usage(B_FALSE); fromname = optarg; break; case 'I': if (fromname) usage(B_FALSE); fromname = optarg; flags.doall = B_TRUE; break; case 'R': flags.replicate = B_TRUE; break; case 'p': flags.props = B_TRUE; break; case 'P': flags.parsable = B_TRUE; flags.verbose = B_TRUE; break; case 'V': flags.progress = B_TRUE; flags.progressastitle = B_TRUE; break; case 'v': if (flags.verbose) extraverbose = B_TRUE; flags.verbose = B_TRUE; flags.progress = B_TRUE; break; case 'D': flags.dedup = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'L': flags.largeblock = B_TRUE; break; case 'e': flags.embed_data = B_TRUE; break; case 't': resume_token = optarg; break; case 'c': flags.compress = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': /*FALLTHROUGH*/ default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (resume_token != NULL) { if (fromname != NULL || flags.replicate || flags.props || flags.dedup) { (void) fprintf(stderr, gettext("invalid flags combined with -t\n")); usage(B_FALSE); } if (argc != 0) { (void) fprintf(stderr, gettext("no additional " "arguments are permitted with -t\n")); usage(B_FALSE); } } else { if (argc < 1) { (void) 
fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } if (!flags.dryrun && isatty(STDOUT_FILENO)) { (void) fprintf(stderr, gettext("Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n")); return (1); } if (resume_token != NULL) { return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, resume_token)); } /* * Special case sending a filesystem, or from a bookmark. */ if (strchr(argv[0], '@') == NULL || (fromname && strchr(fromname, '#') != NULL)) { char frombuf[ZFS_MAX_DATASET_NAME_LEN]; - enum lzc_send_flags lzc_flags = 0; if (flags.replicate || flags.doall || flags.props || - flags.dedup || flags.dryrun || flags.verbose || - flags.progress) { - (void) fprintf(stderr, - gettext("Error: " + flags.dedup || (strchr(argv[0], '@') == NULL && + (flags.dryrun || flags.verbose || flags.progress))) { + (void) fprintf(stderr, gettext("Error: " "Unsupported flag with filesystem or bookmark.\n")); return (1); } zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); - if (flags.largeblock) - lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; - if (flags.embed_data) - lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; - if (flags.compress) - lzc_flags |= LZC_SEND_FLAG_COMPRESS; - if (fromname != NULL && (fromname[0] == '#' || fromname[0] == '@')) { /* * Incremental source name begins with # or @. * Default to same fs as target. */ (void) strncpy(frombuf, argv[0], sizeof (frombuf)); cp = strchr(frombuf, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(frombuf, fromname, sizeof (frombuf)); fromname = frombuf; } - err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags); + err = zfs_send_one(zhp, fromname, STDOUT_FILENO, flags); zfs_close(zhp); return (err != 0); } cp = strchr(argv[0], '@'); *cp = '\0'; toname = cp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); /* * If they specified the full path to the snapshot, chop off * everything except the short name of the snapshot, but special * case if they specify the origin. */ if (fromname && (cp = strchr(fromname, '@')) != NULL) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE); if (strcmp(origin, fromname) == 0) { fromname = NULL; flags.fromorigin = B_TRUE; } else { *cp = '\0'; if (cp != fromname && strcmp(argv[0], fromname)) { (void) fprintf(stderr, gettext("incremental source must be " "in same filesystem\n")); usage(B_FALSE); } fromname = cp + 1; if (strchr(fromname, '@') || strchr(fromname, '/')) { (void) fprintf(stderr, gettext("invalid incremental source\n")); usage(B_FALSE); } } } if (flags.replicate && fromname == NULL) flags.doall = B_TRUE; err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0, extraverbose ? &dbgnv : NULL); if (extraverbose && dbgnv != NULL) { /* * dump_nvlist prints to stdout, but that's been * redirected to a file. Make it print to stderr * instead. */ (void) dup2(STDERR_FILENO, STDOUT_FILENO); dump_nvlist(dbgnv, 0); nvlist_free(dbgnv); } zfs_close(zhp); return (err != 0); } /* * Restore a backup stream from stdin. 
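 * Typically used as the receiving end of a pipe, e.g. (illustrative):
 *	zfs send tank/data@snap | zfs receive -u backup/data
 * The stream cannot be read from a terminal, so stdin must be redirected.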
*/ static int zfs_do_receive(int argc, char **argv) { int c, err = 0; recvflags_t flags = { 0 }; boolean_t abort_resumable = B_FALSE; nvlist_t *props; nvpair_t *nvp = NULL; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":o:denuvFsA")) != -1) { switch (c) { case 'o': if (parseprop(props, optarg) != 0) return (1); break; case 'd': flags.isprefix = B_TRUE; break; case 'e': flags.isprefix = B_TRUE; flags.istail = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'u': flags.nomount = B_TRUE; break; case 'v': flags.verbose = B_TRUE; break; case 's': flags.resumable = B_TRUE; break; case 'F': flags.force = B_TRUE; break; case 'A': abort_resumable = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } while ((nvp = nvlist_next_nvpair(props, nvp))) { if (strcmp(nvpair_name(nvp), "origin") != 0) { (void) fprintf(stderr, gettext("invalid option")); usage(B_FALSE); } } if (abort_resumable) { if (flags.isprefix || flags.istail || flags.dryrun || flags.resumable || flags.nomount) { (void) fprintf(stderr, gettext("invalid option")); usage(B_FALSE); } char namebuf[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(namebuf, sizeof (namebuf), "%s/%%recv", argv[0]); if (zfs_dataset_exists(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_handle_t *zhp = zfs_open(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); err = zfs_destroy(zhp, B_FALSE); } else { zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { (void) fprintf(stderr, gettext("'%s' does not have any " "resumable receive state to abort\n"), argv[0]); return (1); } err = zfs_destroy(zhp, B_FALSE); } return (err != 0); } if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, gettext("Error: Backup stream can not be read " "from a terminal.\n" "You must redirect standard input.\n")); return (1); } err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); return (err != 0); } /* * allow/unallow stuff */ /* copied from zfs/sys/dsl_deleg.h */ #define ZFS_DELEG_PERM_CREATE "create" #define ZFS_DELEG_PERM_DESTROY "destroy" #define ZFS_DELEG_PERM_SNAPSHOT "snapshot" #define ZFS_DELEG_PERM_ROLLBACK "rollback" #define ZFS_DELEG_PERM_CLONE "clone" #define ZFS_DELEG_PERM_PROMOTE "promote" #define ZFS_DELEG_PERM_RENAME "rename" #define ZFS_DELEG_PERM_MOUNT "mount" #define ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? 
*/ #define ZFS_DELEG_PERM_USERQUOTA "userquota" #define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" #define ZFS_DELEG_PERM_USERUSED "userused" #define ZFS_DELEG_PERM_GROUPUSED "groupused" #define ZFS_DELEG_PERM_HOLD "hold" #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" #define ZFS_DELEG_PERM_REMAP "remap" #define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, { ZFS_DELEG_PERM_REMAP, ZFS_DELEG_NOTE_REMAP }, { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, { NULL, ZFS_DELEG_NOTE_NONE } }; /* permission structure */ typedef struct deleg_perm { zfs_deleg_who_type_t dp_who_type; const char *dp_name; boolean_t dp_local; boolean_t dp_descend; } deleg_perm_t; /* */ typedef struct deleg_perm_node { deleg_perm_t dpn_perm; uu_avl_node_t dpn_avl_node; } deleg_perm_node_t; typedef struct fs_perm fs_perm_t; /* permissions set */ typedef struct who_perm { zfs_deleg_who_type_t who_type; const char *who_name; /* id */ char who_ug_name[256]; /* user/group name */ fs_perm_t *who_fsperm; /* uplink */ uu_avl_t *who_deleg_perm_avl; /* permissions */ } who_perm_t; /* */ typedef struct who_perm_node { who_perm_t who_perm; uu_avl_node_t who_avl_node; } who_perm_node_t; typedef struct fs_perm_set fs_perm_set_t; /* fs permissions */ struct fs_perm { const char *fsp_name; uu_avl_t *fsp_sc_avl; /* sets,create */ uu_avl_t *fsp_uge_avl; /* user,group,everyone */ fs_perm_set_t *fsp_set; /* uplink */ }; /* */ typedef struct fs_perm_node { fs_perm_t fspn_fsperm; uu_avl_t *fspn_avl; uu_list_node_t fspn_list_node; } fs_perm_node_t; /* top level structure */ struct fs_perm_set { uu_list_pool_t *fsps_list_pool; uu_list_t *fsps_list; /* list of fs_perms */ uu_avl_pool_t *fsps_named_set_avl_pool; uu_avl_pool_t *fsps_who_perm_avl_pool; uu_avl_pool_t *fsps_deleg_perm_avl_pool; }; static inline const char * deleg_perm_type(zfs_deleg_note_t note) { /* subcommands */ switch (note) { /* SUBCOMMANDS */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: case ZFS_DELEG_NOTE_GROUPUSED: case ZFS_DELEG_NOTE_USERPROP: case ZFS_DELEG_NOTE_USERQUOTA: case ZFS_DELEG_NOTE_USERUSED: /* other */ return (gettext("other")); default: return (gettext("subcommand")); } } static int who_type2weight(zfs_deleg_who_type_t who_type) { int res; switch (who_type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: res = 0; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: res = 1; break; case ZFS_DELEG_USER_SETS: case 
ZFS_DELEG_USER: res = 2; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: res = 3; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: res = 4; break; default: res = -1; } return (res); } /* ARGSUSED */ static int who_perm_compare(const void *larg, const void *rarg, void *unused) { const who_perm_node_t *l = larg; const who_perm_node_t *r = rarg; zfs_deleg_who_type_t ltype = l->who_perm.who_type; zfs_deleg_who_type_t rtype = r->who_perm.who_type; int lweight = who_type2weight(ltype); int rweight = who_type2weight(rtype); int res = lweight - rweight; if (res == 0) res = strncmp(l->who_perm.who_name, r->who_perm.who_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } /* ARGSUSED */ static int deleg_perm_compare(const void *larg, const void *rarg, void *unused) { const deleg_perm_node_t *l = larg; const deleg_perm_node_t *r = rarg; int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } static inline void fs_perm_set_init(fs_perm_set_t *fspset) { bzero(fspset, sizeof (fs_perm_set_t)); if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT)) == NULL) nomem(); } static inline void fs_perm_fini(fs_perm_t *); static inline void who_perm_fini(who_perm_t *); static inline void fs_perm_set_fini(fs_perm_set_t *fspset) { fs_perm_node_t *node = uu_list_first(fspset->fsps_list); while (node != NULL) { fs_perm_node_t *next_node = uu_list_next(fspset->fsps_list, node); fs_perm_t *fsperm = &node->fspn_fsperm; fs_perm_fini(fsperm); uu_list_remove(fspset->fsps_list, node); free(node); node = next_node; } uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); } static inline void deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, const char *name) { deleg_perm->dp_who_type = type; deleg_perm->dp_name = name; } static inline void who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, zfs_deleg_who_type_t type, const char *name) { uu_avl_pool_t *pool; pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; bzero(who_perm, sizeof (who_perm_t)); if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) nomem(); who_perm->who_type = type; who_perm->who_name = name; who_perm->who_fsperm = fsperm; } static inline void who_perm_fini(who_perm_t *who_perm) { deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); while (node != NULL) { deleg_perm_node_t *next_node = uu_avl_next(who_perm->who_deleg_perm_avl, node); uu_avl_remove(who_perm->who_deleg_perm_avl, node); free(node); node = next_node; } 
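	/* All delegated-permission nodes have been freed; tear down the AVL itself. */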
uu_avl_destroy(who_perm->who_deleg_perm_avl); } static inline void fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) { uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; bzero(fsperm, sizeof (fs_perm_t)); if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) == NULL) nomem(); fsperm->fsp_set = fspset; fsperm->fsp_name = fsname; } static inline void fs_perm_fini(fs_perm_t *fsperm) { who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_sc_avl, node); free(node); node = next_node; } node = uu_avl_first(fsperm->fsp_uge_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_uge_avl, node); free(node); node = next_node; } uu_avl_destroy(fsperm->fsp_sc_avl); uu_avl_destroy(fsperm->fsp_uge_avl); } static void set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, zfs_deleg_who_type_t who_type, const char *name, char locality) { uu_avl_index_t idx = 0; deleg_perm_node_t *found_node = NULL; deleg_perm_t *deleg_perm = &node->dpn_perm; deleg_perm_init(deleg_perm, who_type, name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) uu_avl_insert(avl, node, idx); else { node = found_node; deleg_perm = &node->dpn_perm; } switch (locality) { case ZFS_DELEG_LOCAL: deleg_perm->dp_local = B_TRUE; break; case ZFS_DELEG_DESCENDENT: deleg_perm->dp_descend = B_TRUE; break; case ZFS_DELEG_NA: break; default: assert(B_FALSE); /* invalid locality */ } } static inline int parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; uu_avl_t *avl = who_perm->who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_perm->who_type; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *name = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; deleg_perm_node_t *node = safe_malloc(sizeof (deleg_perm_node_t)); assert(type == DATA_TYPE_BOOLEAN); uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); set_deleg_perm_node(avl, node, who_type, name, locality); } return (0); } static inline int parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = fsperm->fsp_set; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *name = nvpair_name(nvp); uu_avl_t *avl = NULL; uu_avl_pool_t *avl_pool = NULL; zfs_deleg_who_type_t perm_type = name[0]; char perm_locality = name[1]; const char *perm_name = name + 3; boolean_t is_set = B_TRUE; who_perm_t *who_perm = NULL; assert('$' == name[2]); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); switch (perm_type) { case ZFS_DELEG_CREATE: case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_NAMED_SET: case ZFS_DELEG_NAMED_SET_SETS: avl_pool = fspset->fsps_named_set_avl_pool; avl = fsperm->fsp_sc_avl; break; case ZFS_DELEG_USER: case ZFS_DELEG_USER_SETS: case ZFS_DELEG_GROUP: case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_EVERYONE: case ZFS_DELEG_EVERYONE_SETS: avl_pool = fspset->fsps_who_perm_avl_pool; avl = fsperm->fsp_uge_avl; break; default: 
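			/* Any other who-type in the fsacl nvlist is a programming error. */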
assert(!"unhandled zfs_deleg_who_type_t"); } if (is_set) { who_perm_node_t *found_node = NULL; who_perm_node_t *node = safe_malloc( sizeof (who_perm_node_t)); who_perm = &node->who_perm; uu_avl_index_t idx = 0; uu_avl_node_init(node, &node->who_avl_node, avl_pool); who_perm_init(who_perm, fsperm, perm_type, perm_name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) { if (avl == fsperm->fsp_uge_avl) { uid_t rid = 0; struct passwd *p = NULL; struct group *g = NULL; const char *nice_name = NULL; switch (perm_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: rid = atoi(perm_name); p = getpwuid(rid); if (p) nice_name = p->pw_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: rid = atoi(perm_name); g = getgrgid(rid); if (g) nice_name = g->gr_name; break; default: break; } if (nice_name != NULL) (void) strlcpy( node->who_perm.who_ug_name, nice_name, 256); } uu_avl_insert(avl, node, idx); } else { node = found_node; who_perm = &node->who_perm; } } (void) parse_who_perm(who_perm, nvl2, perm_locality); } return (0); } static inline int parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) { nvpair_t *nvp = NULL; uu_avl_index_t idx = 0; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *fsname = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); fs_perm_t *fsperm = NULL; fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); if (node == NULL) nomem(); fsperm = &node->fspn_fsperm; assert(DATA_TYPE_NVLIST == type); uu_list_node_init(node, &node->fspn_list_node, fspset->fsps_list_pool); idx = uu_list_numnodes(fspset->fsps_list); fs_perm_init(fsperm, fspset, fsname); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); (void) parse_fs_perm(fsperm, nvl2); uu_list_insert(fspset->fsps_list, node, idx); } return (0); } static inline const char * deleg_perm_comment(zfs_deleg_note_t note) { const char *str = ""; /* subcommands */ switch (note) { /* SUBCOMMANDS */ case ZFS_DELEG_NOTE_ALLOW: str = gettext("Must also have the permission that is being" "\n\t\t\t\tallowed"); break; case ZFS_DELEG_NOTE_CLONE: str = gettext("Must also have the 'create' ability and 'mount'" "\n\t\t\t\tability in the origin file system"); break; case ZFS_DELEG_NOTE_CREATE: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DESTROY: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DIFF: str = gettext("Allows lookup of paths within a dataset;" "\n\t\t\t\tgiven an object number. 
Ordinary users need this" "\n\t\t\t\tin order to use zfs diff"); break; case ZFS_DELEG_NOTE_HOLD: str = gettext("Allows adding a user hold to a snapshot"); break; case ZFS_DELEG_NOTE_MOUNT: str = gettext("Allows mount/umount of ZFS datasets"); break; case ZFS_DELEG_NOTE_PROMOTE: str = gettext("Must also have the 'mount'\n\t\t\t\tand" " 'promote' ability in the origin file system"); break; case ZFS_DELEG_NOTE_RECEIVE: str = gettext("Must also have the 'mount' and 'create'" " ability"); break; case ZFS_DELEG_NOTE_RELEASE: str = gettext("Allows releasing a user hold which\n\t\t\t\t" "might destroy the snapshot"); break; case ZFS_DELEG_NOTE_RENAME: str = gettext("Must also have the 'mount' and 'create'" "\n\t\t\t\tability in the new parent"); break; case ZFS_DELEG_NOTE_ROLLBACK: str = gettext(""); break; case ZFS_DELEG_NOTE_SEND: str = gettext(""); break; case ZFS_DELEG_NOTE_SHARE: str = gettext("Allows sharing file systems over NFS or SMB" "\n\t\t\t\tprotocols"); break; case ZFS_DELEG_NOTE_SNAPSHOT: str = gettext(""); break; /* * case ZFS_DELEG_NOTE_VSCAN: * str = gettext(""); * break; */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: str = gettext("Allows accessing any groupquota@... property"); break; case ZFS_DELEG_NOTE_GROUPUSED: str = gettext("Allows reading any groupused@... property"); break; case ZFS_DELEG_NOTE_USERPROP: str = gettext("Allows changing any user property"); break; case ZFS_DELEG_NOTE_USERQUOTA: str = gettext("Allows accessing any userquota@... property"); break; case ZFS_DELEG_NOTE_USERUSED: str = gettext("Allows reading any userused@... property"); break; /* other */ default: str = ""; } return (str); } struct allow_opts { boolean_t local; boolean_t descend; boolean_t user; boolean_t group; boolean_t everyone; boolean_t create; boolean_t set; boolean_t recursive; /* unallow only */ boolean_t prt_usage; boolean_t prt_perms; char *who; char *perms; const char *dataset; }; static inline int prop_cmp(const void *a, const void *b) { const char *str1 = *(const char **)a; const char *str2 = *(const char **)b; return (strcmp(str1, str2)); } static void allow_usage(boolean_t un, boolean_t requested, const char *msg) { const char *opt_desc[] = { "-h", gettext("show this help message and exit"), "-l", gettext("set permission locally"), "-d", gettext("set permission for descents"), "-u", gettext("set permission for user"), "-g", gettext("set permission for group"), "-e", gettext("set permission for everyone"), "-c", gettext("set create time permission"), "-s", gettext("define permission set"), /* unallow only */ "-r", gettext("remove permissions recursively"), }; size_t unallow_size = sizeof (opt_desc) / sizeof (char *); size_t allow_size = unallow_size - 2; const char *props[ZFS_NUM_PROPS]; int i; size_t count = 0; FILE *fp = requested ? stdout : stderr; zprop_desc_t *pdtbl = zfs_prop_get_table(); const char *fmt = gettext("%-16s %-14s\t%s\n"); (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : HELP_ALLOW)); (void) fprintf(fp, gettext("Options:\n")); for (i = 0; i < (un ? 
unallow_size : allow_size); i++) { const char *opt = opt_desc[i++]; const char *optdsc = opt_desc[i]; (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); } (void) fprintf(fp, gettext("\nThe following permissions are " "supported:\n\n")); (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), gettext("NOTES")); for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; const char *perm_type = deleg_perm_type(perm_note); const char *perm_comment = deleg_perm_comment(perm_note); (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); } for (i = 0; i < ZFS_NUM_PROPS; i++) { zprop_desc_t *pd = &pdtbl[i]; if (pd->pd_visible != B_TRUE) continue; if (pd->pd_attr == PROP_READONLY) continue; props[count++] = pd->pd_name; } props[count] = NULL; qsort(props, count, sizeof (char *), prop_cmp); for (i = 0; i < count; i++) (void) fprintf(fp, fmt, props[i], gettext("property"), ""); if (msg != NULL) (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); exit(requested ? 0 : 2); } static inline const char * munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, char **permsp) { if (un && argc == expected_argc - 1) *permsp = NULL; else if (argc == expected_argc) *permsp = argv[argc - 2]; else allow_usage(un, B_FALSE, gettext("wrong number of parameters\n")); return (argv[argc - 1]); } static void parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) { int uge_sum = opts->user + opts->group + opts->everyone; int csuge_sum = opts->create + opts->set + uge_sum; int ldcsuge_sum = csuge_sum + opts->local + opts->descend; int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum; if (uge_sum > 1) allow_usage(un, B_FALSE, gettext("-u, -g, and -e are mutually exclusive\n")); if (opts->prt_usage) { if (argc == 0 && all_sum == 0) allow_usage(un, B_TRUE, NULL); else usage(B_FALSE); } if (opts->set) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -s\n")); opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); if (argv[0][0] != '@') allow_usage(un, B_FALSE, gettext("invalid set name: missing '@' prefix\n")); opts->who = argv[0]; } else if (opts->create) { if (ldcsuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -c\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (opts->everyone) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -e\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") == 0) { opts->everyone = B_TRUE; argc--; argv++; opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (argc == 1 && !un) { opts->prt_perms = B_TRUE; opts->dataset = argv[argc-1]; } else { opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); opts->who = argv[0]; } if (!opts->local && !opts->descend) { opts->local = B_TRUE; opts->descend = B_TRUE; } } static void store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, const char *who, char *perms, nvlist_t *top_nvl) { int i; char ld[2] = { '\0', '\0' }; char who_buf[MAXNAMELEN + 32]; char base_type = '\0'; char set_type = '\0'; nvlist_t *base_nvl = NULL; nvlist_t *set_nvl = NULL; nvlist_t *nvl; if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); switch (type) { case ZFS_DELEG_NAMED_SET_SETS: case 
ZFS_DELEG_NAMED_SET: set_type = ZFS_DELEG_NAMED_SET_SETS; base_type = ZFS_DELEG_NAMED_SET; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: set_type = ZFS_DELEG_CREATE_SETS; base_type = ZFS_DELEG_CREATE; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: set_type = ZFS_DELEG_USER_SETS; base_type = ZFS_DELEG_USER; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: set_type = ZFS_DELEG_GROUP_SETS; base_type = ZFS_DELEG_GROUP; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: set_type = ZFS_DELEG_EVERYONE_SETS; base_type = ZFS_DELEG_EVERYONE; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; default: assert(set_type != '\0' && base_type != '\0'); } if (perms != NULL) { char *curr = perms; char *end = curr + strlen(perms); while (curr < end) { char *delim = strchr(curr, ','); if (delim == NULL) delim = end; else *delim = '\0'; if (curr[0] == '@') nvl = set_nvl; else nvl = base_nvl; (void) nvlist_add_boolean(nvl, curr); if (delim != end) *delim = ','; curr = delim + 1; } for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (!nvlist_empty(base_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, base_nvl); } if (!nvlist_empty(set_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, set_nvl); } } } else { for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); } } } static int construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) { if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) nomem(); if (opts->set) { store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, opts->descend, opts->who, opts->perms, *nvlp); } else if (opts->create) { store_allow_perm(ZFS_DELEG_CREATE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else if (opts->everyone) { store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else { char *curr = opts->who; char *end = curr + strlen(curr); while (curr < end) { const char *who; zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; char *endch; char *delim = strchr(curr, ','); char errbuf[256]; char id[64]; struct passwd *p = NULL; struct group *g = NULL; uid_t rid; if (delim == NULL) delim = end; else *delim = '\0'; rid = (uid_t)strtol(curr, &endch, 0); if (opts->user) { who_type = ZFS_DELEG_USER; if (*endch != '\0') p = getpwnam(curr); else p = getpwuid(rid); if (p != NULL) rid = p->pw_uid; else { (void) snprintf(errbuf, 256, gettext( "invalid user %s"), curr); allow_usage(un, B_TRUE, errbuf); } } else if (opts->group) { 
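			/* -g: a non-numeric argument is resolved as a group name, a numeric one as a GID. */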
who_type = ZFS_DELEG_GROUP; if (*endch != '\0') g = getgrnam(curr); else g = getgrgid(rid); if (g != NULL) rid = g->gr_gid; else { (void) snprintf(errbuf, 256, gettext( "invalid group %s"), curr); allow_usage(un, B_TRUE, errbuf); } } else { if (*endch != '\0') { p = getpwnam(curr); } else { p = getpwuid(rid); } if (p == NULL) { if (*endch != '\0') { g = getgrnam(curr); } else { g = getgrgid(rid); } } if (p != NULL) { who_type = ZFS_DELEG_USER; rid = p->pw_uid; } else if (g != NULL) { who_type = ZFS_DELEG_GROUP; rid = g->gr_gid; } else { (void) snprintf(errbuf, 256, gettext( "invalid user/group %s"), curr); allow_usage(un, B_TRUE, errbuf); } } (void) sprintf(id, "%u", rid); who = id; store_allow_perm(who_type, opts->local, opts->descend, who, opts->perms, *nvlp); curr = delim + 1; } } return (0); } static void print_set_creat_perms(uu_avl_t *who_avl) { const char *sc_title[] = { gettext("Permission sets:\n"), gettext("Create time permissions:\n"), NULL }; const char **title_ptr = sc_title; who_perm_node_t *who_node = NULL; int prev_weight = -1; for (who_node = uu_avl_first(who_avl); who_node != NULL; who_node = uu_avl_next(who_avl, who_node)) { uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; const char *who_name = who_node->who_perm.who_name; int weight = who_type2weight(who_type); boolean_t first = B_TRUE; deleg_perm_node_t *deleg_node; if (prev_weight != weight) { (void) printf(*title_ptr++); prev_weight = weight; } if (who_name == NULL || strnlen(who_name, 1) == 0) (void) printf("\t"); else (void) printf("\t%s ", who_name); for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node = uu_avl_next(avl, deleg_node)) { if (first) { (void) printf("%s", deleg_node->dpn_perm.dp_name); first = B_FALSE; } else (void) printf(",%s", deleg_node->dpn_perm.dp_name); } (void) printf("\n"); } } static void print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, const char *title) { who_perm_node_t *who_node = NULL; boolean_t prt_title = B_TRUE; uu_avl_walk_t *walk; if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((who_node = uu_avl_walk_next(walk)) != NULL) { const char *who_name = who_node->who_perm.who_name; const char *nice_who_name = who_node->who_perm.who_ug_name; uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; char delim = ' '; deleg_perm_node_t *deleg_node; boolean_t prt_who = B_TRUE; for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node = uu_avl_next(avl, deleg_node)) { if (local != deleg_node->dpn_perm.dp_local || descend != deleg_node->dpn_perm.dp_descend) continue; if (prt_who) { const char *who = NULL; if (prt_title) { prt_title = B_FALSE; (void) printf(title); } switch (who_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: who = gettext("user"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: who = gettext("group"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: who = gettext("everyone"); who_name = NULL; break; default: assert(who != NULL); } prt_who = B_FALSE; if (who_name == NULL) (void) printf("\t%s", who); else (void) printf("\t%s %s", who, who_name); } (void) printf("%c%s", delim, deleg_node->dpn_perm.dp_name); delim = ','; } if (!prt_who) (void) printf("\n"); } uu_avl_walk_end(walk); } static void print_fs_perms(fs_perm_set_t *fspset) { fs_perm_node_t *node = 
NULL; char buf[MAXNAMELEN + 32]; const char *dsname = buf; for (node = uu_list_first(fspset->fsps_list); node != NULL; node = uu_list_next(fspset->fsps_list, node)) { uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; int left = 0; (void) snprintf(buf, sizeof (buf), gettext("---- Permissions on %s "), node->fspn_fsperm.fsp_name); (void) printf(dsname); left = 70 - strlen(buf); while (left-- > 0) (void) printf("-"); (void) printf("\n"); print_set_creat_perms(sc_avl); print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, gettext("Local permissions:\n")); print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, gettext("Descendent permissions:\n")); print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, gettext("Local+Descendent permissions:\n")); } } static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; struct deleg_perms { boolean_t un; nvlist_t *nvl; }; static int set_deleg_perms(zfs_handle_t *zhp, void *data) { struct deleg_perms *perms = (struct deleg_perms *)data; zfs_type_t zfs_type = zfs_get_type(zhp); if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) return (0); return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); } static int zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) { zfs_handle_t *zhp; nvlist_t *perm_nvl = NULL; nvlist_t *update_perm_nvl = NULL; int error = 1; int c; struct allow_opts opts = { 0 }; const char *optstr = un ? "ldugecsrh" : "ldugecsh"; /* check opts */ while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'l': opts.local = B_TRUE; break; case 'd': opts.descend = B_TRUE; break; case 'u': opts.user = B_TRUE; break; case 'g': opts.group = B_TRUE; break; case 'e': opts.everyone = B_TRUE; break; case 's': opts.set = B_TRUE; break; case 'c': opts.create = B_TRUE; break; case 'r': opts.recursive = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'h': opts.prt_usage = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ parse_allow_args(argc, argv, un, &opts); /* try to open the dataset */ if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { (void) fprintf(stderr, "Failed to open dataset: %s\n", opts.dataset); return (-1); } if (zfs_get_fsacl(zhp, &perm_nvl) != 0) goto cleanup2; fs_perm_set_init(&fs_perm_set); if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); goto cleanup1; } if (opts.prt_perms) print_fs_perms(&fs_perm_set); else { (void) construct_fsacl_list(un, &opts, &update_perm_nvl); if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) goto cleanup0; if (un && opts.recursive) { struct deleg_perms data = { un, update_perm_nvl }; if (zfs_iter_filesystems(zhp, set_deleg_perms, &data) != 0) goto cleanup0; } } error = 0; cleanup0: nvlist_free(perm_nvl); nvlist_free(update_perm_nvl); cleanup1: fs_perm_set_fini(&fs_perm_set); cleanup2: zfs_close(zhp); return (error); } static int zfs_do_allow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); } static int zfs_do_unallow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); } static int zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) { int errors = 0; int i; const char *tag; boolean_t recursive = B_FALSE; const char *opts = holding ? 
"rt" : "r"; int c; /* check options */ while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 2) usage(B_FALSE); tag = argv[0]; --argc; ++argv; if (holding && tag[0] == '.') { /* tags starting with '.' are reserved for libzfs */ (void) fprintf(stderr, gettext("tag may not start with '.'\n")); usage(B_FALSE); } for (i = 0; i < argc; ++i) { zfs_handle_t *zhp; char parent[ZFS_MAX_DATASET_NAME_LEN]; const char *delim; char *path = argv[i]; delim = strchr(path, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), path); ++errors; continue; } (void) strncpy(parent, path, delim - path); parent[delim - path] = '\0'; zhp = zfs_open(g_zfs, parent, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { ++errors; continue; } if (holding) { if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) ++errors; } else { if (zfs_release(zhp, delim+1, tag, recursive) != 0) ++errors; } zfs_close(zhp); } return (errors != 0); } /* * zfs hold [-r] [-t] ... * * -r Recursively hold * * Apply a user-hold with the given tag to the list of snapshots. */ static int zfs_do_hold(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); } /* * zfs release [-r] ... * * -r Recursively release * * Release a user-hold with the given tag from the list of snapshots. */ static int zfs_do_release(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); } typedef struct holds_cbdata { boolean_t cb_recursive; const char *cb_snapname; nvlist_t **cb_nvlp; size_t cb_max_namelen; size_t cb_max_taglen; } holds_cbdata_t; #define STRFTIME_FMT_STR "%a %b %e %k:%M %Y" #define DATETIME_BUF_LEN (32) /* * */ static void print_holds(boolean_t scripted, boolean_t literal, size_t nwidth, size_t tagwidth, nvlist_t *nvl) { int i; nvpair_t *nvp = NULL; char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; const char *col; if (!scripted) { for (i = 0; i < 3; i++) { col = gettext(hdr_cols[i]); if (i < 2) (void) printf("%-*s ", i ? tagwidth : nwidth, col); else (void) printf("%s\n", col); } } while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { char *zname = nvpair_name(nvp); nvlist_t *nvl2; nvpair_t *nvp2 = NULL; (void) nvpair_value_nvlist(nvp, &nvl2); while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { char tsbuf[DATETIME_BUF_LEN]; char *tagname = nvpair_name(nvp2); uint64_t val = 0; time_t time; struct tm t; (void) nvpair_value_uint64(nvp2, &val); if (literal) snprintf(tsbuf, DATETIME_BUF_LEN, "%llu", val); else { time = (time_t)val; (void) localtime_r(&time, &t); (void) strftime(tsbuf, DATETIME_BUF_LEN, gettext(STRFTIME_FMT_STR), &t); } if (scripted) { (void) printf("%s\t%s\t%s\n", zname, tagname, tsbuf); } else { (void) printf("%-*s %-*s %s\n", nwidth, zname, tagwidth, tagname, tsbuf); } } } } /* * Generic callback function to list a dataset or snapshot. 
*/ static int holds_callback(zfs_handle_t *zhp, void *data) { holds_cbdata_t *cbp = data; nvlist_t *top_nvl = *cbp->cb_nvlp; nvlist_t *nvl = NULL; nvpair_t *nvp = NULL; const char *zname = zfs_get_name(zhp); size_t znamelen = strlen(zname); if (cbp->cb_recursive && cbp->cb_snapname != NULL) { const char *snapname; char *delim = strchr(zname, '@'); if (delim == NULL) return (0); snapname = delim + 1; if (strcmp(cbp->cb_snapname, snapname)) return (0); } if (zfs_get_holds(zhp, &nvl) != 0) return (-1); if (znamelen > cbp->cb_max_namelen) cbp->cb_max_namelen = znamelen; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *tag = nvpair_name(nvp); size_t taglen = strlen(tag); if (taglen > cbp->cb_max_taglen) cbp->cb_max_taglen = taglen; } return (nvlist_add_nvlist(top_nvl, zname, nvl)); } /* * zfs holds [-Hp] [-r | -d max] <snap> ... * * -H Suppress header output * -p Output literal values * -r Recursively search for holds * -d max Limit depth of recursive search */ static int zfs_do_holds(int argc, char **argv) { int errors = 0; int c; int i; boolean_t scripted = B_FALSE; boolean_t literal = B_FALSE; boolean_t recursive = B_FALSE; const char *opts = "d:rHp"; nvlist_t *nvl; int types = ZFS_TYPE_SNAPSHOT; holds_cbdata_t cb = { 0 }; int limit = 0; int ret = 0; int flags = 0; /* check options */ while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { case 'd': limit = parse_depth(optarg, &flags); recursive = B_TRUE; break; case 'r': recursive = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'p': literal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (recursive) { types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; flags |= ZFS_ITER_RECURSE; } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) usage(B_FALSE); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); for (i = 0; i < argc; ++i) { char *snapshot = argv[i]; const char *delim; const char *snapname = NULL; delim = strchr(snapshot, '@'); if (delim != NULL) { snapname = delim + 1; if (recursive) snapshot[delim - snapshot] = '\0'; } cb.cb_recursive = recursive; cb.cb_snapname = snapname; cb.cb_nvlp = &nvl; /* * 1. collect holds data, set format options */ ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit, holds_callback, &cb); if (ret != 0) ++errors; } /* * 2. print holds data */ print_holds(scripted, literal, cb.cb_max_namelen, cb.cb_max_taglen, nvl); if (nvlist_empty(nvl)) (void) printf(gettext("no datasets available\n")); nvlist_free(nvl); return (0 != errors); } #define CHECK_SPINNER 30 #define SPINNER_TIME 3 /* seconds */ #define MOUNT_TIME 1 /* seconds */ typedef struct get_all_state { boolean_t ga_verbose; get_all_cb_t *ga_cbp; } get_all_state_t; static int get_one_dataset(zfs_handle_t *zhp, void *data) { static char *spin[] = { "-", "\\", "|", "/" }; static int spinval = 0; static int spincheck = 0; static time_t last_spin_time = (time_t)0; get_all_state_t *state = data; zfs_type_t type = zfs_get_type(zhp); if (state->ga_verbose) { if (--spincheck < 0) { time_t now = time(NULL); if (last_spin_time + SPINNER_TIME < now) { update_progress(spin[spinval++ % 4]); last_spin_time = now; } spincheck = CHECK_SPINNER; } } /* * Iterate over any nested datasets. */ if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { zfs_close(zhp); return (1); } /* * Skip any datasets whose type does not match.
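* Only ZFS_TYPE_FILESYSTEM handles are added to the callback list; other dataset types are closed and skipped.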
*/ if ((type & ZFS_TYPE_FILESYSTEM) == 0) { zfs_close(zhp); return (0); } libzfs_add_handle(state->ga_cbp, zhp); assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc); return (0); } static void get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) { get_all_state_t state = { .ga_verbose = verbose, .ga_cbp = cbp }; if (verbose) set_progress_header(gettext("Reading ZFS config")); (void) zfs_iter_root(g_zfs, get_one_dataset, &state); if (verbose) finish_progress(gettext("done.")); } /* * Generic callback for sharing or mounting filesystems. Because the code is so * similar, we have a common function with an extra parameter to determine which * mode we are using. */ typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; typedef struct share_mount_state { share_mount_op_t sm_op; boolean_t sm_verbose; int sm_flags; char *sm_options; char *sm_proto; /* only valid for OP_SHARE */ pthread_mutex_t sm_lock; /* protects the remaining fields */ uint_t sm_total; /* number of filesystems to process */ uint_t sm_done; /* number of filesystems processed */ int sm_status; /* -1 if any of the share/mount operations failed */ } share_mount_state_t; /* * Share or mount a dataset. */ static int share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, boolean_t explicit, const char *options) { char mountpoint[ZFS_MAXPROPLEN]; char shareopts[ZFS_MAXPROPLEN]; char smbshareopts[ZFS_MAXPROPLEN]; const char *cmdname = op == OP_SHARE ? "share" : "mount"; struct mnttab mnt; uint64_t zoned, canmount; boolean_t shared_nfs, shared_smb; assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); /* * Check to make sure we can mount/share this dataset. If we * are in the global zone and the filesystem is exported to a * local zone, or if we are in a local zone and the * filesystem is not exported, then it is an error. */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned && getzoneid() == GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "dataset is exported to a local zone\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "permission denied\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * Ignore any filesystems which don't apply to us. This * includes those with a legacy mountpoint, or those with * legacy share options. */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && strcmp(smbshareopts, "off") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share '%s': " "legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("to " "share this filesystem set " "sharenfs property on\n")); return (1); } /* * We cannot share or mount legacy filesystems. If the * shareopts is non-legacy but the mountpoint is legacy, we * treat it as a legacy share. 
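* For example, a dataset with sharenfs=on but mountpoint=legacy is still treated as a legacy share and refused here.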
*/ if (strcmp(mountpoint, "legacy") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use %s(8) to " "%s this filesystem\n"), cmdname, cmdname); return (1); } if (strcmp(mountpoint, "none") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': no " "mountpoint set\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * canmount explicit outcome * on no pass through * on yes pass through * off no return 0 * off yes display error, return 1 * noauto no return 0 * noauto yes pass through */ canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); if (canmount == ZFS_CANMOUNT_OFF) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "'canmount' property is set to 'off'\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { return (0); } /* * If this filesystem is inconsistent and has a receive resume * token, we can not mount it. */ if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Contains partially-completed state from " "\"zfs receive -r\", which can be resumed with " "\"zfs send -t\"\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * At this point, we have verified that the mountpoint and/or * shareopts are appropriate for auto management. If the * filesystem is already mounted or shared, return (failing * for explicit requests); otherwise mount or share the * filesystem. */ switch (op) { case OP_SHARE: shared_nfs = zfs_is_shared_nfs(zhp, NULL); shared_smb = zfs_is_shared_smb(zhp, NULL); if ((shared_nfs && shared_smb) || (shared_nfs && strcmp(shareopts, "on") == 0 && strcmp(smbshareopts, "off") == 0) || (shared_smb && strcmp(smbshareopts, "on") == 0 && strcmp(shareopts, "off") == 0)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share " "'%s': filesystem already shared\n"), zfs_get_name(zhp)); return (1); } if (!zfs_is_mounted(zhp, NULL) && zfs_mount(zhp, NULL, 0) != 0) return (1); if (protocol == NULL) { if (zfs_shareall(zhp) != 0) return (1); } else if (strcmp(protocol, "nfs") == 0) { if (zfs_share_nfs(zhp)) return (1); } else if (strcmp(protocol, "smb") == 0) { if (zfs_share_smb(zhp)) return (1); } else { (void) fprintf(stderr, gettext("cannot share " "'%s': invalid share type '%s' " "specified\n"), zfs_get_name(zhp), protocol); return (1); } break; case OP_MOUNT: if (options == NULL) mnt.mnt_mntopts = ""; else mnt.mnt_mntopts = (char *)options; if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && zfs_is_mounted(zhp, NULL)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot mount " "'%s': filesystem already mounted\n"), zfs_get_name(zhp)); return (1); } if (zfs_mount(zhp, options, flags) != 0) return (1); break; } return (0); } /* * Reports progress in the form "(current/total)". Not thread-safe. 
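* For example, after the third of ten filesystems has been processed the meter reads "(3/10)".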
*/ static void report_mount_progress(int current, int total) { static time_t last_progress_time = 0; time_t now = time(NULL); char info[32]; /* display header if we're here for the first time */ if (current == 1) { set_progress_header(gettext("Mounting ZFS filesystems")); } else if (current != total && last_progress_time + MOUNT_TIME >= now) { /* too soon to report again */ return; } last_progress_time = now; (void) sprintf(info, "(%d/%d)", current, total); if (current == total) finish_progress(info); else update_progress(info); } /* * zfs_foreach_mountpoint() callback that mounts or shares on filesystem and * updates the progress meter */ static int share_mount_one_cb(zfs_handle_t *zhp, void *arg) { share_mount_state_t *sms = arg; int ret; ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, B_FALSE, sms->sm_options); pthread_mutex_lock(&sms->sm_lock); if (ret != 0) sms->sm_status = ret; sms->sm_done++; if (sms->sm_verbose) report_mount_progress(sms->sm_done, sms->sm_total); pthread_mutex_unlock(&sms->sm_lock); return (ret); } static void append_options(char *mntopts, char *newopts) { int len = strlen(mntopts); /* original length plus new string to append plus 1 for the comma */ if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) { (void) fprintf(stderr, gettext("the opts argument for " "'%c' option is too long (more than %d chars)\n"), "-o", MNT_LINE_MAX); usage(B_FALSE); } if (*mntopts) mntopts[len++] = ','; (void) strcpy(&mntopts[len], newopts); } static int share_mount(int op, int argc, char **argv) { int do_all = 0; boolean_t verbose = B_FALSE; int c, ret = 0; char *options = NULL; int flags = 0; /* check options */ while ((c = getopt(argc, argv, op == OP_MOUNT ? ":avo:O" : "a")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'v': verbose = B_TRUE; break; case 'o': if (*optarg == '\0') { (void) fprintf(stderr, gettext("empty mount " "options (-o) specified\n")); usage(B_FALSE); } if (options == NULL) options = safe_malloc(MNT_LINE_MAX + 1); /* option validation is done later */ append_options(options, optarg); break; case 'O': warnx("no overlay mounts support on FreeBSD, ignoring"); break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (do_all) { char *protocol = NULL; if (op == OP_SHARE && argc > 0) { if (strcmp(argv[0], "nfs") != 0 && strcmp(argv[0], "smb") != 0) { (void) fprintf(stderr, gettext("share type " "must be 'nfs' or 'smb'\n")); usage(B_FALSE); } protocol = argv[0]; argc--; argv++; } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } start_progress_timer(); get_all_cb_t cb = { 0 }; get_all_datasets(&cb, verbose); if (cb.cb_used == 0) { if (options != NULL) free(options); return (0); } #ifdef illumos if (op == OP_SHARE) { sa_init_selective_arg_t sharearg; sharearg.zhandle_arr = cb.cb_handles; sharearg.zhandle_len = cb.cb_used; if ((ret = zfs_init_libshare_arg(g_zfs, SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) { (void) fprintf(stderr, gettext( "Could not initialize libshare, %d"), ret); return (ret); } } #endif share_mount_state_t share_mount_state = { 0 }; share_mount_state.sm_op = op; share_mount_state.sm_verbose = verbose; share_mount_state.sm_flags = flags; share_mount_state.sm_options = options; share_mount_state.sm_proto = protocol; share_mount_state.sm_total = cb.cb_used; 
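/* sm_lock protects sm_done and sm_status, which the per-filesystem callback updates while mounts may run in parallel. */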
pthread_mutex_init(&share_mount_state.sm_lock, NULL); /* * libshare isn't mt-safe, so only do the operation in parallel * if we're mounting. */ zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, share_mount_one_cb, &share_mount_state, op == OP_MOUNT); ret = share_mount_state.sm_status; for (int i = 0; i < cb.cb_used; i++) zfs_close(cb.cb_handles[i]); free(cb.cb_handles); } else if (argc == 0) { struct mnttab entry; if ((op == OP_SHARE) || (options != NULL)) { (void) fprintf(stderr, gettext("missing filesystem " "argument (specify -a for all)\n")); usage(B_FALSE); } /* * When mount is given no arguments, go through /etc/mnttab and * display any active ZFS mounts. We hide any snapshots, since * they are controlled automatically. */ rewind(mnttab_file); while (getmntent(mnttab_file, &entry) == 0) { if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || strchr(entry.mnt_special, '@') != NULL) continue; (void) printf("%-30s %s\n", entry.mnt_special, entry.mnt_mountp); } } else { zfs_handle_t *zhp; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; } else { ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, options); zfs_close(zhp); } } return (ret); } /* * zfs mount -a [nfs] * zfs mount filesystem * * Mount all filesystems, or mount the given filesystem. */ static int zfs_do_mount(int argc, char **argv) { return (share_mount(OP_MOUNT, argc, argv)); } /* * zfs share -a [nfs | smb] * zfs share filesystem * * Share all filesystems, or share the given filesystem. */ static int zfs_do_share(int argc, char **argv) { return (share_mount(OP_SHARE, argc, argv)); } typedef struct unshare_unmount_node { zfs_handle_t *un_zhp; char *un_mountp; uu_avl_node_t un_avlnode; } unshare_unmount_node_t; /* ARGSUSED */ static int unshare_unmount_compare(const void *larg, const void *rarg, void *unused) { const unshare_unmount_node_t *l = larg; const unshare_unmount_node_t *r = rarg; return (strcmp(l->un_mountp, r->un_mountp)); } /* * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an * absolute path, find the entry /etc/mnttab, verify that its a ZFS filesystem, * and unmount it appropriately. */ static int unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) { zfs_handle_t *zhp; int ret = 0; struct stat64 statbuf; struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; ino_t path_inode; /* * Search for the path in /etc/mnttab. Rather than looking for the * specific path, which can be fooled by non-standard paths (i.e. ".." * or "//"), we stat() the path and search for the corresponding * (major,minor) device pair. */ if (stat64(path, &statbuf) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), cmdname, path, strerror(errno)); return (1); } path_inode = statbuf.st_ino; /* * Search for the given (major,minor) pair in the mount table. 
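* On illumos this walks mnttab_file with getextmntent(); on FreeBSD the entry is obtained from statfs(2) on the path and converted with statfs2mnttab().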
*/ #ifdef illumos rewind(mnttab_file); while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) { if (entry.mnt_major == major(statbuf.st_dev) && entry.mnt_minor == minor(statbuf.st_dev)) break; } #else { struct statfs sfs; if (statfs(path, &sfs) != 0) { (void) fprintf(stderr, "%s: %s\n", path, strerror(errno)); ret = -1; } statfs2mnttab(&sfs, &entry); } #endif if (ret != 0) { if (op == OP_SHARE) { (void) fprintf(stderr, gettext("cannot %s '%s': not " "currently mounted\n"), cmdname, path); return (1); } (void) fprintf(stderr, gettext("warning: %s not in mnttab\n"), path); if ((ret = umount2(path, flags)) != 0) (void) fprintf(stderr, gettext("%s: %s\n"), path, strerror(errno)); return (ret != 0); } if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " "filesystem\n"), cmdname, path); return (1); } if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) return (1); ret = 1; if (stat64(entry.mnt_mountp, &statbuf) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), cmdname, path, strerror(errno)); goto out; } else if (statbuf.st_ino != path_inode) { (void) fprintf(stderr, gettext("cannot " "%s '%s': not a mountpoint\n"), cmdname, path); goto out; } if (op == OP_SHARE) { char nfs_mnt_prop[ZFS_MAXPROPLEN]; char smbshare_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop, sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(smbshare_prop, "off") == 0) { (void) fprintf(stderr, gettext("cannot unshare " "'%s': legacy share\n"), path); #ifdef illumos (void) fprintf(stderr, gettext("use " "unshare(1M) to unshare this filesystem\n")); #endif } else if (!zfs_is_shared(zhp)) { (void) fprintf(stderr, gettext("cannot unshare '%s': " "not currently shared\n"), path); } else { ret = zfs_unshareall_bypath(zhp, path); } } else { char mtpt_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop, sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0); if (is_manual) { ret = zfs_unmount(zhp, NULL, flags); } else if (strcmp(mtpt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot unmount " "'%s': legacy mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use umount(8) " "to unmount this filesystem\n")); } else { ret = zfs_unmountall(zhp, flags); } } out: zfs_close(zhp); return (ret != 0); } /* * Generic callback for unsharing or unmounting a filesystem. */ static int unshare_unmount(int op, int argc, char **argv) { int do_all = 0; int flags = 0; int ret = 0; int c; zfs_handle_t *zhp; char nfs_mnt_prop[ZFS_MAXPROPLEN]; char sharesmb[ZFS_MAXPROPLEN]; /* check options */ while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'f': flags = MS_FORCE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (do_all) { /* * We could make use of zfs_for_each() to walk all datasets in * the system, but this would be very inefficient, especially * since we would have to linearly search /etc/mnttab for each * one. Instead, do one pass through /etc/mnttab looking for * zfs entries and call zfs_unmount() for each one. * * Things get a little tricky if the administrator has created * mountpoints beneath other ZFS filesystems. 
In this case, we * have to unmount the deepest filesystems first. To accomplish * this, we place all the mountpoints in an AVL tree sorted by * the special type (dataset name), and walk the result in * reverse to make sure to get any snapshots first. */ struct mnttab entry; uu_avl_pool_t *pool; uu_avl_t *tree = NULL; unshare_unmount_node_t *node; uu_avl_index_t idx; uu_avl_walk_t *walk; if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (((pool = uu_avl_pool_create("unmount_pool", sizeof (unshare_unmount_node_t), offsetof(unshare_unmount_node_t, un_avlnode), unshare_unmount_compare, UU_DEFAULT)) == NULL) || ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) nomem(); rewind(mnttab_file); while (getmntent(mnttab_file, &entry) == 0) { /* ignore non-ZFS entries */ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; /* ignore snapshots */ if (strchr(entry.mnt_special, '@') != NULL) continue; if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; continue; } /* * Ignore datasets that are excluded/restricted by * parent pool name. */ if (zpool_skip_pool(zfs_get_pool_name(zhp))) { zfs_close(zhp); continue; } switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") != 0) break; verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0) continue; break; case OP_MOUNT: /* Ignore legacy mounts */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "legacy") == 0) continue; /* Ignore canmount=noauto mounts */ if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) continue; default: break; } node = safe_malloc(sizeof (unshare_unmount_node_t)); node->un_zhp = zhp; node->un_mountp = safe_strdup(entry.mnt_mountp); uu_avl_node_init(node, &node->un_avlnode, pool); if (uu_avl_find(tree, node, NULL, &idx) == NULL) { uu_avl_insert(tree, node, idx); } else { zfs_close(node->un_zhp); free(node->un_mountp); free(node); } } /* * Walk the AVL tree in reverse, unmounting each filesystem and * removing it from the AVL tree in the process. */ if ((walk = uu_avl_walk_start(tree, UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(tree, node); switch (op) { case OP_SHARE: if (zfs_unshareall_bypath(node->un_zhp, node->un_mountp) != 0) ret = 1; break; case OP_MOUNT: if (zfs_unmount(node->un_zhp, node->un_mountp, flags) != 0) ret = 1; break; } zfs_close(node->un_zhp); free(node->un_mountp); free(node); } uu_avl_walk_end(walk); uu_avl_destroy(tree); uu_avl_pool_destroy(pool); } else { if (argc != 1) { if (argc == 0) (void) fprintf(stderr, gettext("missing filesystem argument\n")); else (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * We have an argument, but it may be a full path or a ZFS * filesystem. Pass full paths off to unmount_path() (shared by * manual_unmount), otherwise open the filesystem and pass to * zfs_unmount(). */ if (argv[0][0] == '/') return (unshare_unmount_path(op, argv[0], flags, B_FALSE)); if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) return (1); verify(zfs_prop_get(zhp, op == OP_SHARE ? 
ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, sharesmb, sizeof (sharesmb), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(sharesmb, "off") == 0) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': legacy share\n"), zfs_get_name(zhp)); #ifdef illumos (void) fprintf(stderr, gettext("use " "unshare(1M) to unshare this " "filesystem\n")); #endif ret = 1; } else if (!zfs_is_shared(zhp)) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': not currently " "shared\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unshareall(zhp) != 0) { ret = 1; } break; case OP_MOUNT: if (strcmp(nfs_mnt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': legacy " "mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "umount(8) to unmount this " "filesystem\n")); ret = 1; } else if (!zfs_is_mounted(zhp, NULL)) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': not currently " "mounted\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unmountall(zhp, flags) != 0) { ret = 1; } break; } zfs_close(zhp); } return (ret); } /* * zfs unmount -a * zfs unmount filesystem * * Unmount all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unmount(int argc, char **argv) { return (unshare_unmount(OP_MOUNT, argc, argv)); } /* * zfs unshare -a * zfs unshare filesystem * * Unshare all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unshare(int argc, char **argv) { return (unshare_unmount(OP_SHARE, argc, argv)); } /* * Attach/detach the given dataset to/from the given jail */ /* ARGSUSED */ static int do_jail(int argc, char **argv, int attach) { zfs_handle_t *zhp; int jailid, ret; /* check number of arguments */ if (argc < 3) { (void) fprintf(stderr, gettext("missing argument(s)\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } jailid = jail_getid(argv[1]); if (jailid < 0) { (void) fprintf(stderr, gettext("invalid jail id or name\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); ret = (zfs_jail(zhp, jailid, attach) != 0); zfs_close(zhp); return (ret); } /* * zfs jail jailid filesystem * * Attach the given dataset to the given jail */ /* ARGSUSED */ static int zfs_do_jail(int argc, char **argv) { return (do_jail(argc, argv, 1)); } /* * zfs unjail jailid filesystem * * Detach the given dataset from the given jail */ /* ARGSUSED */ static int zfs_do_unjail(int argc, char **argv) { return (do_jail(argc, argv, 0)); } /* * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is * 'legacy'. Otherwise, complain that use should be using 'zfs mount'. 
*/ static int manual_mount(int argc, char **argv) { zfs_handle_t *zhp; char mountpoint[ZFS_MAXPROPLEN]; char mntopts[MNT_LINE_MAX] = { '\0' }; int ret = 0; int c; int flags = 0; char *dataset, *path; /* check options */ while ((c = getopt(argc, argv, ":mo:O")) != -1) { switch (c) { case 'o': (void) strlcpy(mntopts, optarg, sizeof (mntopts)); break; case 'O': flags |= MS_OVERLAY; break; case 'm': flags |= MS_NOMNTTAB; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); (void) fprintf(stderr, gettext("usage: mount [-o opts] " "<path>\n")); return (2); } } argc -= optind; argv += optind; /* check that we only have two arguments */ if (argc != 2) { if (argc == 0) (void) fprintf(stderr, gettext("missing dataset " "argument\n")); else if (argc == 1) (void) fprintf(stderr, gettext("missing mountpoint argument\n")); else (void) fprintf(stderr, gettext("too many arguments\n")); (void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n"); return (2); } dataset = argv[0]; path = argv[1]; /* try to open the dataset */ if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_FILESYSTEM)) == NULL) return (1); (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE); /* check for legacy mountpoint and complain appropriately */ ret = 0; if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) { if (zmount(dataset, path, flags, MNTTYPE_ZFS, NULL, 0, mntopts, sizeof (mntopts)) != 0) { (void) fprintf(stderr, gettext("mount failed: %s\n"), strerror(errno)); ret = 1; } } else { (void) fprintf(stderr, gettext("filesystem '%s' cannot be " "mounted using 'mount -t zfs'\n"), dataset); (void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' " "instead.\n"), path); (void) fprintf(stderr, gettext("If you must use 'mount -t zfs' " "or /etc/fstab, use 'zfs set mountpoint=legacy'.\n")); (void) fprintf(stderr, gettext("See zfs(8) for more " "information.\n")); ret = 1; } return (ret); } /* * Called when invoked as /etc/fs/zfs/umount. Unlike a manual mount, we allow * unmounts of non-legacy filesystems, as this is the dominant administrative * interface.
*/ static int manual_unmount(int argc, char **argv) { int flags = 0; int c; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': flags = MS_FORCE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); (void) fprintf(stderr, gettext("usage: unmount [-f] " "<path>\n")); return (2); } } argc -= optind; argv += optind; /* check arguments */ if (argc != 1) { if (argc == 0) (void) fprintf(stderr, gettext("missing path " "argument\n")); else (void) fprintf(stderr, gettext("too many arguments\n")); (void) fprintf(stderr, gettext("usage: unmount [-f] <path>\n")); return (2); } return (unshare_unmount_path(OP_MOUNT, argv[0], flags, B_TRUE)); } static int find_command_idx(char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } static int zfs_do_diff(int argc, char **argv) { zfs_handle_t *zhp; int flags = 0; char *tosnap = NULL; char *fromsnap = NULL; char *atp, *copy; int err = 0; int c; while ((c = getopt(argc, argv, "FHt")) != -1) { switch (c) { case 'F': flags |= ZFS_DIFF_CLASSIFY; break; case 'H': flags |= ZFS_DIFF_PARSEABLE; break; case 't': flags |= ZFS_DIFF_TIMESTAMP; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("must provide at least one snapshot name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } fromsnap = argv[0]; tosnap = (argc == 2) ? argv[1] : NULL; copy = NULL; if (*fromsnap != '@') copy = strdup(fromsnap); else if (tosnap) copy = strdup(tosnap); if (copy == NULL) usage(B_FALSE); if ((atp = strchr(copy, '@')) != NULL) *atp = '\0'; if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) return (1); free(copy); /* * Ignore SIGPIPE so that the library can give us * information on any failure */ (void) sigignore(SIGPIPE); err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags); zfs_close(zhp); return (err != 0); } /* * zfs remap <filesystem | volume> * * Remap the indirect blocks in the given filesystem or volume. */ static int zfs_do_remap(int argc, char **argv) { const char *fsname; int err = 0; int c; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (argc != 2) { (void) fprintf(stderr, gettext("wrong number of arguments\n")); usage(B_FALSE); } fsname = argv[1]; err = zfs_remap_indirects(g_zfs, fsname); return (err); } /* * zfs bookmark <fs@snap> <fs#bmark> * * Creates a bookmark with the given name from the given snapshot. */ static int zfs_do_bookmark(int argc, char **argv) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; zfs_handle_t *zhp; nvlist_t *nvl; int ret = 0; int c; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing bookmark argument\n")); goto usage; } if (strchr(argv[1], '#') == NULL) { (void) fprintf(stderr, gettext("invalid bookmark name '%s' -- " "must contain a '#'\n"), argv[1]); goto usage; } if (argv[0][0] == '@') { /* * Snapshot name begins with @.
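* e.g. "zfs bookmark @snap1 pool/fs#mark" names the snapshot pool/fs@snap1.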
* Default to same fs as bookmark. */ (void) strncpy(snapname, argv[1], sizeof (snapname)); *strchr(snapname, '#') = '\0'; (void) strlcat(snapname, argv[0], sizeof (snapname)); } else { (void) strncpy(snapname, argv[0], sizeof (snapname)); } zhp = zfs_open(g_zfs, snapname, ZFS_TYPE_SNAPSHOT); if (zhp == NULL) goto usage; zfs_close(zhp); nvl = fnvlist_alloc(); fnvlist_add_string(nvl, argv[1], snapname); ret = lzc_bookmark(nvl, NULL); fnvlist_free(nvl); if (ret != 0) { const char *err_msg; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create bookmark '%s'"), argv[1]); switch (ret) { case EXDEV: err_msg = "bookmark is in a different pool"; break; case EEXIST: err_msg = "bookmark exists"; break; case EINVAL: err_msg = "invalid argument"; break; case ENOTSUP: err_msg = "bookmark feature not enabled"; break; case ENOSPC: err_msg = "out of space"; break; default: err_msg = "unknown error"; break; } (void) fprintf(stderr, "%s: %s\n", errbuf, dgettext(TEXT_DOMAIN, err_msg)); } return (ret != 0); usage: usage(B_FALSE); return (-1); } static int zfs_do_channel_program(int argc, char **argv) { int ret, fd; char c; char *progbuf, *filename, *poolname; size_t progsize, progread; nvlist_t *outnvl; uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT; uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT; boolean_t sync_flag = B_TRUE, json_output = B_FALSE; zpool_handle_t *zhp; /* check options */ while (-1 != (c = getopt(argc, argv, "jnt:(instr-limit)m:(memory-limit)"))) { switch (c) { case 't': case 'm': { uint64_t arg; char *endp; errno = 0; arg = strtoull(optarg, &endp, 0); if (errno != 0 || *endp != '\0') { (void) fprintf(stderr, gettext( "invalid argument " "'%s': expected integer\n"), optarg); goto usage; } if (c == 't') { if (arg > ZCP_MAX_INSTRLIMIT || arg == 0) { (void) fprintf(stderr, gettext( "Invalid instruction limit: " "%s\n"), optarg); return (1); } else { instrlimit = arg; } } else { ASSERT3U(c, ==, 'm'); if (arg > ZCP_MAX_MEMLIMIT || arg == 0) { (void) fprintf(stderr, gettext( "Invalid memory limit: " "%s\n"), optarg); return (1); } else { memlimit = arg; } } break; } case 'n': { sync_flag = B_FALSE; break; } case 'j': { json_output = B_TRUE; break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; if (argc < 2) { (void) fprintf(stderr, gettext("invalid number of arguments\n")); goto usage; } poolname = argv[0]; filename = argv[1]; if (strcmp(filename, "-") == 0) { fd = 0; filename = "standard input"; } else if ((fd = open(filename, O_RDONLY)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), filename, strerror(errno)); return (1); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { (void) fprintf(stderr, gettext("cannot open pool '%s'"), poolname); return (1); } zpool_close(zhp); /* * Read in the channel program, expanding the program buffer as * necessary. */ progread = 0; progsize = 1024; progbuf = safe_malloc(progsize); do { ret = read(fd, progbuf + progread, progsize - progread); progread += ret; if (progread == progsize && ret > 0) { progsize *= 2; progbuf = safe_realloc(progbuf, progsize); } } while (ret > 0); if (fd != 0) (void) close(fd); if (ret < 0) { free(progbuf); (void) fprintf(stderr, gettext("cannot read '%s': %s\n"), filename, strerror(errno)); return (1); } progbuf[progread] = '\0'; /* * Any remaining arguments are passed as arguments to the lua script as * a string array: * { * "argv" -> [ "arg 1", ... 
"arg n" ], * } */ nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2); if (sync_flag) { ret = lzc_channel_program(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } else { ret = lzc_channel_program_nosync(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } if (ret != 0) { /* * On error, report the error message handed back by lua if one * exists. Otherwise, generate an appropriate error message, * falling back on strerror() for an unexpected return code. */ char *errstring = NULL; if (nvlist_exists(outnvl, ZCP_RET_ERROR)) { (void) nvlist_lookup_string(outnvl, ZCP_RET_ERROR, &errstring); if (errstring == NULL) errstring = strerror(ret); } else { switch (ret) { case EINVAL: errstring = "Invalid instruction or memory limit."; break; case ENOMEM: errstring = "Return value too large."; break; case ENOSPC: errstring = "Memory limit exhausted."; break; #ifdef illumos case ETIME: #else case ETIMEDOUT: #endif errstring = "Timed out."; break; case EPERM: errstring = "Permission denied. Channel " "programs must be run as root."; break; default: errstring = strerror(ret); } } (void) fprintf(stderr, gettext("Channel program execution failed:\n%s\n"), errstring); } else { if (json_output) { (void) nvlist_print_json(stdout, outnvl); } else if (nvlist_empty(outnvl)) { (void) fprintf(stdout, gettext("Channel program fully " "executed and did not produce output.\n")); } else { (void) fprintf(stdout, gettext("Channel program fully " "executed and produced output:\n")); dump_nvlist(outnvl, 4); } } free(progbuf); fnvlist_free(outnvl); fnvlist_free(argnvl); return (ret != 0); usage: usage(B_FALSE); return (-1); } int main(int argc, char **argv) { int ret = 0; int i; char *progname; char *cmdname; (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); opterr = 0; if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, gettext("internal error: failed to " "initialize ZFS library\n")); return (1); } zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); libzfs_print_on_error(g_zfs, B_TRUE); if ((mnttab_file = fopen(MNTTAB, "r")) == NULL) { (void) fprintf(stderr, gettext("internal error: unable to " "open %s\n"), MNTTAB); return (1); } /* * This command also doubles as the /etc/fs mount and unmount program. * Determine if we should take this behavior based on argv[0]. */ progname = basename(argv[0]); if (strcmp(progname, "mount") == 0) { ret = manual_mount(argc, argv); } else if (strcmp(progname, "umount") == 0) { ret = manual_unmount(argc, argv); } else { /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * The 'umount' command is an alias for 'unmount' */ if (strcmp(cmdname, "umount") == 0) cmdname = "unmount"; /* * The 'recv' command is an alias for 'receive' */ if (strcmp(cmdname, "recv") == 0) cmdname = "receive"; /* * The 'snap' command is an alias for 'snapshot' */ if (strcmp(cmdname, "snap") == 0) cmdname = "snapshot"; /* * Special case '-?' */ if (strcmp(cmdname, "-?") == 0) usage(B_TRUE); /* * Run the appropriate command. 
*/ libzfs_mnttab_cache(g_zfs, B_TRUE); if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, argv + 1); } else if (strchr(cmdname, '=') != NULL) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, argv); } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); } libzfs_mnttab_cache(g_zfs, B_FALSE); } (void) fclose(mnttab_file); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } Index: projects/clang900-import/cddl/contrib/opensolaris/cmd/zfs =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris/cmd/zfs (revision 352536) +++ projects/clang900-import/cddl/contrib/opensolaris/cmd/zfs (revision 352537) Property changes on: projects/clang900-import/cddl/contrib/opensolaris/cmd/zfs ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl/contrib/opensolaris/cmd/zfs:r351317-352536 Index: projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h (revision 352536) +++ projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h (revision 352537) @@ -1,864 +1,864 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Martin Matuska . All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017 Datto Inc. 
*/ #ifndef _LIBZFS_H #define _LIBZFS_H #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Miscellaneous ZFS constants */ #define ZFS_MAXPROPLEN MAXPATHLEN #define ZPOOL_MAXPROPLEN MAXPATHLEN /* * libzfs errors */ typedef enum zfs_error { EZFS_SUCCESS = 0, /* no error -- success */ EZFS_NOMEM = 2000, /* out of memory */ EZFS_BADPROP, /* invalid property value */ EZFS_PROPREADONLY, /* cannot set readonly property */ EZFS_PROPTYPE, /* property does not apply to dataset type */ EZFS_PROPNONINHERIT, /* property is not inheritable */ EZFS_PROPSPACE, /* bad quota or reservation */ EZFS_BADTYPE, /* dataset is not of appropriate type */ EZFS_BUSY, /* pool or dataset is busy */ EZFS_EXISTS, /* pool or dataset already exists */ EZFS_NOENT, /* no such pool or dataset */ EZFS_BADSTREAM, /* bad backup stream */ EZFS_DSREADONLY, /* dataset is readonly */ EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ EZFS_INVALIDNAME, /* invalid dataset name */ EZFS_BADRESTORE, /* unable to restore to destination */ EZFS_BADBACKUP, /* backup failed */ EZFS_BADTARGET, /* bad attach/detach/replace target */ EZFS_NODEVICE, /* no such device in pool */ EZFS_BADDEV, /* invalid device to add */ EZFS_NOREPLICAS, /* no valid replicas */ EZFS_RESILVERING, /* currently resilvering */ EZFS_BADVERSION, /* unsupported version */ EZFS_POOLUNAVAIL, /* pool is currently unavailable */ EZFS_DEVOVERFLOW, /* too many devices in one vdev */ EZFS_BADPATH, /* must be an absolute path */ EZFS_CROSSTARGET, /* rename or clone across pool or dataset */ EZFS_ZONED, /* used improperly in local zone */ EZFS_MOUNTFAILED, /* failed to mount dataset */ EZFS_UMOUNTFAILED, /* failed to unmount dataset */ EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ EZFS_SHARENFSFAILED, /* share(1M) failed */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ EZFS_FAULT, /* bad address */ EZFS_IO, /* I/O error */ EZFS_INTR, /* signal received */ EZFS_ISSPARE, /* device is a hot spare */ EZFS_INVALCONFIG, /* invalid vdev configuration */ EZFS_RECURSIVE, /* recursive dependency */ EZFS_NOHISTORY, /* no history object */ EZFS_POOLPROPS, /* couldn't retrieve pool props */ EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ EZFS_POOL_INVALARG, /* invalid argument for this pool operation */ EZFS_NAMETOOLONG, /* dataset name is too long */ EZFS_OPENFAILED, /* open of device failed */ EZFS_NOCAP, /* couldn't get capacity */ EZFS_LABELFAILED, /* write of label failed */ EZFS_BADWHO, /* invalid permission who */ EZFS_BADPERM, /* invalid permission */ EZFS_BADPERMSET, /* invalid permission set name */ EZFS_NODELEGATION, /* delegated administration is disabled */ EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ EZFS_SHARESMBFAILED, /* failed to share over smb */ EZFS_BADCACHE, /* bad cache file */ EZFS_ISL2CACHE, /* device is for the level 2 ARC */ EZFS_VDEVNOTSUP, /* unsupported vdev type */ EZFS_NOTSUP, /* ops not supported on this dataset */ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ EZFS_REFTAG_RELE, /* snapshot release: tag not found */ EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ EZFS_PIPEFAILED, /* pipe create failed */ EZFS_THREADCREATEFAILED, /* thread create failed */ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_SCRUBBING, /* currently scrubbing */ EZFS_NO_SCRUB, /* no active 
scrub */ EZFS_DIFF, /* general failure of zfs diff */ EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_SCRUB_PAUSED, /* scrub currently paused */ EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */ EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */ EZFS_NO_CHECKPOINT, /* pool has no checkpoint */ EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */ EZFS_TOOMANY, /* argument list too long */ EZFS_INITIALIZING, /* currently initializing */ EZFS_NO_INITIALIZE, /* no active initialize */ EZFS_UNKNOWN } zfs_error_t; /* * UEFI boot support parameters. When creating whole disk boot pool, * zpool create should allow to create EFI System partition for UEFI boot * program. In case of BIOS, the EFI System partition is not used * even if it does exist. */ typedef enum zpool_boot_label { ZPOOL_NO_BOOT_LABEL = 0, ZPOOL_CREATE_BOOT_LABEL, ZPOOL_COPY_BOOT_LABEL } zpool_boot_label_t; /* * The following data structures are all part * of the zfs_allow_t data structure which is * used for printing 'allow' permissions. * It is a linked list of zfs_allow_t's which * then contain avl tree's for user/group/sets/... * and each one of the entries in those trees have * avl tree's for the permissions they belong to and * whether they are local,descendent or local+descendent * permissions. The AVL trees are used primarily for * sorting purposes, but also so that we can quickly find * a given user and or permission. */ typedef struct zfs_perm_node { avl_node_t z_node; char z_pname[MAXPATHLEN]; } zfs_perm_node_t; typedef struct zfs_allow_node { avl_node_t z_node; char z_key[MAXPATHLEN]; /* name, such as joe */ avl_tree_t z_localdescend; /* local+descendent perms */ avl_tree_t z_local; /* local permissions */ avl_tree_t z_descend; /* descendent permissions */ } zfs_allow_node_t; typedef struct zfs_allow { struct zfs_allow *z_next; char z_setpoint[MAXPATHLEN]; avl_tree_t z_sets; avl_tree_t z_crperms; avl_tree_t z_user; avl_tree_t z_group; avl_tree_t z_everyone; } zfs_allow_t; /* * Basic handle types */ typedef struct zfs_handle zfs_handle_t; typedef struct zpool_handle zpool_handle_t; typedef struct libzfs_handle libzfs_handle_t; /* * Library initialization */ extern libzfs_handle_t *libzfs_init(void); extern void libzfs_fini(libzfs_handle_t *); extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *); extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *); extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); extern void zfs_save_arguments(int argc, char **, char *, int); extern int zpool_log_history(libzfs_handle_t *, const char *); extern int libzfs_errno(libzfs_handle_t *); extern const char *libzfs_error_action(libzfs_handle_t *); extern const char *libzfs_error_description(libzfs_handle_t *); extern int zfs_standard_error(libzfs_handle_t *, int, const char *); extern void libzfs_mnttab_init(libzfs_handle_t *); extern void libzfs_mnttab_fini(libzfs_handle_t *); extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, struct mnttab *); extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, const char *, const char *); extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *); /* * Basic handle functions */ extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, 
const char *); extern void zpool_close(zpool_handle_t *); extern const char *zpool_get_name(zpool_handle_t *); extern int zpool_get_state(zpool_handle_t *); extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t); extern const char *zpool_pool_state_to_name(pool_state_t); extern void zpool_free_handles(libzfs_handle_t *); extern int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *); /* * Iterate over all active pools in the system. */ typedef int (*zpool_iter_f)(zpool_handle_t *, void *); extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); extern boolean_t zpool_skip_pool(const char *); /* * Functions to create and destroy pools */ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); extern int zpool_destroy(zpool_handle_t *, const char *); extern int zpool_add(zpool_handle_t *, nvlist_t *); typedef struct splitflags { /* do not split, but return the config that would be split off */ int dryrun : 1; /* after splitting, import the pool */ int import : 1; } splitflags_t; /* * Functions to manipulate pool and vdev state */ extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); extern int zpool_reguid(zpool_handle_t *); extern int zpool_reopen(zpool_handle_t *); extern int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); extern int zpool_vdev_remove_cancel(zpool_handle_t *); extern int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, splitflags_t); extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *, zpool_boot_label_t, uint64_t, int *); /* * Functions to manage pool properties */ extern int zpool_set_prop(zpool_handle_t *, const char *, const char *); extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, size_t proplen, zprop_source_t *, boolean_t); extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, zprop_source_t *); extern const char *zpool_prop_to_name(zpool_prop_t); extern const char *zpool_prop_values(zpool_prop_t); /* * Pool health statistics. */ typedef enum { /* * The following correspond to faults as defined in the (fault.fs.zfs.*) * event namespace. Each is associated with a corresponding message ID. 
*/ ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */ ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */ ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */ ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */ ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */ ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */ ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */ ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */ ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */ ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ /* * If the pool has unsupported features but can still be opened in * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the * pool has unsupported features but cannot be opened at all, its * status is ZPOOL_STATUS_UNSUP_FEAT_READ. */ ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ /* * These faults have no corresponding message ID. At the time we are * checking the status, the original reason for the FMA fault (I/O or * checksum errors) has been lost. */ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ /* * The following are not faults per se, but still an error possibly * requiring administrative attention. There is no corresponding * message ID. */ ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */ ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ /* * Finally, the following indicates a healthy pool. */ ZPOOL_STATUS_OK } zpool_status_t; extern zpool_status_t zpool_get_status(zpool_handle_t *, char **); extern zpool_status_t zpool_import_status(nvlist_t *, char **); extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); /* * Statistics and configuration functions. */ extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); extern nvlist_t *zpool_get_features(zpool_handle_t *); extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); extern boolean_t zpool_is_bootable(zpool_handle_t *); /* * Import and export functions */ extern int zpool_export(zpool_handle_t *, boolean_t, const char *); extern int zpool_export_force(zpool_handle_t *, const char *); extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, nvlist_t *, int); extern void zpool_print_unsup_feat(nvlist_t *config); /* * Search for pools to import */ typedef struct importargs { char **path; /* a list of paths to search */ int paths; /* number of paths to search */ char *poolname; /* name of a pool to find */ uint64_t guid; /* guid of a pool to find */ char *cachefile; /* cachefile to use for import */ int can_be_active : 1; /* can the pool be active? */ int unique : 1; /* does 'poolname' already exist? 
*/ int exists : 1; /* set on return if pool already exists */ nvlist_t *policy; /* load policy (max txg, rewind, etc.) */ } importargs_t; extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); /* legacy pool search routines */ extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, char *, uint64_t); /* * Miscellaneous pool functions */ struct zfs_cmd; extern const char *zfs_history_event_names[]; extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, boolean_t verbose); extern int zpool_upgrade(zpool_handle_t *, uint64_t); extern int zpool_get_history(zpool_handle_t *, nvlist_t **); extern int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, uint_t *); extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t len); extern int zfs_ioctl(libzfs_handle_t *, int request, struct zfs_cmd *); extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *); extern int zpool_checkpoint(zpool_handle_t *); extern int zpool_discard_checkpoint(zpool_handle_t *); /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. */ extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *); extern void zfs_close(zfs_handle_t *); extern zfs_type_t zfs_get_type(const zfs_handle_t *); extern const char *zfs_get_name(const zfs_handle_t *); extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); extern const char *zfs_get_pool_name(const zfs_handle_t *); /* * Property management functions. Some functions are shared with the kernel, * and are found in sys/fs/zfs.h. 
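A minimal sketch (illustrative, not part of the library) of filling importargs_t for zpool_search_import(): scan one directory for importable pools. The returned nvlist, if non-NULL, is expected to be keyed by pool name and must be freed by the caller with nvlist_free().

#include <libzfs.h>

static nvlist_t *
find_pools_under(libzfs_handle_t *hdl, char *dir)
{
	importargs_t args = { 0 };
	char *paths[1];

	paths[0] = dir;
	args.path = paths;	/* directories to scan */
	args.paths = 1;		/* number of entries in path[] */

	return (zpool_search_import(hdl, &args));
}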
*/ /* * zfs dataset property management */ extern const char *zfs_prop_default_string(zfs_prop_t); extern uint64_t zfs_prop_default_numeric(zfs_prop_t); extern const char *zfs_prop_column_name(zfs_prop_t); extern boolean_t zfs_prop_align_right(zfs_prop_t); extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, const char *); extern const char *zfs_prop_to_name(zfs_prop_t); extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, boolean_t); extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zprop_source_t *, char *, size_t); extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue); extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, char *buf, size_t len); extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); extern const char *zfs_prop_values(zfs_prop_t); extern int zfs_prop_is_string(zfs_prop_t prop); extern nvlist_t *zfs_get_user_props(zfs_handle_t *); extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); typedef struct zprop_list { int pl_prop; char *pl_user_prop; struct zprop_list *pl_next; boolean_t pl_all; size_t pl_width; size_t pl_recvd_width; boolean_t pl_fixed; } zprop_list_t; extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, boolean_t); extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" #define ZFS_FEATURE_DISABLED "disabled" #define ZFS_FEATURE_ENABLED "enabled" #define ZFS_FEATURE_ACTIVE "active" #define ZFS_UNSUPPORTED_INACTIVE "inactive" #define ZFS_UNSUPPORTED_READONLY "readonly" /* * zpool property management */ extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, size_t); extern const char *zpool_prop_default_string(zpool_prop_t); extern uint64_t zpool_prop_default_numeric(zpool_prop_t); extern const char *zpool_prop_column_name(zpool_prop_t); extern boolean_t zpool_prop_align_right(zpool_prop_t); /* * Functions shared by zfs and zpool property management. 
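A small usage sketch for the property getters declared above (not part of the header): fetch a dataset's mountpoint as a string, ignoring source information, and its used space as a number.

#include <stdio.h>
#include <libzfs.h>

static int
print_basic_props(zfs_handle_t *zhp)
{
	char mnt[256];	/* a real caller would size this for the longest mountpoint */
	uint64_t used;

	/* The final boolean_t selects literal (raw) output; B_FALSE keeps the default. */
	if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mnt, sizeof (mnt),
	    NULL, NULL, 0, B_FALSE) != 0)
		return (-1);
	used = zfs_prop_get_int(zhp, ZFS_PROP_USED);
	(void) printf("%s mounted at %s, %llu bytes used\n",
	    zfs_get_name(zhp), mnt, (u_longlong_t)used);
	return (0);
}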
*/ extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type); extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, zfs_type_t); extern void zprop_free_list(zprop_list_t *); #define ZFS_GET_NCOLS 5 typedef enum { GET_COL_NONE, GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_RECVD, GET_COL_SOURCE } zfs_get_column_t; /* * Functions for printing zfs or zpool properties */ typedef struct zprop_get_cbdata { int cb_sources; zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; int cb_colwidths[ZFS_GET_NCOLS + 1]; boolean_t cb_scripted; boolean_t cb_literal; boolean_t cb_first; zprop_list_t *cb_proplist; zfs_type_t cb_type; } zprop_get_cbdata_t; void zprop_print_one_property(const char *, zprop_get_cbdata_t *, const char *, const char *, zprop_source_t, const char *, const char *); /* * Iterator functions. */ typedef int (*zfs_iter_f)(zfs_handle_t *, void *); extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *); extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); typedef struct get_all_cb { zfs_handle_t **cb_handles; size_t cb_alloc; size_t cb_used; } get_all_cb_t; void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t, zfs_iter_f, void*, boolean_t); void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); /* * Functions to create and destroy datasets. */ extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, nvlist_t *); extern int zfs_create_ancestors(libzfs_handle_t *, const char *); extern int zfs_destroy(zfs_handle_t *, boolean_t); extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props); extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); typedef struct renameflags { /* recursive rename */ int recurse : 1; /* don't unmount file systems */ int nounmount : 1; /* force unmount file systems */ int forceunmount : 1; } renameflags_t; extern int zfs_rename(zfs_handle_t *, const char *, const char *, renameflags_t flags); typedef struct sendflags { /* print informational messages (ie, -v was specified) */ boolean_t verbose; /* recursive send (ie, -R) */ boolean_t replicate; /* for incrementals, do all intermediate snapshots */ boolean_t doall; /* if dataset is a clone, do incremental from its origin */ boolean_t fromorigin; /* do deduplication */ boolean_t dedup; /* send properties (ie, -p) */ boolean_t props; /* do not send (no-op, ie. -n) */ boolean_t dryrun; /* parsable verbose output (ie. -P) */ boolean_t parsable; /* show progress (ie. -v) */ boolean_t progress; /* large blocks (>128K) are permitted */ boolean_t largeblock; /* WRITE_EMBEDDED records of type DATA are permitted */ boolean_t embed_data; /* compressed WRITE records are permitted */ boolean_t compress; /* show progress as process title(ie. 
-V) */ boolean_t progressastitle; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); extern int zfs_send(zfs_handle_t *, const char *, const char *, sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); -extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags); +extern int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t flags); extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, const char *); extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token); extern int zfs_promote(zfs_handle_t *); extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, int); extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, zfs_userspace_cb_t, void *); extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ boolean_t verbose; /* the destination is a prefix, not the exact fs (ie, -d) */ boolean_t isprefix; /* * Only the tail of the sent snapshot path is appended to the * destination to determine the received snapshot name (ie, -e). */ boolean_t istail; /* do not actually do the recv, just check if it would work (ie, -n) */ boolean_t dryrun; /* rollback/destroy filesystems as necessary (eg, -F) */ boolean_t force; /* set "canmount=off" on all modified filesystems */ boolean_t canmountoff; /* * Mark the file systems as "resumable" and do not destroy them if the * receive is interrupted */ boolean_t resumable; /* byteswap flag is used internally; callers need not specify */ boolean_t byteswap; /* do not mount file systems as they are extracted (private) */ boolean_t nomount; } recvflags_t; extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, recvflags_t *, int, avl_tree_t *); typedef enum diff_flags { ZFS_DIFF_PARSEABLE = 0x1, ZFS_DIFF_TIMESTAMP = 0x2, ZFS_DIFF_CLASSIFY = 0x4 } diff_flags_t; extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, int); /* * Miscellaneous functions. */ extern const char *zfs_type_to_name(zfs_type_t); extern void zfs_refresh_properties(zfs_handle_t *); extern int zfs_name_valid(const char *, zfs_type_t); extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t); extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, zfs_type_t); extern int zfs_spa_version(zfs_handle_t *, int *); extern boolean_t zfs_bookmark_exists(const char *path); /* * Mount support functions. */ extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); extern boolean_t zfs_is_mounted(zfs_handle_t *, char **); extern int zfs_mount(zfs_handle_t *, const char *, int); extern int zfs_unmount(zfs_handle_t *, const char *, int); extern int zfs_unmountall(zfs_handle_t *, int); /* * Share support functions. */ extern boolean_t zfs_is_shared(zfs_handle_t *); extern int zfs_share(zfs_handle_t *); extern int zfs_unshare(zfs_handle_t *); /* * Protocol-specific share support functions. 
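To illustrate the signature change above, a minimal caller sketch (not from the source tree), assuming zhp is an open handle for the snapshot to send, from is the optional incremental source, and fd is the output descriptor: zfs_send_one() now takes the caller's sendflags_t by value instead of a pre-built enum lzc_send_flags mask.

#include <libzfs.h>

static int
send_single_snapshot(zfs_handle_t *zhp, const char *from, int fd)
{
	sendflags_t flags = { 0 };

	flags.largeblock = B_TRUE;	/* allow blocks larger than 128K in the stream */
	flags.embed_data = B_TRUE;	/* allow WRITE_EMBEDDED records */
	flags.compress = B_TRUE;	/* allow compressed WRITE records */

	/* With this change the flags struct is passed directly. */
	return (zfs_send_one(zhp, from, fd, flags));
}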
*/ extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **); extern int zfs_share_nfs(zfs_handle_t *); extern int zfs_share_smb(zfs_handle_t *); extern int zfs_shareall(zfs_handle_t *); extern int zfs_unshare_nfs(zfs_handle_t *, const char *); extern int zfs_unshare_smb(zfs_handle_t *, const char *); extern int zfs_unshareall_nfs(zfs_handle_t *); extern int zfs_unshareall_smb(zfs_handle_t *); extern int zfs_unshareall_bypath(zfs_handle_t *, const char *); extern int zfs_unshareall(zfs_handle_t *); extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); /* * FreeBSD-specific jail support function. */ extern int zfs_jail(zfs_handle_t *, int, int); /* * When dealing with nvlists, verify() is extremely useful */ #ifndef verify #ifdef NDEBUG #define verify(EX) ((void)(EX)) #else #define verify(EX) assert(EX) #endif #endif /* * Utility function to convert a number to a human-readable form. */ extern void zfs_nicenum(uint64_t, char *, size_t); extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); /* * Given a device or file, determine if it is part of a pool. */ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, boolean_t *); /* * Label manipulation. */ extern int zpool_read_label(int, nvlist_t **); extern int zpool_read_all_labels(int, nvlist_t **); extern int zpool_clear_label(int); /* is this zvol valid for use as a dump device? */ extern int zvol_check_dump_config(char *); /* * Management interfaces for SMB ACL files */ int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); /* * Enable and disable datasets within a pool by mounting/unmounting and * sharing/unsharing them. */ extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); /* * Mappings between vdev and FRU. */ extern void libzfs_fru_refresh(libzfs_handle_t *); extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *); extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *); extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *, const char *); extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *); extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); #ifndef illumos extern int zmount(const char *, const char *, int, char *, char *, int, char *, int); #endif extern int zfs_remap_indirects(libzfs_handle_t *hdl, const char *); /* Allow consumers to initialize libshare externally for optimal performance */ extern int zfs_init_libshare_arg(libzfs_handle_t *, int, void *); /* * For most consumers, zfs_init_libshare_arg is sufficient on its own, and * zfs_uninit_libshare is unnecessary. zfs_uninit_libshare should only be called * if the caller has already initialized libshare for one set of zfs handles, * and wishes to share or unshare filesystems outside of that set. In that case, * the caller should uninitialize libshare, and then re-initialize it with the * new handles being shared or unshared. 
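A short sketch of the verify() idiom described above (illustrative): nvlist construction calls that cannot reasonably fail are wrapped so that a failure trips an assertion in debug builds, while under NDEBUG the expression is still evaluated (unlike a plain assert(), which would discard it).

#include <libzfs.h>

static nvlist_t *
make_tag_nvlist(void)
{
	nvlist_t *nv = NULL;

	/* verify() keeps the side effect even when assertions are compiled out. */
	verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
	verify(nvlist_add_string(nv, "comment", "example") == 0);
	verify(nvlist_add_uint64(nv, "generation", 1) == 0);
	return (nv);
}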
*/ extern void zfs_uninit_libshare(libzfs_handle_t *); #ifdef __cplusplus } #endif #endif /* _LIBZFS_H */ Index: projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c (revision 352536) +++ projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c (revision 352537) @@ -1,3834 +1,3855 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Igor Kozhukhov */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" #include #include #include #include #ifdef __FreeBSD__ extern int zfs_ioctl_version; #endif /* in libzfs_dataset.c */ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); /* We need to use something for ENODATA. 
*/ #define ENODATA EIDRM static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *, const char *); static int guid_to_name(libzfs_handle_t *, const char *, uint64_t, boolean_t, char *); static const zio_cksum_t zero_cksum = { 0 }; typedef struct dedup_arg { int inputfd; int outputfd; libzfs_handle_t *dedup_hdl; } dedup_arg_t; typedef struct progress_arg { zfs_handle_t *pa_zhp; int pa_fd; boolean_t pa_parsable; boolean_t pa_astitle; uint64_t pa_size; } progress_arg_t; typedef struct dataref { uint64_t ref_guid; uint64_t ref_object; uint64_t ref_offset; } dataref_t; typedef struct dedup_entry { struct dedup_entry *dde_next; zio_cksum_t dde_chksum; uint64_t dde_prop; dataref_t dde_ref; } dedup_entry_t; #define MAX_DDT_PHYSMEM_PERCENT 20 #define SMALLEST_POSSIBLE_MAX_DDT_MB 128 typedef struct dedup_table { dedup_entry_t **dedup_hash_array; umem_cache_t *ddecache; uint64_t max_ddt_size; /* max dedup table size in bytes */ uint64_t cur_ddt_size; /* current dedup table size in bytes */ uint64_t ddt_count; int numhashbits; boolean_t ddt_full; } dedup_table_t; static int high_order_bit(uint64_t n) { int count; for (count = 0; n != 0; count++) n >>= 1; return (count); } static size_t ssread(void *buf, size_t len, FILE *stream) { size_t outlen; if ((outlen = fread(buf, len, 1, stream)) == 0) return (0); return (outlen); } static void ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp, zio_cksum_t *cs, uint64_t prop, dataref_t *dr) { dedup_entry_t *dde; if (ddt->cur_ddt_size >= ddt->max_ddt_size) { if (ddt->ddt_full == B_FALSE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Dedup table full. Deduplication will continue " "with existing table entries")); ddt->ddt_full = B_TRUE; } return; } if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT)) != NULL) { assert(*ddepp == NULL); dde->dde_next = NULL; dde->dde_chksum = *cs; dde->dde_prop = prop; dde->dde_ref = *dr; *ddepp = dde; ddt->cur_ddt_size += sizeof (dedup_entry_t); ddt->ddt_count++; } } /* * Using the specified dedup table, do a lookup for an entry with * the checksum cs. If found, return the block's reference info * in *dr. Otherwise, insert a new entry in the dedup table, using * the reference information specified by *dr. * * return value: true - entry was found * false - entry was not found */ static boolean_t ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs, uint64_t prop, dataref_t *dr) { uint32_t hashcode; dedup_entry_t **ddepp; hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits); for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL; ddepp = &((*ddepp)->dde_next)) { if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) && (*ddepp)->dde_prop == prop) { *dr = (*ddepp)->dde_ref; return (B_TRUE); } } ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr); return (B_FALSE); } static int dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, zio_cksum_t *zc, int outfd) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); (void) fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. 
drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } (void) fletcher_4_incremental_native( &drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { (void) fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } /* * This function is started in a separate thread when the dedup option * has been requested. The main send thread determines the list of * snapshots to be included in the send stream and makes the ioctl calls * for each one. But instead of having the ioctl send the output to the * the output fd specified by the caller of zfs_send()), the * ioctl is told to direct the output to a pipe, which is read by the * alternate thread running THIS function. This function does the * dedup'ing by: * 1. building a dedup table (the DDT) * 2. doing checksums on each data block and inserting a record in the DDT * 3. looking for matching checksums, and * 4. sending a DRR_WRITE_BYREF record instead of a write record whenever * a duplicate block is found. * The output of this function then goes to the output fd requested * by the caller of zfs_send(). */ static void * cksummer(void *arg) { dedup_arg_t *dda = arg; char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE); dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; FILE *ofp; int outfd; dedup_table_t ddt; zio_cksum_t stream_cksum; uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); uint64_t numbuckets; ddt.max_ddt_size = MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100, SMALLEST_POSSIBLE_MAX_DDT_MB << 20); numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t)); /* * numbuckets must be a power of 2. Increase number to * a power of 2 if necessary. */ if (!ISP2(numbuckets)) numbuckets = 1 << high_order_bit(numbuckets); ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *)); ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *); ddt.numhashbits = high_order_bit(numbuckets) - 1; ddt.ddt_full = B_FALSE; outfd = dda->outputfd; ofp = fdopen(dda->inputfd, "r"); while (ssread(drr, sizeof (*drr), ofp) != 0) { /* * kernel filled in checksum, we are going to write same * record, but need to regenerate checksum. 
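A worked example of the dedup-table bucket sizing done a few lines above in cksummer() (illustrative numbers only):

/*
 * Example: if max_ddt_size / sizeof (dedup_entry_t) yields 1000 buckets,
 * high_order_bit(1000) == 10 (1000 needs 10 bits), so numbuckets is rounded
 * up to 1 << 10 == 1024.  numhashbits is then high_order_bit(1024) - 1 == 10,
 * i.e. the low 10 bits of the checksum index the 1024 hash buckets.
 */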
*/ if (drr->drr_type != DRR_BEGIN) { bzero(&drr->drr_u.drr_checksum.drr_checksum, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } switch (drr->drr_type) { case DRR_BEGIN: { struct drr_begin *drrb = &drr->drr_u.drr_begin; int fflags; int sz = 0; ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); /* set the DEDUP feature flag for this stream */ fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); fflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); if (drr->drr_payloadlen != 0) { sz = drr->drr_payloadlen; if (sz > SPA_MAXBLOCKSIZE) { buf = zfs_realloc(dda->dedup_hdl, buf, SPA_MAXBLOCKSIZE, sz); } (void) ssread(buf, sz, ofp); if (ferror(stdin)) perror("fread"); } if (dump_record(drr, buf, sz, &stream_cksum, outfd) != 0) goto out; break; } case DRR_END: { struct drr_end *drre = &drr->drr_u.drr_end; /* use the recalculated checksum */ drre->drr_checksum = stream_cksum; if (dump_record(drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; break; } case DRR_OBJECT: { struct drr_object *drro = &drr->drr_u.drr_object; if (drro->drr_bonuslen > 0) { (void) ssread(buf, P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), ofp); } if (dump_record(drr, buf, P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), &stream_cksum, outfd) != 0) goto out; break; } case DRR_SPILL: { struct drr_spill *drrs = &drr->drr_u.drr_spill; (void) ssread(buf, drrs->drr_length, ofp); if (dump_record(drr, buf, drrs->drr_length, &stream_cksum, outfd) != 0) goto out; break; } case DRR_FREEOBJECTS: { if (dump_record(drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; break; } case DRR_WRITE: { struct drr_write *drrw = &drr->drr_u.drr_write; dataref_t dataref; uint64_t payload_size; payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); (void) ssread(buf, payload_size, ofp); /* * Use the existing checksum if it's dedup-capable, * else calculate a SHA256 checksum for it. 
*/ if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum, zero_cksum) || !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) { SHA256_CTX ctx; zio_cksum_t tmpsha256; SHA256Init(&ctx); SHA256Update(&ctx, buf, payload_size); SHA256Final(&tmpsha256, &ctx); drrw->drr_key.ddk_cksum.zc_word[0] = BE_64(tmpsha256.zc_word[0]); drrw->drr_key.ddk_cksum.zc_word[1] = BE_64(tmpsha256.zc_word[1]); drrw->drr_key.ddk_cksum.zc_word[2] = BE_64(tmpsha256.zc_word[2]); drrw->drr_key.ddk_cksum.zc_word[3] = BE_64(tmpsha256.zc_word[3]); drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256; drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP; } dataref.ref_guid = drrw->drr_toguid; dataref.ref_object = drrw->drr_object; dataref.ref_offset = drrw->drr_offset; if (ddt_update(dda->dedup_hdl, &ddt, &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop, &dataref)) { dmu_replay_record_t wbr_drr = {0}; struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref; /* block already present in stream */ wbr_drr.drr_type = DRR_WRITE_BYREF; wbr_drrr->drr_object = drrw->drr_object; wbr_drrr->drr_offset = drrw->drr_offset; wbr_drrr->drr_length = drrw->drr_logical_size; wbr_drrr->drr_toguid = drrw->drr_toguid; wbr_drrr->drr_refguid = dataref.ref_guid; wbr_drrr->drr_refobject = dataref.ref_object; wbr_drrr->drr_refoffset = dataref.ref_offset; wbr_drrr->drr_checksumtype = drrw->drr_checksumtype; wbr_drrr->drr_checksumflags = drrw->drr_checksumtype; wbr_drrr->drr_key.ddk_cksum = drrw->drr_key.ddk_cksum; wbr_drrr->drr_key.ddk_prop = drrw->drr_key.ddk_prop; if (dump_record(&wbr_drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; } else { /* block not previously seen */ if (dump_record(drr, buf, payload_size, &stream_cksum, outfd) != 0) goto out; } break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &drr->drr_u.drr_write_embedded; (void) ssread(buf, P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp); if (dump_record(drr, buf, P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), &stream_cksum, outfd) != 0) goto out; break; } case DRR_FREE: { if (dump_record(drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; break; } default: (void) fprintf(stderr, "INVALID record type 0x%x\n", drr->drr_type); /* should never happen, so assert */ assert(B_FALSE); } } out: umem_cache_destroy(ddt.ddecache); free(ddt.dedup_hash_array); free(buf); (void) fclose(ofp); return (NULL); } /* * Routines for dealing with the AVL tree of fs-nvlists */ typedef struct fsavl_node { avl_node_t fn_node; nvlist_t *fn_nvfs; char *fn_snapname; uint64_t fn_guid; } fsavl_node_t; static int fsavl_compare(const void *arg1, const void *arg2) { const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; return (AVL_CMP(fn1->fn_guid, fn2->fn_guid)); } /* * Given the GUID of a snapshot, find its containing filesystem and * (optionally) name. 
*/ static nvlist_t * fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname) { fsavl_node_t fn_find; fsavl_node_t *fn; fn_find.fn_guid = snapguid; fn = avl_find(avl, &fn_find, NULL); if (fn) { if (snapname) *snapname = fn->fn_snapname; return (fn->fn_nvfs); } return (NULL); } static void fsavl_destroy(avl_tree_t *avl) { fsavl_node_t *fn; void *cookie; if (avl == NULL) return; cookie = NULL; while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL) free(fn); avl_destroy(avl); free(avl); } /* * Given an nvlist, produce an avl tree of snapshots, ordered by guid */ static avl_tree_t * fsavl_create(nvlist_t *fss) { avl_tree_t *fsavl; nvpair_t *fselem = NULL; if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL) return (NULL); avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t), offsetof(fsavl_node_t, fn_node)); while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) { nvlist_t *nvfs, *snaps; nvpair_t *snapelem = NULL; VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); while ((snapelem = nvlist_next_nvpair(snaps, snapelem)) != NULL) { fsavl_node_t *fn; uint64_t guid; VERIFY(0 == nvpair_value_uint64(snapelem, &guid)); if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) { fsavl_destroy(fsavl); return (NULL); } fn->fn_nvfs = nvfs; fn->fn_snapname = nvpair_name(snapelem); fn->fn_guid = guid; /* * Note: if there are multiple snaps with the * same GUID, we ignore all but one. */ if (avl_find(fsavl, fn, NULL) == NULL) avl_add(fsavl, fn); else free(fn); } } return (fsavl); } /* * Routines for dealing with the giant nvlist of fs-nvlists, etc. */ typedef struct send_data { /* * assigned inside every recursive call, * restored from *_save on return: * * guid of fromsnap snapshot in parent dataset * txg of fromsnap snapshot in current dataset * txg of tosnap snapshot in current dataset */ uint64_t parent_fromsnap_guid; uint64_t fromsnap_txg; uint64_t tosnap_txg; /* the nvlists get accumulated during depth-first traversal */ nvlist_t *parent_snaps; nvlist_t *fss; nvlist_t *snapprops; /* send-receive configuration, does not change during traversal */ const char *fsname; const char *fromsnap; const char *tosnap; boolean_t recursive; boolean_t verbose; /* * The header nvlist is of the following format: * { * "tosnap" -> string * "fromsnap" -> string (if incremental) * "fss" -> { * id -> { * * "name" -> string (full name; for debugging) * "parentfromsnap" -> number (guid of fromsnap in parent) * * "props" -> { name -> value (only if set here) } * "snaps" -> { name (lastname) -> number (guid) } * "snapprops" -> { name (lastname) -> { name -> value } } * * "origin" -> number (guid) (if clone) * "sent" -> boolean (not on-disk) * } * } * } * */ } send_data_t; static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv); static int send_iterate_snap(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; char *snapname; nvlist_t *nv; snapname = strrchr(zhp->zfs_name, '@')+1; if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping snapshot %s because it was created " "after the destination snapshot (%s)\n"), zhp->zfs_name, sd->tosnap); } zfs_close(zhp); return (0); } VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid)); /* * NB: if there is no fromsnap here (it's a newly created fs in * an incremental replication), we will substitute the tosnap. 
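A small consumer-side sketch (not from the source) of walking the header nvlist whose layout is documented above: each entry of "fss" is itself an nvlist holding the per-filesystem "name", "props", and "snaps" members.

#include <stdio.h>
#include <libnvpair.h>

static void
print_stream_filesystems(nvlist_t *hdrnv)
{
	nvlist_t *fss;
	nvpair_t *elem = NULL;

	if (nvlist_lookup_nvlist(hdrnv, "fss", &fss) != 0)
		return;
	while ((elem = nvlist_next_nvpair(fss, elem)) != NULL) {
		nvlist_t *nvfs;
		char *name;

		/* The pair name is the guid string; "name" is the full dataset name. */
		if (nvpair_value_nvlist(elem, &nvfs) == 0 &&
		    nvlist_lookup_string(nvfs, "name", &name) == 0)
			(void) printf("%s -> %s\n", nvpair_name(elem), name);
	}
}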
*/ if ((sd->fromsnap && strcmp(snapname, sd->fromsnap) == 0) || (sd->parent_fromsnap_guid == 0 && sd->tosnap && strcmp(snapname, sd->tosnap) == 0)) { sd->parent_fromsnap_guid = guid; } VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); send_iterate_prop(zhp, nv); VERIFY(0 == nvlist_add_nvlist(sd->snapprops, snapname, nv)); nvlist_free(nv); zfs_close(zhp); return (0); } static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(zhp->zfs_props, elem)) != NULL) { char *propname = nvpair_name(elem); zfs_prop_t prop = zfs_name_to_prop(propname); nvlist_t *propnv; if (!zfs_prop_user(propname)) { /* * Realistically, this should never happen. However, * we want the ability to add DSL properties without * needing to make incompatible version changes. We * need to ignore unknown properties to allow older * software to still send datasets containing these * properties, with the unknown properties elided. */ if (prop == ZPROP_INVAL) continue; if (zfs_prop_readonly(prop)) continue; } verify(nvpair_value_nvlist(elem, &propnv) == 0); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { char *source; uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) continue; /* * May have no source before SPA_VERSION_RECVD_PROPS, * but is still modifiable. */ if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) == 0) { if ((strcmp(source, zhp->zfs_name) != 0) && (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } } else { char *source; if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) != 0) continue; if ((strcmp(source, zhp->zfs_name) != 0) && (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } if (zfs_prop_user(propname) || zfs_prop_get_type(prop) == PROP_TYPE_STRING) { char *value; verify(nvlist_lookup_string(propnv, ZPROP_VALUE, &value) == 0); VERIFY(0 == nvlist_add_string(nv, propname, value)); } else { uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); VERIFY(0 == nvlist_add_uint64(nv, propname, value)); } } } /* * returns snapshot creation txg * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t txg = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (txg); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG); zfs_close(zhp); } } return (txg); } /* * recursively generate nvlists describing datasets. See comment * for the data structure send_data_t above for description of contents * of the nvlist. 
*/ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; nvlist_t *nvfs, *nv; int rv = 0; uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; uint64_t fromsnap_txg_save = sd->fromsnap_txg; uint64_t tosnap_txg_save = sd->tosnap_txg; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t fromsnap_txg, tosnap_txg; char guidstring[64]; fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); if (fromsnap_txg != 0) sd->fromsnap_txg = fromsnap_txg; tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); if (tosnap_txg != 0) sd->tosnap_txg = tosnap_txg; /* * on the send side, if the current dataset does not have tosnap, * perform two additional checks: * * - skip sending the current dataset if it was created later than * the parent tosnap * - return error if the current dataset was created earlier than * the parent tosnap */ if (sd->tosnap != NULL && tosnap_txg == 0) { if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping dataset %s: snapshot %s does " "not exist\n"), zhp->zfs_name, sd->tosnap); } } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s%s: snapshot %s@%s does not " "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? dgettext(TEXT_DOMAIN, " recursively") : "", zhp->zfs_name, sd->tosnap); rv = -1; } goto out; } VERIFY(0 == nvlist_alloc(&nvfs, NV_UNIQUE_NAME, 0)); VERIFY(0 == nvlist_add_string(nvfs, "name", zhp->zfs_name)); VERIFY(0 == nvlist_add_uint64(nvfs, "parentfromsnap", sd->parent_fromsnap_guid)); if (zhp->zfs_dmustats.dds_origin[0]) { zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); if (origin == NULL) { rv = -1; goto out; } VERIFY(0 == nvlist_add_uint64(nvfs, "origin", origin->zfs_dmustats.dds_guid)); } /* iterate over props */ VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); send_iterate_prop(zhp, nv); VERIFY(0 == nvlist_add_nvlist(nvfs, "props", nv)); nvlist_free(nv); /* iterate over snaps, and set sd->parent_fromsnap_guid */ sd->parent_fromsnap_guid = 0; VERIFY(0 == nvlist_alloc(&sd->parent_snaps, NV_UNIQUE_NAME, 0)); VERIFY(0 == nvlist_alloc(&sd->snapprops, NV_UNIQUE_NAME, 0)); (void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd); VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps)); VERIFY(0 == nvlist_add_nvlist(nvfs, "snapprops", sd->snapprops)); nvlist_free(sd->parent_snaps); nvlist_free(sd->snapprops); /* add this fs to nvlist */ (void) snprintf(guidstring, sizeof (guidstring), "0x%llx", (longlong_t)guid); VERIFY(0 == nvlist_add_nvlist(sd->fss, guidstring, nvfs)); nvlist_free(nvfs); /* iterate over children */ if (sd->recursive) rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); out: sd->parent_fromsnap_guid = parent_fromsnap_guid_save; sd->fromsnap_txg = fromsnap_txg_save; sd->tosnap_txg = tosnap_txg_save; zfs_close(zhp); return (rv); } static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, const char *tosnap, boolean_t recursive, boolean_t verbose, nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; int error; zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (EZFS_BADTYPE); VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); sd.fsname = fsname; sd.fromsnap = fromsnap; sd.tosnap = tosnap; sd.recursive = recursive; sd.verbose = verbose; if ((error = send_iterate_fs(zhp, &sd)) != 0) { 
nvlist_free(sd.fss); if (avlp != NULL) *avlp = NULL; *nvlp = NULL; return (error); } if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { nvlist_free(sd.fss); *nvlp = NULL; return (EZFS_NOMEM); } *nvlp = sd.fss; return (0); } /* * Routines specific to "zfs send" */ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t verbose, dryrun, parsable, progress, embed_data, std_out; boolean_t progressastitle; boolean_t large_block, compress; int outfd; boolean_t err; nvlist_t *fss; nvlist_t *snapholds; avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; nvlist_t *debugnv; char holdtag[ZFS_MAX_DATASET_NAME_LEN]; int cleanup_fd; uint64_t size; } send_dump_data_t; static int estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, boolean_t fromorigin, enum lzc_send_flags flags, uint64_t *sizep) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_guid = 1; /* estimate flag */ zc.zc_flags = flags; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot estimate space for '%s'"), zhp->zfs_name); switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } *sizep = zc.zc_objset_type; return (0); } /* * Dumps a backup of the given snapshot (incremental from fromsnap if it's not * NULL) to the file descriptor specified by outfd. 
*/ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, boolean_t fromorigin, int outfd, enum lzc_send_flags flags, nvlist_t *debugnv) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *thisdbg; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = outfd; zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_flags = flags; VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); if (fromsnap && fromsnap[0] != '\0') { VERIFY(0 == nvlist_add_string(thisdbg, "fromsnap", fromsnap)); } if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno)); if (debugnv) { VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); } nvlist_free(thisdbg); switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: #ifdef illumos case ENOSTR: #endif case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } if (debugnv) VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); nvlist_free(thisdbg); return (0); } static void gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) { assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); /* * zfs_send() only sets snapholds for sends that need them, * e.g. replication and doall. */ if (sdd->snapholds == NULL) return; fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; zfs_cmd_t zc = { 0 }; zfs_handle_t *zhp = pa->pa_zhp; libzfs_handle_t *hdl = zhp->zfs_hdl; unsigned long long bytes, total; char buf[16]; time_t t; struct tm *tm; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (!pa->pa_parsable && !pa->pa_astitle) (void) fprintf(stderr, "TIME SENT SNAPSHOT\n"); /* * Print the progress from ZFS_IOC_SEND_PROGRESS every second. 
*/ for (;;) { (void) sleep(1); zc.zc_cookie = pa->pa_fd; if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) return ((void *)-1); (void) time(&t); tm = localtime(&t); bytes = zc.zc_cookie; if (pa->pa_astitle) { int pct; if (pa->pa_size > bytes) pct = 100 * bytes / pa->pa_size; else pct = 100; setproctitle("sending %s (%d%%: %llu/%llu)", zhp->zfs_name, pct, bytes, pa->pa_size); } else if (pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, bytes, zhp->zfs_name); } else { zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, buf, zhp->zfs_name); } } } static void send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, uint64_t size, boolean_t parsable) { if (parsable) { if (fromsnap != NULL) { (void) fprintf(fout, "incremental\t%s\t%s", fromsnap, tosnap); } else { (void) fprintf(fout, "full\t%s", tosnap); } } else { if (fromsnap != NULL) { if (strchr(fromsnap, '@') == NULL && strchr(fromsnap, '#') == NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from @%s to %s"), fromsnap, tosnap); } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from %s to %s"), fromsnap, tosnap); } } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full send of %s"), tosnap); } } - if (size != 0) { - if (parsable) { - (void) fprintf(fout, "\t%llu", - (longlong_t)size); - } else { - char buf[16]; - zfs_nicenum(size, buf, sizeof (buf)); - (void) fprintf(fout, dgettext(TEXT_DOMAIN, - " estimated size is %s"), buf); - } + if (parsable) { + (void) fprintf(fout, "\t%llu", + (longlong_t)size); + } else if (size != 0) { + char buf[16]; + zfs_nicenum(size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + " estimated size is %s"), buf); } (void) fprintf(fout, "\n"); } static int dump_snapshot(zfs_handle_t *zhp, void *arg) { send_dump_data_t *sdd = arg; progress_arg_t pa = { 0 }; pthread_t tid; char *thissnap; enum lzc_send_flags flags = 0; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; FILE *fout = sdd->std_out ? stdout : stderr; uint64_t size = 0; err = 0; thissnap = strchr(zhp->zfs_name, '@') + 1; isfromsnap = (sdd->fromsnap != NULL && strcmp(sdd->fromsnap, thissnap) == 0); if (!sdd->seenfrom && isfromsnap) { gather_holds(zhp, sdd); sdd->seenfrom = B_TRUE; (void) strcpy(sdd->prevsnap, thissnap); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (0); } if (sdd->seento || !sdd->seenfrom) { zfs_close(zhp); return (0); } istosnap = (strcmp(sdd->tosnap, thissnap) == 0); if (istosnap) sdd->seento = B_TRUE; if (sdd->large_block) flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (sdd->embed_data) flags |= LZC_SEND_FLAG_EMBED_DATA; if (sdd->compress) flags |= LZC_SEND_FLAG_COMPRESS; if (!sdd->doall && !isfromsnap && !istosnap) { if (sdd->replicate) { char *snapname; nvlist_t *snapprops; /* * Filter out all intermediate snapshots except origin * snapshots needed to replicate clones. */ nvlist_t *nvfs = fsavl_find(sdd->fsavl, zhp->zfs_dmustats.dds_guid, &snapname); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snapprops", &snapprops)); VERIFY(0 == nvlist_lookup_nvlist(snapprops, thissnap, &snapprops)); exclude = !nvlist_exists(snapprops, "is_clone_origin"); } else { exclude = B_TRUE; } } /* * If a filter function exists, call it to determine whether * this snapshot will be sent. 
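The send_print_verbose() hunk above changes what parsable (-P) verbose output looks like: the trailing size column is now emitted unconditionally, while human-readable output is unchanged and still appends an estimate only when it is non-zero. Illustrative lines (values made up):

	incremental	<fromsnap>	<tosnap>	0	(parsable; size column now present even when the estimate is 0)
	full	<tosnap>	1048576			(parsable, non-zero estimate)
	full send of <tosnap> estimated size is 1M	(human-readable; size text still omitted when the estimate is 0)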
*/ if (exclude || (sdd->filter_cb != NULL && sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { /* * This snapshot is filtered out. Don't send it, and don't * set prevsnap_obj, so it will be as if this snapshot didn't * exist, and the next accepted snapshot will be sent as * an incremental from the last accepted one, or as the * first (and full) snapshot in the case of a replication, * non-incremental send. */ zfs_close(zhp); return (0); } gather_holds(zhp, sdd); fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); if (sdd->progress && sdd->dryrun) { (void) estimate_ioctl(zhp, sdd->prevsnap_obj, fromorigin, flags, &size); sdd->size += size; } if (sdd->verbose) { send_print_verbose(fout, zhp->zfs_name, sdd->prevsnap[0] ? sdd->prevsnap : NULL, size, sdd->parsable); } if (!sdd->dryrun) { /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (sdd->progress) { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; pa.pa_size = sdd->size; pa.pa_astitle = sdd->progressastitle; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { zfs_close(zhp); return (err); } } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); if (sdd->progress) { (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); } } (void) strcpy(sdd->prevsnap, thissnap); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (err); } static int dump_filesystem(zfs_handle_t *zhp, void *arg) { int rv = 0; send_dump_data_t *sdd = arg; boolean_t missingfrom = B_FALSE; zfs_cmd_t zc = { 0 }; (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); sdd->err = B_TRUE; return (0); } if (sdd->replicate && sdd->fromsnap) { /* * If this fs does not have fromsnap, and we're doing * recursive, we need to send a full stream from the * beginning (or an incremental from the origin if this * is a clone). If we're doing non-recursive, then let * them get the error. */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->fromsnap); if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { missingfrom = B_TRUE; } } sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; sdd->prevsnap_obj = 0; if (sdd->fromsnap == NULL || missingfrom) sdd->seenfrom = B_TRUE; rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg); if (!sdd->seenfrom) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) does not exist\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { if (sdd->fromsnap) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) " "is not earlier than it\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: " "could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); } sdd->err = B_TRUE; } return (rv); } static int dump_filesystems(zfs_handle_t *rzhp, void *arg) { send_dump_data_t *sdd = arg; nvpair_t *fspair; boolean_t needagain, progress; if (!sdd->replicate) return (dump_filesystem(rzhp, sdd)); /* Mark the clone origin snapshots. 
*/ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *nvfs; uint64_t origin_guid = 0; VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs)); (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); if (origin_guid != 0) { char *snapname; nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, &snapname); if (origin_nv != NULL) { nvlist_t *snapprops; VERIFY(0 == nvlist_lookup_nvlist(origin_nv, "snapprops", &snapprops)); VERIFY(0 == nvlist_lookup_nvlist(snapprops, snapname, &snapprops)); VERIFY(0 == nvlist_add_boolean( snapprops, "is_clone_origin")); } } } again: needagain = progress = B_FALSE; for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist, *parent_nv; char *fsname; zfs_handle_t *zhp; int err; uint64_t origin_guid = 0; uint64_t parent_guid = 0; VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); if (nvlist_lookup_boolean(fslist, "sent") == 0) continue; VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); (void) nvlist_lookup_uint64(fslist, "parentfromsnap", &parent_guid); if (parent_guid != 0) { parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); if (!nvlist_exists(parent_nv, "sent")) { /* parent has not been sent; skip this one */ needagain = B_TRUE; continue; } } if (origin_guid != 0) { nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL); if (origin_nv != NULL && !nvlist_exists(origin_nv, "sent")) { /* * origin has not been sent yet; * skip this clone. */ needagain = B_TRUE; continue; } } zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); err = dump_filesystem(zhp, sdd); VERIFY(nvlist_add_boolean(fslist, "sent") == 0); progress = B_TRUE; zfs_close(zhp); if (err) return (err); } if (needagain) { assert(progress); goto again; } /* clean out the sent flags in case we reuse this fss */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist; VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); (void) nvlist_remove_all(fslist, "sent"); } return (0); } nvlist_t * zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) { unsigned int version; int nread; unsigned long long checksum, packed_len; /* * Decode token header, which is: * -- * Note that the only supported token version is 1. 
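For reference, a caller-side sketch (not from the source) of the decode path implemented below, assuming the token string was obtained from a partially received dataset's receive_resume_token property: the header is "<version>-<checksum>-<payload length>-" with the last two fields in hex, followed by the hex-encoded, compressed, packed nvlist.

#include <stdio.h>
#include <libzfs.h>

static void
show_resume_token(libzfs_handle_t *hdl, const char *token)
{
	nvlist_t *nv = zfs_send_resume_token_to_nvlist(hdl, token);

	if (nv == NULL) {
		(void) fprintf(stderr, "token did not decode\n");
		return;
	}
	nvlist_print(stdout, nv);	/* shows toname, toguid, object, offset, bytes, ... */
	nvlist_free(nv);
}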
*/ nread = sscanf(token, "%u-%llx-%llx-", &version, &checksum, &packed_len); if (nread != 3) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid format)")); return (NULL); } if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid version %u)"), version); return (NULL); } /* convert hexadecimal representation to binary */ token = strrchr(token, '-') + 1; int len = strlen(token) / 2; unsigned char *compressed = zfs_alloc(hdl, len); for (int i = 0; i < len; i++) { nread = sscanf(token + i * 2, "%2hhx", compressed + i); if (nread != 1) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt " "(payload is not hex-encoded)")); return (NULL); } } /* verify checksum */ zio_cksum_t cksum; fletcher_4_native(compressed, len, NULL, &cksum); if (cksum.zc_word[0] != checksum) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (incorrect checksum)")); return (NULL); } /* uncompress */ void *packed = zfs_alloc(hdl, packed_len); uLongf packed_len_long = packed_len; if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || packed_len_long != packed_len) { free(packed); free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (decompression failed)")); return (NULL); } /* unpack nvlist */ nvlist_t *nv; int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); free(packed); free(compressed); if (error != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (nvlist_unpack failed)")); return (NULL); } return (nv); } int zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, const char *resume_token) { char errbuf[1024]; char *toname; char *fromname = NULL; uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; zfs_handle_t *zhp; int error = 0; char name[ZFS_MAX_DATASET_NAME_LEN]; enum lzc_send_flags lzc_flags = 0; uint64_t size = 0; FILE *fout = (flags->verbose && flags->dryrun) ? 
stdout : stderr; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); nvlist_t *resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist */ return (zfs_error(hdl, EZFS_FAULT, errbuf)); } if (flags->verbose) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "resume token contents:\n")); nvlist_print(fout, resume_nvl); } if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt")); return (zfs_error(hdl, EZFS_FAULT, errbuf)); } fromguid = 0; (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok")) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags->compress || nvlist_exists(resume_nvl, "compressok")) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) { if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is no longer the same snapshot used in " "the initial send"), toname); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' used in the initial send no longer exists"), toname); } return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to access '%s'"), name); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } if (fromguid != 0) { if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source %#llx no longer exists"), (longlong_t)fromguid); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } fromname = name; } if (flags->progress) { error = lzc_send_space(zhp->zfs_name, fromname, lzc_flags, &size); if (error == 0) size = MAX(0, (int64_t)(size - bytes)); } if (flags->verbose) { send_print_verbose(fout, zhp->zfs_name, fromname, size, flags->parsable); } if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. 
*/ if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; pa.pa_size = size; pa.pa_astitle = flags->progressastitle; error = pthread_create(&tid, NULL, send_progress_thread, &pa); if (error != 0) { zfs_close(zhp); return (error); } } error = lzc_send_resume(zhp->zfs_name, fromname, outfd, lzc_flags, resumeobj, resumeoff); if (flags->progress) { (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); } char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); zfs_close(zhp); switch (error) { case 0: return (0); case EXDEV: case ENOENT: case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: #ifdef illumos case ENOSTR: #endif case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } zfs_close(zhp); return (error); } /* * Generate a send stream for the dataset identified by the argument zhp. * * The content of the send stream is the snapshot identified by * 'tosnap'. Incremental streams are requested in two ways: * - from the snapshot identified by "fromsnap" (if non-null) or * - from the origin of the dataset identified by zhp, which must * be a clone. In this case, "fromsnap" is null and "fromorigin" * is TRUE. * * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) * if "replicate" is set. If "doall" is set, dump all the intermediate * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" * case too. If "props" is set, send properties. */ int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { char errbuf[1024]; send_dump_data_t sdd = { 0 }; int err = 0; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; pthread_t tid = 0; int pipefd[2]; dedup_arg_t dda = { 0 }; int featureflags = 0; FILE *fout; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); if (fromsnap && fromsnap[0] == '\0') { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "zero-length incremental source")); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) { uint64_t version; version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); if (version >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } if (flags->dedup && !flags->dryrun) { featureflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); if ((err = pipe(pipefd)) != 0) { zfs_error_aux(zhp->zfs_hdl, strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); } dda.outputfd = outfd; dda.inputfd = pipefd[1]; dda.dedup_hdl = zhp->zfs_hdl; if ((err = pthread_create(&tid, NULL, cksummer, &dda)) != 0) { (void) close(pipefd[0]); (void) close(pipefd[1]); zfs_error_aux(zhp->zfs_hdl, strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } if (flags->replicate || flags->doall || flags->props) { dmu_replay_record_t drr = { 0 }; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { 0 }; if (flags->replicate || flags->props) { nvlist_t *hdrnv; VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); if (fromsnap) { VERIFY(0 == nvlist_add_string(hdrnv, "fromsnap", fromsnap)); } VERIFY(0 == 
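/*
 * A standalone sketch of the progress-thread pattern used above: spawn a
 * poller before the blocking send, then cancel and join it afterwards.
 * sketch_progress_thread() only prints a message once a second; the real
 * thread polls ZFS_IOC_SEND_PROGRESS.  All names here are illustrative.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *
sketch_progress_thread(void *arg)
{
        const char *what = arg;

        for (;;) {
                /* sleep() is a cancellation point, so pthread_cancel works. */
                sleep(1);
                fprintf(stderr, "still sending %s...\n", what);
        }
        return (NULL);
}

static int
sketch_send_with_progress(const char *name, int (*do_send)(const char *))
{
        pthread_t tid;
        int err;

        if (pthread_create(&tid, NULL, sketch_progress_thread,
            (void *)name) != 0)
                return (-1);

        err = do_send(name);            /* the long-running, blocking part */

        (void) pthread_cancel(tid);     /* stop the poller ... */
        (void) pthread_join(tid, NULL); /* ... and reap it */
        return (err);
}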
nvlist_add_string(hdrnv, "tosnap", tosnap)); if (!flags->replicate) { VERIFY(0 == nvlist_add_boolean(hdrnv, "not_recursive")); } err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, fromsnap, tosnap, flags->replicate, flags->verbose, &fss, &fsavl); if (err) goto err_out; VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); err = nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0); if (debugnvp) *debugnvp = hdrnv; else nvlist_free(hdrnv); if (err) goto stderr_out; } if (!flags->dryrun) { /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. drr_versioninfo, DMU_COMPOUNDSTREAM); DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. drr_versioninfo, featureflags); (void) snprintf(drr.drr_u.drr_begin.drr_toname, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", zhp->zfs_name, tosnap); drr.drr_payloadlen = buflen; err = dump_record(&drr, packbuf, buflen, &zc, outfd); free(packbuf); if (err != 0) goto stderr_out; /* write end record */ bzero(&drr, sizeof (drr)); drr.drr_type = DRR_END; drr.drr_u.drr_end.drr_checksum = zc; err = write(outfd, &drr, sizeof (drr)); if (err == -1) { err = errno; goto stderr_out; } err = 0; } } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; if (tid != 0) sdd.outfd = pipefd[0]; else sdd.outfd = outfd; sdd.replicate = flags->replicate; sdd.doall = flags->doall; sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; sdd.verbose = flags->verbose; sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.progressastitle = flags->progressastitle; sdd.dryrun = flags->dryrun; sdd.large_block = flags->largeblock; sdd.embed_data = flags->embed_data; sdd.compress = flags->compress; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; if (sdd.verbose && sdd.dryrun) sdd.std_out = B_TRUE; fout = sdd.std_out ? stdout : stderr; /* * Some flags require that we place user holds on the datasets that are * being sent so they don't get destroyed during the send. We can skip * this step if the pool is imported read-only since the datasets cannot * be destroyed. */ if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), ZPOOL_PROP_READONLY, NULL) && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS && (flags->doall || flags->replicate)) { ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); if (sdd.cleanup_fd < 0) { err = errno; goto stderr_out; } sdd.snapholds = fnvlist_alloc(); } else { sdd.cleanup_fd = -1; sdd.snapholds = NULL; } if (flags->progress || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output * or to gather snapshot hold's before generating any data, * then do a non-verbose real run to generate the streams. */ sdd.dryrun = B_TRUE; err = dump_filesystems(zhp, &sdd); if (err != 0) goto stderr_out; if (flags->verbose) { if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)sdd.size); } else { char buf[16]; zfs_nicenum(sdd.size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } } /* Ensure no snaps found is treated as an error. */ if (!sdd.seento) { err = ENOENT; goto err_out; } /* Skip the second run if dryrun was requested. 
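/*
 * A sketch of the framing written above for compound streams: a begin
 * record announcing the payload length, the packed payload itself, and an
 * end record carrying a checksum over everything written so far.  The
 * record layout and the trivial additive checksum are illustrative; the
 * real stream uses dmu_replay_record_t and fletcher-4.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>

struct sketch_record {
        uint32_t type;          /* 0 = begin, 1 = end */
        uint64_t payloadlen;    /* bytes following a begin record */
        uint64_t checksum;      /* filled in on the end record only */
};

static void
sketch_cksum(const void *buf, size_t len, uint64_t *ck)
{
        const unsigned char *p = buf;

        for (size_t i = 0; i < len; i++)
                *ck += p[i];
}

static int
sketch_write_stream(int fd, const void *payload, size_t len)
{
        struct sketch_record rec;
        uint64_t ck = 0;

        /* Begin record announces how much payload follows. */
        memset(&rec, 0, sizeof (rec));
        rec.type = 0;
        rec.payloadlen = len;
        sketch_cksum(&rec, sizeof (rec), &ck);
        if (write(fd, &rec, sizeof (rec)) != (ssize_t)sizeof (rec))
                return (-1);

        sketch_cksum(payload, len, &ck);
        if (write(fd, payload, len) != (ssize_t)len)
                return (-1);

        /* End record carries the running checksum for the receiver. */
        memset(&rec, 0, sizeof (rec));
        rec.type = 1;
        rec.checksum = ck;
        return (write(fd, &rec, sizeof (rec)) == (ssize_t)sizeof (rec) ?
            0 : -1);
}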
*/ if (flags->dryrun) goto err_out; if (sdd.snapholds != NULL) { err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); if (err != 0) goto stderr_out; fnvlist_free(sdd.snapholds); sdd.snapholds = NULL; } sdd.dryrun = B_FALSE; sdd.verbose = B_FALSE; } err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); nvlist_free(fss); /* Ensure no snaps found is treated as an error. */ if (err == 0 && !sdd.seento) err = ENOENT; if (tid != 0) { if (err != 0) (void) pthread_cancel(tid); (void) close(pipefd[0]); (void) pthread_join(tid, NULL); } if (sdd.cleanup_fd != -1) { VERIFY(0 == close(sdd.cleanup_fd)); sdd.cleanup_fd = -1; } if (!flags->dryrun && (flags->replicate || flags->doall || flags->props)) { /* * write final end record. NB: want to do this even if * there was some error, because it might not be totally * failed. */ dmu_replay_record_t drr = { 0 }; drr.drr_type = DRR_END; if (write(outfd, &drr, sizeof (drr)) == -1) { return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } } return (err || sdd.err); stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: fsavl_destroy(fsavl); nvlist_free(fss); fnvlist_free(sdd.snapholds); if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); if (tid != 0) { (void) pthread_cancel(tid); (void) close(pipefd[0]); (void) pthread_join(tid, NULL); } return (err); } int -zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, - enum lzc_send_flags flags) +zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags) { - int err; + int err = 0; libzfs_handle_t *hdl = zhp->zfs_hdl; - + enum lzc_send_flags lzc_flags = 0; + FILE *fout = (flags.verbose && flags.dryrun) ? stdout : stderr; char errbuf[1024]; + + if (flags.largeblock) + lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; + if (flags.embed_data) + lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + if (flags.compress) + lzc_flags |= LZC_SEND_FLAG_COMPRESS; + + if (flags.verbose) { + uint64_t size = 0; + err = lzc_send_space(zhp->zfs_name, from, lzc_flags, &size); + if (err == 0) { + send_print_verbose(fout, zhp->zfs_name, from, size, + flags.parsable); + } else { + (void) fprintf(stderr, "Cannot estimate send size: " + "%s\n", strerror(errno)); + } + } + + if (flags.dryrun) + return (err); + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); - err = lzc_send(zhp->zfs_name, from, fd, flags); + err = lzc_send(zhp->zfs_name, from, fd, lzc_flags); if (err != 0) { switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: case ESRCH: if (lzc_exists(zhp->zfs_name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), from); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "target is busy; if a filesystem, " "it must not be mounted")); return (zfs_error(hdl, EZFS_BUSY, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: #ifdef illumos case ENOSTR: #endif case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (err != 0); } /* * Routines specific to "zfs recv" */ static int recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, boolean_t byteswap, zio_cksum_t *zc) { char *cp = buf; int rv; int len = ilen; assert(ilen <= SPA_MAXBLOCKSIZE); do { rv = 
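/*
 * A sketch of the control flow introduced by the zfs_send_one() change
 * above: estimate (and optionally print) the stream size first, and return
 * before sending anything when only a dry run was requested.  estimate()
 * and transfer() are caller-supplied stand-ins, not libzfs calls.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int
sketch_send_one(const char *name, bool verbose, bool dryrun,
    int (*estimate)(const char *, uint64_t *),
    int (*transfer)(const char *))
{
        int err = 0;

        if (verbose) {
                uint64_t size = 0;

                err = estimate(name, &size);
                if (err == 0)
                        fprintf(stderr, "estimated size of %s: %llu bytes\n",
                            name, (unsigned long long)size);
                else
                        fprintf(stderr, "cannot estimate send size\n");
        }

        if (dryrun)
                return (err);           /* report only, send nothing */

        return (transfer(name));
}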
read(fd, cp, len); cp += rv; len -= rv; } while (rv > 0); if (rv < 0 || len != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to read from stream")); return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN, "cannot receive"))); } if (zc) { if (byteswap) (void) fletcher_4_incremental_byteswap(buf, ilen, zc); else (void) fletcher_4_incremental_native(buf, ilen, zc); } return (0); } static int recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, boolean_t byteswap, zio_cksum_t *zc) { char *buf; int err; buf = zfs_alloc(hdl, len); if (buf == NULL) return (ENOMEM); err = recv_read(hdl, fd, buf, len, byteswap, zc); if (err != 0) { free(buf); return (err); } err = nvlist_unpack(buf, len, nvp, 0); free(buf); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (malformed nvlist)")); return (EINVAL); } return (0); } static int recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, int baselen, char *newname, recvflags_t *flags) { static int seq; int err; prop_changelist_t *clp; zfs_handle_t *zhp; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (tryname) { (void) strcpy(newname, tryname); if (flags->verbose) { (void) printf("attempting rename %s to %s\n", name, newname); } err = lzc_rename(name, newname); if (err == 0) changelist_rename(clp, name, tryname); } else { err = ENOENT; } if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { seq++; (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%.*srecv-%u-%u", baselen, name, getpid(), seq); if (flags->verbose) { (void) printf("failed - trying rename %s to %s\n", name, newname); } err = lzc_rename(name, newname); if (err == 0) changelist_rename(clp, name, newname); if (err && flags->verbose) { (void) printf("failed (%u) - " "will try again on next pass\n", errno); } err = EAGAIN; } else if (flags->verbose) { if (err == 0) (void) printf("success\n"); else (void) printf("failed (%u)\n", errno); } (void) changelist_postfix(clp); changelist_free(clp); return (err); } static int recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, char *newname, recvflags_t *flags) { int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; boolean_t defer = B_FALSE; int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS) defer = B_TRUE; zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (flags->verbose) (void) printf("attempting destroy %s\n", name); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, name); err = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { err = lzc_destroy(name); } if (err == 0) { if (flags->verbose) (void) printf("success\n"); changelist_remove(clp, name); } (void) changelist_postfix(clp); changelist_free(clp); /* * Deferred destroy might destroy the snapshot or only mark it to be * destroyed later, and it returns success in either case. 
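/*
 * A minimal version of the short-read handling in recv_read() above,
 * without the fletcher-4 side effect: keep calling read(2) until the
 * requested length is satisfied, and treat EOF or an error mid-record as
 * failure.  sketch_read_full() is an illustrative name.
 */
#include <unistd.h>

static int
sketch_read_full(int fd, void *buf, size_t len)
{
        char *cp = buf;

        while (len > 0) {
                ssize_t rv = read(fd, cp, len);

                if (rv <= 0)
                        return (-1);    /* EOF or error mid-record */
                cp += rv;
                len -= rv;
        }
        return (0);
}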
*/ if (err != 0 || (defer && zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT))) { err = recv_rename(hdl, name, NULL, baselen, newname, flags); } return (err); } typedef struct guid_to_name_data { uint64_t guid; boolean_t bookmark_ok; char *name; char *skip; } guid_to_name_data_t; static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; const char *slash; int err; if (gtnd->skip != NULL && (slash = strrchr(zhp->zfs_name, '/')) != NULL && strcmp(slash + 1, gtnd->skip) == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); if (err != EEXIST && gtnd->bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } /* * Attempt to find the local dataset associated with this guid. In the case of * multiple matches, we attempt to find the "best" match by searching * progressively larger portions of the hierarchy. This allows one to send a * tree of datasets individually and guarantee that we will find the source * guid within that hierarchy, even if there are multiple matches elsewhere. */ static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, char *name) { char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; gtnd.guid = guid; gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; /* * Search progressively larger portions of the hierarchy, starting * with the filesystem specified by 'parent'. This will * select the "most local" version of the origin snapshot in the case * that there are multiple matching snapshots in the system. */ (void) strlcpy(pname, parent, sizeof (pname)); char *cp = strrchr(pname, '@'); if (cp == NULL) cp = strchr(pname, '\0'); for (; cp != NULL; cp = strrchr(pname, '/')) { /* Chop off the last component and open the parent */ *cp = '\0'; zfs_handle_t *zhp = make_dataset_handle(hdl, pname); if (zhp == NULL) continue; int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd); if (err != EEXIST) err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd); if (err != EEXIST && bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, &gtnd); zfs_close(zhp); if (err == EEXIST) return (0); /* * Remember the last portion of the dataset so we skip it next * time through (as we've already searched that portion of the * hierarchy). */ gtnd.skip = strrchr(pname, '/') + 1; } return (ENOENT); } /* * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if * guid1 is after guid2.
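/*
 * A sketch of guid_to_name()'s widening search above, reduced to the
 * string manipulation: strip one path component per pass and remember the
 * child just searched so it can be skipped at the next, broader level.
 * With "tank/a/b@snap" this visits tank/a/b, then tank/a (skipping b),
 * then tank (skipping a).  The function name and buffer size are
 * illustrative.
 */
#include <stdio.h>
#include <string.h>

static void
sketch_walk_ancestors(const char *dataset)
{
        char pname[256];
        const char *skip = NULL;
        char *cp;

        (void) snprintf(pname, sizeof (pname), "%s", dataset);

        /* Start at the snapshot's filesystem: drop a trailing "@snap". */
        cp = strrchr(pname, '@');
        if (cp == NULL)
                cp = strchr(pname, '\0');

        for (; cp != NULL; cp = strrchr(pname, '/')) {
                *cp = '\0';     /* chop the last component, widen the scope */
                printf("search under %s (skip child: %s)\n",
                    pname, skip != NULL ? skip : "none");
                /* ... a real search would iterate children here ... */
                char *slash = strrchr(pname, '/');
                skip = (slash != NULL) ? slash + 1 : NULL;
        }
}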
*/ static int created_before(libzfs_handle_t *hdl, avl_tree_t *avl, uint64_t guid1, uint64_t guid2) { nvlist_t *nvfs; char *fsname, *snapname; char buf[ZFS_MAX_DATASET_NAME_LEN]; int rv; zfs_handle_t *guid1hdl, *guid2hdl; uint64_t create1, create2; if (guid2 == 0) return (0); if (guid1 == 0) return (1); nvfs = fsavl_find(avl, guid1, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid2hdl == NULL) { zfs_close(guid1hdl); return (-1); } create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); if (create1 < create2) rv = -1; else if (create1 > create2) rv = +1; else rv = 0; zfs_close(guid1hdl); zfs_close(guid2hdl); return (rv); } static int recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, nvlist_t *renamed) { nvlist_t *local_nv, *deleted = NULL; avl_tree_t *local_avl; nvpair_t *fselem, *nextfselem; char *fromsnap; char newname[ZFS_MAX_DATASET_NAME_LEN]; char guidname[32]; int error; boolean_t needagain, progress, recursive; char *s1, *s2; VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (flags->dryrun) return (0); again: needagain = progress = B_FALSE; VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0)); if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, recursive, B_FALSE, &local_nv, &local_avl)) != 0) return (error); /* * Process deletes and renames */ for (fselem = nvlist_next_nvpair(local_nv, NULL); fselem; fselem = nextfselem) { nvlist_t *nvfs, *snaps; nvlist_t *stream_nvfs = NULL; nvpair_t *snapelem, *nextsnapelem; uint64_t fromguid = 0; uint64_t originguid = 0; uint64_t stream_originguid = 0; uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; char *fsname, *stream_fsname; nextfselem = nvlist_next_nvpair(local_nv, fselem); VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap", &parent_fromsnap_guid)); (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); /* * First find the stream's fs, so we can check for * a different origin (due to "zfs promote") */ for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { uint64_t thisguid; VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); if (stream_nvfs != NULL) break; } /* check for promote */ (void) nvlist_lookup_uint64(stream_nvfs, "origin", &stream_originguid); if (stream_nvfs && originguid != stream_originguid) { switch (created_before(hdl, local_avl, stream_originguid, originguid)) { case 1: { /* promote it! 
*/ zfs_cmd_t zc = { 0 }; nvlist_t *origin_nvfs; char *origin_fsname; if (flags->verbose) (void) printf("promoting %s\n", fsname); origin_nvfs = fsavl_find(local_avl, originguid, NULL); VERIFY(0 == nvlist_lookup_string(origin_nvfs, "name", &origin_fsname)); (void) strlcpy(zc.zc_value, origin_fsname, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); error = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); if (error == 0) progress = B_TRUE; break; } default: break; case -1: fsavl_destroy(local_avl); nvlist_free(local_nv); return (-1); } /* * We had/have the wrong origin, therefore our * list of snapshots is wrong. Need to handle * them on the next pass. */ needagain = B_TRUE; continue; } for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nextsnapelem) { uint64_t thisguid; char *stream_snapname; nvlist_t *found, *props; nextsnapelem = nvlist_next_nvpair(snaps, snapelem); VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); found = fsavl_find(stream_avl, thisguid, &stream_snapname); /* check for delete */ if (found == NULL) { char name[ZFS_MAX_DATASET_NAME_LEN]; if (!flags->force) continue; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); error = recv_destroy(hdl, name, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%" PRIu64, thisguid); nvlist_add_boolean(deleted, guidname); continue; } stream_nvfs = found; if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops", &props) && 0 == nvlist_lookup_nvlist(props, stream_snapname, &props)) { zfs_cmd_t zc = { 0 }; zc.zc_cookie = B_TRUE; /* received */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", fsname, nvpair_name(snapelem)); if (zcmd_write_src_nvlist(hdl, &zc, props) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } } /* check for different snapname */ if (strcmp(nvpair_name(snapelem), stream_snapname) != 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; char tryname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); (void) snprintf(tryname, sizeof (name), "%s@%s", fsname, stream_snapname); error = recv_rename(hdl, name, tryname, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; } if (strcmp(stream_snapname, fromsnap) == 0) fromguid = thisguid; } /* check for delete */ if (stream_nvfs == NULL) { if (!flags->force) continue; error = recv_destroy(hdl, fsname, strlen(tofs)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%" PRIu64, parent_fromsnap_guid); nvlist_add_boolean(deleted, guidname); continue; } if (fromguid == 0) { if (flags->verbose) { (void) printf("local fs %s does not have " "fromsnap (%s in stream); must have " "been deleted locally; ignoring\n", fsname, fromsnap); } continue; } VERIFY(0 == nvlist_lookup_string(stream_nvfs, "name", &stream_fsname)); VERIFY(0 == nvlist_lookup_uint64(stream_nvfs, "parentfromsnap", &stream_parent_fromsnap_guid)); s1 = strrchr(fsname, '/'); s2 = strrchr(stream_fsname, '/'); /* * Check if we're going to rename based on parent guid change * and the current parent guid was also deleted. If it was then * rename will fail and is likely unneeded, so avoid this and * force an early retry to determine the new * parent_fromsnap_guid. 
*/ if (stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) { sprintf(guidname, "%" PRIu64, parent_fromsnap_guid); if (nvlist_exists(deleted, guidname)) { progress = B_TRUE; needagain = B_TRUE; goto doagain; } } /* * Check for rename. If the exact receive path is specified, it * does not count as a rename, but we still need to check the * datasets beneath it. */ if ((stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || ((flags->isprefix || strcmp(tofs, fsname) != 0) && (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAX_DATASET_NAME_LEN]; parent = fsavl_find(local_avl, stream_parent_fromsnap_guid, NULL); /* * NB: parent might not be found if we used the * tosnap for stream_parent_fromsnap_guid, * because the parent is a newly-created fs; * we'll be able to rename it after we recv the * new fs. */ if (parent != NULL) { char *pname; VERIFY(0 == nvlist_lookup_string(parent, "name", &pname)); (void) snprintf(tryname, sizeof (tryname), "%s%s", pname, strrchr(stream_fsname, '/')); } else { tryname[0] = '\0'; if (flags->verbose) { (void) printf("local fs %s new parent " "not found\n", fsname); } } newname[0] = '\0'; error = recv_rename(hdl, fsname, tryname, strlen(tofs)+1, newname, flags); if (renamed != NULL && newname[0] != '\0') { VERIFY(0 == nvlist_add_boolean(renamed, newname)); } if (error) needagain = B_TRUE; else progress = B_TRUE; } } doagain: fsavl_destroy(local_avl); nvlist_free(local_nv); nvlist_free(deleted); if (needagain && progress) { /* do another pass to fix up temporary names */ if (flags->verbose) (void) printf("another pass:\n"); goto again; } return (needagain); } static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, char **top_zfs, int cleanup_fd, uint64_t *action_handlep) { nvlist_t *stream_nv = NULL; avl_tree_t *stream_avl = NULL; char *fromsnap = NULL; char *sendsnap = NULL; char *cp; char tofs[ZFS_MAX_DATASET_NAME_LEN]; char sendfs[ZFS_MAX_DATASET_NAME_LEN]; char errbuf[1024]; dmu_replay_record_t drre; int error; boolean_t anyerr = B_FALSE; boolean_t softerr = B_FALSE; boolean_t recursive; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == DMU_COMPOUNDSTREAM); /* * Read in the nvlist from the stream. */ if (drr->drr_payloadlen != 0) { error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, &stream_nv, flags->byteswap, zc); if (error) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } } recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (recursive && strchr(destname, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot stream")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } /* * Read in the end record and verify checksum. 
*/ if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), flags->byteswap, NULL))) goto out; if (flags->byteswap) { drre.drr_type = BSWAP_32(drre.drr_type); drre.drr_u.drr_end.drr_checksum.zc_word[0] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); drre.drr_u.drr_end.drr_checksum.zc_word[1] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); drre.drr_u.drr_end.drr_checksum.zc_word[2] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); drre.drr_u.drr_end.drr_checksum.zc_word[3] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); } if (drre.drr_type != DRR_END) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incorrect header checksum")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); if (drr->drr_payloadlen != 0) { nvlist_t *stream_fss; VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", &stream_fss)); if ((stream_avl = fsavl_create(stream_fss)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "couldn't allocate avl tree")); error = zfs_error(hdl, EZFS_NOMEM, errbuf); goto out; } if (fromsnap != NULL && recursive) { nvlist_t *renamed = NULL; nvpair_t *pair = NULL; (void) strlcpy(tofs, destname, sizeof (tofs)); if (flags->isprefix) { struct drr_begin *drrb = &drr->drr_u.drr_begin; int i; if (flags->istail) { cp = strrchr(drrb->drr_toname, '/'); if (cp == NULL) { (void) strlcat(tofs, "/", sizeof (tofs)); i = 0; } else { i = (cp - drrb->drr_toname); } } else { i = strcspn(drrb->drr_toname, "/@"); } /* zfs_receive_one() will create_parents() */ (void) strlcat(tofs, &drrb->drr_toname[i], sizeof (tofs)); *strchr(tofs, '@') = '\0'; } if (!flags->dryrun && !flags->nomount) { VERIFY(0 == nvlist_alloc(&renamed, NV_UNIQUE_NAME, 0)); } softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, renamed); /* Unmount renamed filesystems before receiving. */ while ((pair = nvlist_next_nvpair(renamed, pair)) != NULL) { zfs_handle_t *zhp; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, nvpair_name(pair), ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, 0); zfs_close(zhp); if (clp != NULL) { softerr |= changelist_prefix(clp); changelist_free(clp); } } } nvlist_free(renamed); } } /* * Get the fs specified by the first path in the stream (the top level * specified by 'zfs send') and pass it to each invocation of * zfs_receive_one(). */ (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, sizeof (sendfs)); if ((cp = strchr(sendfs, '@')) != NULL) { *cp = '\0'; /* * Find the "sendsnap", the final snapshot in a replication * stream. zfs_receive_one() handles certain errors * differently, depending on if the contained stream is the * last one or not. */ sendsnap = (cp + 1); } /* Finally, receive each contained stream */ do { /* * we should figure out if it has a recoverable * error, in which case do a recv_skip() and drive on. * Note, if we fail due to already having this guid, * zfs_receive_one() will take care of it (ie, * recv_skip() and return 0). */ error = zfs_receive_impl(hdl, destname, NULL, flags, fd, sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, action_handlep, sendsnap); if (error == ENODATA) { error = 0; break; } anyerr |= error; } while (error == 0); if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) { /* * Now that we have the fs's they sent us, try the * renames again. 
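/*
 * A sketch of the end-record verification above: if the stream came from
 * a sender of the other endianness, byteswap the four 64-bit checksum
 * words before comparing them with the checksum accumulated while reading.
 * The sketch_* names stand in for zio_cksum_t, BSWAP_64 and
 * ZIO_CHECKSUM_EQUAL.
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct sketch_cksum {
        uint64_t word[4];
};

static uint32_t
sketch_bswap32(uint32_t x)
{
        return (((x & 0x000000ffU) << 24) | ((x & 0x0000ff00U) << 8) |
            ((x & 0x00ff0000U) >> 8) | ((x & 0xff000000U) >> 24));
}

static uint64_t
sketch_bswap64(uint64_t x)
{
        return (((uint64_t)sketch_bswap32((uint32_t)x) << 32) |
            sketch_bswap32((uint32_t)(x >> 32)));
}

static bool
sketch_cksum_ok(struct sketch_cksum *from_stream,
    const struct sketch_cksum *computed, bool byteswap)
{
        if (byteswap) {
                for (int i = 0; i < 4; i++)
                        from_stream->word[i] =
                            sketch_bswap64(from_stream->word[i]);
        }
        return (memcmp(from_stream, computed, sizeof (*computed)) == 0);
}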
*/ softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, NULL); } out: fsavl_destroy(stream_avl); nvlist_free(stream_nv); if (softerr) error = -2; if (anyerr) error = -1; return (error); } static void trunc_prop_errs(int truncated) { ASSERT(truncated != 0); if (truncated == 1) (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "1 more property could not be set\n")); else (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "%d more properties could not be set\n"), truncated); } static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive:")); /* XXX would be great to use lseek if possible... */ drr = buf; while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), byteswap, NULL) == 0) { if (byteswap) drr->drr_type = BSWAP_32(drr->drr_type); switch (drr->drr_type) { case DRR_BEGIN: if (drr->drr_payloadlen != 0) { (void) recv_read(hdl, fd, buf, drr->drr_payloadlen, B_FALSE, NULL); } break; case DRR_END: free(buf); return (0); case DRR_OBJECT: if (byteswap) { drr->drr_u.drr_object.drr_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_bonuslen); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_object.drr_bonuslen, 8), B_FALSE, NULL); break; case DRR_WRITE: if (byteswap) { drr->drr_u.drr_write.drr_logical_size = BSWAP_64( drr->drr_u.drr_write.drr_logical_size); drr->drr_u.drr_write.drr_compressed_size = BSWAP_64( drr->drr_u.drr_write.drr_compressed_size); } uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_SPILL: if (byteswap) { drr->drr_u.drr_spill.drr_length = BSWAP_64(drr->drr_u.drr_spill.drr_length); } (void) recv_read(hdl, fd, buf, drr->drr_u.drr_spill.drr_length, B_FALSE, NULL); break; case DRR_WRITE_EMBEDDED: if (byteswap) { drr->drr_u.drr_write_embedded.drr_psize = BSWAP_32(drr->drr_u.drr_write_embedded. drr_psize); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, 8), B_FALSE, NULL); break; case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } free(buf); return (-1); } static void recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, boolean_t resumable) { char target_fs[ZFS_MAX_DATASET_NAME_LEN]; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "checksum mismatch or incomplete stream")); if (!resumable) return; (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); *strchr(target_fs, '@') = '\0'; zfs_handle_t *zhp = zfs_open(hdl, target_fs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return; char token_buf[ZFS_MAXPROPLEN]; int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (error == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "checksum mismatch or incomplete stream.\n" "Partially received snapshot is saved.\n" "A resuming stream can be generated on the sending " "system by running:\n" " zfs send -t %s"), token_buf); } zfs_close(zhp); } /* * Restores a backup of tosnap from the file descriptor specified by infd. 
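/*
 * A reduced version of the skipping loop in recv_skip() above: read each
 * fixed-size record header, discard any variable-length payload in bounded
 * chunks, and stop at the end record.  The header layout and type values
 * are illustrative, not the DMU replay record format.
 */
#include <stdint.h>
#include <unistd.h>

struct sketch_hdr {
        uint32_t type;          /* 0 = data, 1 = end */
        uint32_t payloadlen;
};

static int
sketch_skip_stream(int fd)
{
        struct sketch_hdr hdr;
        char scratch[4096];

        for (;;) {
                if (read(fd, &hdr, sizeof (hdr)) != (ssize_t)sizeof (hdr))
                        return (-1);    /* truncated stream */
                if (hdr.type == 1)
                        return (0);     /* end record: done */

                /* Discard the payload in bounded chunks. */
                uint32_t left = hdr.payloadlen;
                while (left > 0) {
                        size_t want = left < sizeof (scratch) ?
                            left : sizeof (scratch);
                        ssize_t rv = read(fd, scratch, want);

                        if (rv <= 0)
                                return (-1);
                        left -= rv;
                }
        }
}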
*/ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, uint64_t *action_handlep, const char *finalsnap) { zfs_cmd_t zc = { 0 }; time_t begin_time; int ioctl_err, ioctl_errno, err; char *cp; struct drr_begin *drrb = &drr->drr_u.drr_begin; char errbuf[1024]; char prop_errbuf[1024]; const char *chopprefix; boolean_t newfs = B_FALSE; boolean_t stream_wantsnewfs; uint64_t parent_snapguid = 0; prop_changelist_t *clp = NULL; nvlist_t *snapprops_nvlist = NULL; zprop_errflags_t prop_errflags; boolean_t recursive; char *snapname = NULL; begin_time = time(NULL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (stream_avl != NULL) { nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, &snapname); nvlist_t *props; int ret; (void) nvlist_lookup_uint64(fs, "parentfromsnap", &parent_snapguid); err = nvlist_lookup_nvlist(fs, "props", &props); if (err) VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); if (flags->canmountoff) { VERIFY(0 == nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0)); } ret = zcmd_write_src_nvlist(hdl, &zc, props); if (err) nvlist_free(props); if (0 == nvlist_lookup_nvlist(fs, "snapprops", &props)) { VERIFY(0 == nvlist_lookup_nvlist(props, snapname, &snapprops_nvlist)); } if (ret != 0) return (-1); } cp = NULL; /* * Determine how much of the snapshot name stored in the stream * we are going to tack on to the name they specified on the * command line, and how much we are going to chop off. * * If they specified a snapshot, chop the entire name stored in * the stream. */ if (flags->istail) { /* * A filesystem was specified with -e. We want to tack on only * the tail of the sent snapshot path. */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -e")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } chopprefix = strrchr(sendfs, '/'); if (chopprefix == NULL) { /* * The tail is the poolname, so we need to * prepend a path separator. */ int len = strlen(drrb->drr_toname); cp = malloc(len + 2); cp[0] = '/'; (void) strcpy(&cp[1], drrb->drr_toname); chopprefix = cp; } else { chopprefix = drrb->drr_toname + (chopprefix - sendfs); } } else if (flags->isprefix) { /* * A filesystem was specified with -d. We want to tack on * everything but the first element of the sent snapshot path * (all but the pool name). */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -d")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } chopprefix = strchr(drrb->drr_toname, '/'); if (chopprefix == NULL) chopprefix = strchr(drrb->drr_toname, '@'); } else if (strchr(tosnap, '@') == NULL) { /* * If a filesystem was specified without -d or -e, we want to * tack on everything after the fs specified by 'zfs send'. */ chopprefix = drrb->drr_toname + strlen(sendfs); } else { /* A snapshot was specified as an exact path (no -d or -e). 
*/ if (recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot " "stream")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); } ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); ASSERT(chopprefix > drrb->drr_toname); ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname)); ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || chopprefix[0] == '\0'); /* * Determine name of destination snapshot, store in zc_value. */ (void) strcpy(zc.zc_value, tosnap); (void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value)); #ifdef __FreeBSD__ if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) zfs_ioctl_version = get_zfs_ioctl_version(); /* * For forward compatibility hide tosnap in zc_value */ if (zfs_ioctl_version < ZFS_IOCVER_LZC) (void) strcpy(zc.zc_value + strlen(zc.zc_value) + 1, tosnap); #endif free(cp); if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) { zcmd_free_nvlists(&zc); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* * Determine the name of the origin snapshot, store in zc_string. */ if (originsnap) { (void) strncpy(zc.zc_string, originsnap, sizeof (zc.zc_string)); if (flags->verbose) (void) printf("using provided clone origin %s\n", zc.zc_string); } else if (drrb->drr_flags & DRR_FLAG_CLONE) { if (guid_to_name(hdl, zc.zc_value, drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), zc.zc_value); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } if (flags->verbose) (void) printf("found clone origin %s\n", zc.zc_string); } boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING; stream_wantsnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; if (stream_wantsnewfs) { /* * if the parent fs does not exist, look for it based on * the parent snap GUID */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive new filesystem stream")); (void) strcpy(zc.zc_name, zc.zc_value); cp = strrchr(zc.zc_name, '/'); if (cp) *cp = '\0'; if (cp && !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { char suffix[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(suffix, strrchr(zc.zc_value, '/')); if (guid_to_name(hdl, zc.zc_name, parent_snapguid, B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, suffix); } } } else { /* * If the fs does not exist, look for it based on the * fromsnap GUID. */ if (resuming) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive resume stream")); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive incremental stream")); } (void) strcpy(zc.zc_name, zc.zc_value); *strchr(zc.zc_name, '@') = '\0'; /* * If the exact receive path was specified and this is the * topmost path in the stream, then if the fs does not exist we * should look no further. 
*/ if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + strlen(sendfs)) != '\0' && *chopprefix != '@')) && !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { char snap[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(snap, strchr(zc.zc_value, '@')); if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid, B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, snap); } } } (void) strcpy(zc.zc_name, zc.zc_value); *strchr(zc.zc_name, '@') = '\0'; if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { zfs_handle_t *zhp; /* * Destination fs exists. It must be one of these cases: * - an incremental send stream * - the stream specifies a new fs (full stream or clone) * and they want us to blow away the existing fs (and * have therefore specified -F and removed any snapshots) * - we are resuming a failed receive. */ if (stream_wantsnewfs) { if (!flags->force) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' exists\n" "must specify -F to overwrite it"), zc.zc_name); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has snapshots (eg. %s)\n" "must destroy them to overwrite it"), zc.zc_name); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } } if ((zhp = zfs_open(hdl, zc.zc_name, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { zcmd_free_nvlists(&zc); return (-1); } if (stream_wantsnewfs && zhp->zfs_dmustats.dds_origin[0]) { zcmd_free_nvlists(&zc); zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' is a clone\n" "must destroy it to overwrite it"), zc.zc_name); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && stream_wantsnewfs) { /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0); if (clp == NULL) { zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } if (changelist_prefix(clp) != 0) { changelist_free(clp); zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } } /* * If we are resuming a newfs, set newfs here so that we will * mount it if the recv succeeds this time. We can tell * that it was a newfs on the first recv because the fs * itself will be inconsistent (if the fs existed when we * did the first recv, we would have received it into * .../%recv). */ if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) newfs = B_TRUE; zfs_close(zhp); } else { /* * Destination filesystem does not exist. Therefore we better * be creating a new filesystem (either from a full backup, or * a clone). It would therefore be invalid if the user * specified only the pool name (i.e. if the destination name * contained no slash character). */ if (!stream_wantsnewfs || (cp = strrchr(zc.zc_name, '/')) == NULL) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' does not exist"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* * Trim off the final dataset component so we perform the * recvbackup ioctl to the filesystems's parent. 
*/ *cp = '\0'; if (flags->isprefix && !flags->istail && !flags->dryrun && create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) { zcmd_free_nvlists(&zc); return (zfs_error(hdl, EZFS_BADRESTORE, errbuf)); } newfs = B_TRUE; } zc.zc_begin_record = *drr_noswap; zc.zc_cookie = infd; zc.zc_guid = flags->force; zc.zc_resumable = flags->resumable; if (flags->verbose) { (void) printf("%s %s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", drrb->drr_fromguid ? "incremental" : "full", drrb->drr_toname, zc.zc_value); (void) fflush(stdout); } if (flags->dryrun) { zcmd_free_nvlists(&zc); return (recv_skip(hdl, infd, flags->byteswap)); } zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf; zc.zc_nvlist_dst_size = sizeof (prop_errbuf); zc.zc_cleanup_fd = cleanup_fd; zc.zc_action_handle = *action_handlep; err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc); ioctl_errno = errno; prop_errflags = (zprop_errflags_t)zc.zc_obj; if (err == 0) { nvlist_t *prop_errors; VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, zc.zc_nvlist_dst_size, &prop_errors, 0)); nvpair_t *prop_err = NULL; while ((prop_err = nvlist_next_nvpair(prop_errors, prop_err)) != NULL) { char tbuf[1024]; zfs_prop_t prop; int intval; prop = zfs_name_to_prop(nvpair_name(prop_err)); (void) nvpair_value_int32(prop_err, &intval); if (strcmp(nvpair_name(prop_err), ZPROP_N_MORE_ERRORS) == 0) { trunc_prop_errs(intval); break; } else if (snapname == NULL || finalsnap == NULL || strcmp(finalsnap, snapname) == 0 || strcmp(nvpair_name(prop_err), zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { /* * Skip the special case of, for example, * "refquota", errors on intermediate * snapshots leading up to a final one. * That's why we have all of the checks above. * * See zfs_ioctl.c's extract_delay_props() for * a list of props which can fail on * intermediate snapshots, but shouldn't * affect the overall receive. */ (void) snprintf(tbuf, sizeof (tbuf), dgettext(TEXT_DOMAIN, "cannot receive %s property on %s"), nvpair_name(prop_err), zc.zc_name); zfs_setprop_error(hdl, prop, intval, tbuf); } } nvlist_free(prop_errors); } zc.zc_nvlist_dst = 0; zc.zc_nvlist_dst_size = 0; zcmd_free_nvlists(&zc); if (err == 0 && snapprops_nvlist) { zfs_cmd_t zc2 = { 0 }; (void) strcpy(zc2.zc_name, zc.zc_value); zc2.zc_cookie = B_TRUE; /* received */ if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2); zcmd_free_nvlists(&zc2); } } if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { /* * It may be that this snapshot already exists, * in which case we want to consume & ignore it * rather than failing. */ avl_tree_t *local_avl; nvlist_t *local_nv, *fs; cp = strchr(zc.zc_value, '@'); /* * XXX Do this faster by just iterating over snaps in * this fs. Also if zc_value does not exist, we will * get a strange "does not exist" error message. 
*/ *cp = '\0'; if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE, B_FALSE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); fsavl_destroy(local_avl); nvlist_free(local_nv); if (fs != NULL) { if (flags->verbose) { (void) printf("snap %s already exists; " "ignoring\n", zc.zc_value); } err = ioctl_err = recv_skip(hdl, infd, flags->byteswap); } } *cp = '@'; } if (ioctl_err != 0) { switch (ioctl_errno) { case ENODEV: cp = strchr(zc.zc_value, '@'); *cp = '\0'; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "most recent snapshot of %s does not\n" "match incremental source"), zc.zc_value); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); *cp = '@'; break; case ETXTBSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s has been modified\n" "since most recent snapshot"), zc.zc_name); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); break; case EEXIST: cp = strchr(zc.zc_value, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination already exists")); (void) zfs_error_fmt(hdl, EZFS_EXISTS, dgettext(TEXT_DOMAIN, "cannot restore to %s"), zc.zc_value); *cp = '@'; break; case EINVAL: (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to receive this stream.")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EDQUOT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s space quota exceeded"), zc.zc_name); (void) zfs_error(hdl, EZFS_NOSPC, errbuf); break; default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); } } /* * Mount the target filesystem (if created). Also mount any * children of the target filesystem if we did a replication * receive (indicated by stream_avl being non-NULL). */ cp = strchr(zc.zc_value, '@'); if (cp && (ioctl_err == 0 || !newfs)) { zfs_handle_t *h; *cp = '\0'; h = zfs_open(hdl, zc.zc_value, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (h != NULL) { if (h->zfs_type == ZFS_TYPE_VOLUME) { *cp = '@'; } else if (newfs || stream_avl) { /* * Track the first/top of hierarchy fs, * for mounting and sharing later. 
*/ if (top_zfs && *top_zfs == NULL) *top_zfs = zfs_strdup(hdl, zc.zc_value); } zfs_close(h); } *cp = '@'; } if (clp) { if (!flags->nomount) err |= changelist_postfix(clp); changelist_free(clp); } if (prop_errflags & ZPROP_ERR_NOCLEAR) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to clear unreceived properties on %s"), zc.zc_name); (void) fprintf(stderr, "\n"); } if (prop_errflags & ZPROP_ERR_NORESTORE) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to restore original properties on %s"), zc.zc_name); (void) fprintf(stderr, "\n"); } if (err || ioctl_err) return (-1); *action_handlep = zc.zc_action_handle; if (flags->verbose) { char buf1[64]; char buf2[64]; uint64_t bytes = zc.zc_cookie; time_t delta = time(NULL) - begin_time; if (delta == 0) delta = 1; zfs_nicenum(bytes, buf1, sizeof (buf1)); zfs_nicenum(bytes/delta, buf2, sizeof (buf1)); (void) printf("received %sB stream in %lu seconds (%sB/sec)\n", buf1, delta, buf2); } return (0); } static int zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, uint64_t *action_handlep, const char *finalsnap) { int err; dmu_replay_record_t drr, drr_noswap; struct drr_begin *drrb = &drr.drr_u.drr_begin; char errbuf[1024]; zio_cksum_t zcksum = { 0 }; uint64_t featureflags; int hdrtype; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); if (flags->isprefix && !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " "(%s) does not exist"), tosnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } if (originsnap && !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " "(%s) does not exist"), originsnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* read in the BEGIN record */ if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, &zcksum))) return (err); if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) { /* It's the double end record at the end of a package */ return (ENODATA); } /* the kernel needs the non-byteswapped begin record */ drr_noswap = drr; flags->byteswap = B_FALSE; if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { /* * We computed the checksum in the wrong byteorder in * recv_read() above; do it again correctly. 
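/*
 * A sketch of the transfer report printed above, keeping the delta == 0
 * guard so a sub-second receive does not divide by zero; the zfs_nicenum
 * formatting is replaced by plain byte counts and the name is illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static void
sketch_report_rate(uint64_t bytes, time_t begin)
{
        time_t delta = time(NULL) - begin;

        if (delta == 0)
                delta = 1;              /* avoid division by zero */
        printf("received %llu bytes in %ld seconds (%llu bytes/sec)\n",
            (unsigned long long)bytes, (long)delta,
            (unsigned long long)(bytes / delta));
}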
*/ bzero(&zcksum, sizeof (zio_cksum_t)); (void) fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); flags->byteswap = B_TRUE; drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_flags = BSWAP_32(drrb->drr_flags); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad magic number)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); if (!DMU_STREAM_SUPPORTED(featureflags) || (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream has unsupported feature, feature flags = %lx"), featureflags); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } if (strchr(drrb->drr_toname, '@') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad snapshot name)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN]; if (sendfs == NULL) { /* * We were not called from zfs_receive_package(). Get * the fs specified by 'zfs send'. */ char *cp; (void) strlcpy(nonpackage_sendfs, drr.drr_u.drr_begin.drr_toname, sizeof (nonpackage_sendfs)); if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) *cp = '\0'; sendfs = nonpackage_sendfs; VERIFY(finalsnap == NULL); } return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, action_handlep, finalsnap)); } else { assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM); return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, &zcksum, top_zfs, cleanup_fd, action_handlep)); } } /* * Restores a backup of tosnap from the file descriptor specified by infd. * Return 0 on total success, -2 if some things couldn't be * destroyed/renamed/promoted, -1 if some things couldn't be received. * (-1 will override -2, if -1 and the resumable flag was specified the * transfer can be resumed if the sending side supports it). 
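/*
 * A sketch of the byte-order handling above: if the magic number only
 * matches after swapping, the begin record came from a foreign-endian
 * sender and every multi-byte field must be swapped before use.
 * SKETCH_MAGIC and the header layout are illustrative, not
 * DMU_BACKUP_MAGIC or struct drr_begin.
 */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_MAGIC    0x53454e44ULL   /* illustrative magic value */

struct sketch_begin {
        uint64_t magic;
        uint64_t toguid;
        uint32_t flags;
};

static uint32_t
sk_bswap32(uint32_t x)
{
        return (((x & 0x000000ffU) << 24) | ((x & 0x0000ff00U) << 8) |
            ((x & 0x00ff0000U) >> 8) | ((x & 0xff000000U) >> 24));
}

static uint64_t
sk_bswap64(uint64_t x)
{
        return (((uint64_t)sk_bswap32((uint32_t)x) << 32) |
            sk_bswap32((uint32_t)(x >> 32)));
}

static bool
sketch_normalize_begin(struct sketch_begin *b)
{
        if (b->magic == SKETCH_MAGIC)
                return (true);                  /* already native order */
        if (sk_bswap64(b->magic) != SKETCH_MAGIC)
                return (false);                 /* not a begin record */

        /* Foreign byte order: swap every field before using it. */
        b->magic = sk_bswap64(b->magic);
        b->toguid = sk_bswap64(b->toguid);
        b->flags = sk_bswap32(b->flags);
        return (true);
}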
*/ int zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, recvflags_t *flags, int infd, avl_tree_t *stream_avl) { char *top_zfs = NULL; int err; int cleanup_fd; uint64_t action_handle = 0; char *originsnap = NULL; if (props) { err = nvlist_lookup_string(props, "origin", &originsnap); if (err && err != ENOENT) return (err); } cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); VERIFY(cleanup_fd >= 0); err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, stream_avl, &top_zfs, cleanup_fd, &action_handle, NULL); VERIFY(0 == close(cleanup_fd)); if (err == 0 && !flags->nomount && top_zfs) { zfs_handle_t *zhp; prop_changelist_t *clp; zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, CL_GATHER_MOUNT_ALWAYS, 0); zfs_close(zhp); if (clp != NULL) { /* mount and share received datasets */ err = changelist_postfix(clp); changelist_free(clp); } } if (zhp == NULL || clp == NULL || err) err = -1; } if (top_zfs) free(top_zfs); return (err); } Index: projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs (revision 352536) +++ projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs (revision 352537) Property changes on: projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl/contrib/opensolaris/lib/libzfs:r352105-352536 Index: projects/clang900-import/cddl/contrib/opensolaris =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris (revision 352536) +++ projects/clang900-import/cddl/contrib/opensolaris (revision 352537) Property changes on: projects/clang900-import/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl/contrib/opensolaris:r352308-352536 Index: projects/clang900-import/cddl =================================================================== --- projects/clang900-import/cddl (revision 352536) +++ projects/clang900-import/cddl (revision 352537) Property changes on: projects/clang900-import/cddl ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl:r352308-352536 Index: projects/clang900-import/contrib/jemalloc/src/jemalloc.c =================================================================== --- projects/clang900-import/contrib/jemalloc/src/jemalloc.c (revision 352536) +++ projects/clang900-import/contrib/jemalloc/src/jemalloc.c (revision 352537) @@ -1,3434 +1,3419 @@ #define JEMALLOC_C_ #include "jemalloc/internal/jemalloc_preamble.h" #include "jemalloc/internal/jemalloc_internal_includes.h" #include "jemalloc/internal/assert.h" #include "jemalloc/internal/atomic.h" #include "jemalloc/internal/ctl.h" #include "jemalloc/internal/extent_dss.h" #include "jemalloc/internal/extent_mmap.h" #include "jemalloc/internal/jemalloc_internal_types.h" #include "jemalloc/internal/log.h" #include "jemalloc/internal/malloc_io.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/rtree.h" #include "jemalloc/internal/size_classes.h" #include "jemalloc/internal/spin.h" #include "jemalloc/internal/sz.h" #include "jemalloc/internal/ticker.h" #include "jemalloc/internal/util.h" 
/******************************************************************************/ /* Data. */ /* Work around : */ const char *__malloc_options_1_0 = NULL; __sym_compat(_malloc_options, __malloc_options_1_0, FBSD_1.0); /* Runtime configuration options. */ const char *je_malloc_conf #ifndef _WIN32 JEMALLOC_ATTR(weak) #endif ; bool opt_abort = #ifdef JEMALLOC_DEBUG true #else false #endif ; bool opt_abort_conf = #ifdef JEMALLOC_DEBUG true #else false #endif ; const char *opt_junk = #if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) "true" #else "false" #endif ; bool opt_junk_alloc = #if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) true #else false #endif ; bool opt_junk_free = #if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) true #else false #endif ; bool opt_utrace = false; bool opt_xmalloc = false; bool opt_zero = false; unsigned opt_narenas = 0; unsigned ncpus; /* Protects arenas initialization. */ malloc_mutex_t arenas_lock; /* * Arenas that are used to service external requests. Not all elements of the * arenas array are necessarily used; arenas are created lazily as needed. * * arenas[0..narenas_auto) are used for automatic multiplexing of threads and * arenas. arenas[narenas_auto..narenas_total) are only used if the application * takes some action to create them and allocate from them. * * Points to an arena_t. */ JEMALLOC_ALIGNED(CACHELINE) atomic_p_t arenas[MALLOCX_ARENA_LIMIT]; static atomic_u_t narenas_total; /* Use narenas_total_*(). */ static arena_t *a0; /* arenas[0]; read-only after initialization. */ unsigned narenas_auto; /* Read-only after initialization. */ typedef enum { malloc_init_uninitialized = 3, malloc_init_a0_initialized = 2, malloc_init_recursible = 1, malloc_init_initialized = 0 /* Common case --> jnz. */ } malloc_init_t; static malloc_init_t malloc_init_state = malloc_init_uninitialized; /* False should be the common case. Set to true to trigger initialization. */ bool malloc_slow = true; /* When malloc_slow is true, set the corresponding bits for sanity check. */ enum { flag_opt_junk_alloc = (1U), flag_opt_junk_free = (1U << 1), flag_opt_zero = (1U << 2), flag_opt_utrace = (1U << 3), flag_opt_xmalloc = (1U << 4) }; static uint8_t malloc_slow_flags; #ifdef JEMALLOC_THREADED_INIT /* Used to let the initializing thread recursively allocate. */ # define NO_INITIALIZER ((unsigned long)0) # define INITIALIZER pthread_self() # define IS_INITIALIZER (malloc_initializer == pthread_self()) static pthread_t malloc_initializer = NO_INITIALIZER; #else # define NO_INITIALIZER false # define INITIALIZER true # define IS_INITIALIZER malloc_initializer static bool malloc_initializer = NO_INITIALIZER; #endif /* Used to avoid initialization races. */ #ifdef _WIN32 #if _WIN32_WINNT >= 0x0600 static malloc_mutex_t init_lock = SRWLOCK_INIT; #else static malloc_mutex_t init_lock; static bool init_lock_initialized = false; JEMALLOC_ATTR(constructor) static void WINAPI _init_init_lock(void) { /* * If another constructor in the same binary is using mallctl to e.g. * set up extent hooks, it may end up running before this one, and * malloc_init_hard will crash trying to lock the uninitialized lock. So * we force an initialization of the lock in malloc_init_hard as well. * We don't try to care about atomicity of the accessed to the * init_lock_initialized boolean, since it really only matters early in * the process creation, before any separate thread normally starts * doing anything. 
*/ if (!init_lock_initialized) { malloc_mutex_init(&init_lock, "init", WITNESS_RANK_INIT, malloc_mutex_rank_exclusive); } init_lock_initialized = true; } #ifdef _MSC_VER # pragma section(".CRT$XCU", read) JEMALLOC_SECTION(".CRT$XCU") JEMALLOC_ATTR(used) static const void (WINAPI *init_init_lock)(void) = _init_init_lock; #endif #endif #else static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; #endif typedef struct { void *p; /* Input pointer (as in realloc(p, s)). */ size_t s; /* Request size. */ void *r; /* Result pointer. */ } malloc_utrace_t; #ifdef JEMALLOC_UTRACE # define UTRACE(a, b, c) do { \ if (unlikely(opt_utrace)) { \ int utrace_serrno = errno; \ malloc_utrace_t ut; \ ut.p = (a); \ ut.s = (b); \ ut.r = (c); \ utrace(&ut, sizeof(ut)); \ errno = utrace_serrno; \ } \ } while (0) #else # define UTRACE(a, b, c) #endif /* Whether encountered any invalid config options. */ static bool had_conf_error = false; /******************************************************************************/ /* * Function prototypes for static functions that are referenced prior to * definition. */ static bool malloc_init_hard_a0(void); static bool malloc_init_hard(void); /******************************************************************************/ /* * Begin miscellaneous support functions. */ bool malloc_initialized(void) { return (malloc_init_state == malloc_init_initialized); } JEMALLOC_ALWAYS_INLINE bool malloc_init_a0(void) { if (unlikely(malloc_init_state == malloc_init_uninitialized)) { return malloc_init_hard_a0(); } return false; } JEMALLOC_ALWAYS_INLINE bool malloc_init(void) { if (unlikely(!malloc_initialized()) && malloc_init_hard()) { return true; } return false; } /* * The a0*() functions are used instead of i{d,}alloc() in situations that * cannot tolerate TLS variable access. */ static void * a0ialloc(size_t size, bool zero, bool is_internal) { if (unlikely(malloc_init_a0())) { return NULL; } return iallocztm(TSDN_NULL, size, sz_size2index(size), zero, NULL, is_internal, arena_get(TSDN_NULL, 0, true), true); } static void a0idalloc(void *ptr, bool is_internal) { idalloctm(TSDN_NULL, ptr, NULL, NULL, is_internal, true); } void * a0malloc(size_t size) { return a0ialloc(size, false, true); } void a0dalloc(void *ptr) { a0idalloc(ptr, true); } /* * FreeBSD's libc uses the bootstrap_*() functions in bootstrap-senstive * situations that cannot tolerate TLS variable access (TLS allocation and very * early internal data structure initialization). */ void * bootstrap_malloc(size_t size) { if (unlikely(size == 0)) { size = 1; } return a0ialloc(size, false, false); } void * bootstrap_calloc(size_t num, size_t size) { size_t num_size; num_size = num * size; if (unlikely(num_size == 0)) { assert(num == 0 || size == 0); num_size = 1; } return a0ialloc(num_size, true, false); } void bootstrap_free(void *ptr) { if (unlikely(ptr == NULL)) { return; } a0idalloc(ptr, false); } void arena_set(unsigned ind, arena_t *arena) { atomic_store_p(&arenas[ind], arena, ATOMIC_RELEASE); } static void narenas_total_set(unsigned narenas) { atomic_store_u(&narenas_total, narenas, ATOMIC_RELEASE); } static void narenas_total_inc(void) { atomic_fetch_add_u(&narenas_total, 1, ATOMIC_RELEASE); } unsigned narenas_total_get(void) { return atomic_load_u(&narenas_total, ATOMIC_ACQUIRE); } /* Create a new arena and insert it into the arenas array at index ind. 
*/ static arena_t * arena_init_locked(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { arena_t *arena; assert(ind <= narenas_total_get()); if (ind >= MALLOCX_ARENA_LIMIT) { return NULL; } if (ind == narenas_total_get()) { narenas_total_inc(); } /* * Another thread may have already initialized arenas[ind] if it's an * auto arena. */ arena = arena_get(tsdn, ind, false); if (arena != NULL) { assert(ind < narenas_auto); return arena; } /* Actually initialize the arena. */ arena = arena_new(tsdn, ind, extent_hooks); return arena; } static void arena_new_create_background_thread(tsdn_t *tsdn, unsigned ind) { if (ind == 0) { return; } if (have_background_thread) { bool err; malloc_mutex_lock(tsdn, &background_thread_lock); err = background_thread_create(tsdn_tsd(tsdn), ind); malloc_mutex_unlock(tsdn, &background_thread_lock); if (err) { malloc_printf(": error in background thread " "creation for arena %u. Abort.\n", ind); abort(); } } } arena_t * arena_init(tsdn_t *tsdn, unsigned ind, extent_hooks_t *extent_hooks) { arena_t *arena; malloc_mutex_lock(tsdn, &arenas_lock); arena = arena_init_locked(tsdn, ind, extent_hooks); malloc_mutex_unlock(tsdn, &arenas_lock); arena_new_create_background_thread(tsdn, ind); return arena; } static void arena_bind(tsd_t *tsd, unsigned ind, bool internal) { arena_t *arena = arena_get(tsd_tsdn(tsd), ind, false); arena_nthreads_inc(arena, internal); if (internal) { tsd_iarena_set(tsd, arena); } else { tsd_arena_set(tsd, arena); } } void arena_migrate(tsd_t *tsd, unsigned oldind, unsigned newind) { arena_t *oldarena, *newarena; oldarena = arena_get(tsd_tsdn(tsd), oldind, false); newarena = arena_get(tsd_tsdn(tsd), newind, false); arena_nthreads_dec(oldarena, false); arena_nthreads_inc(newarena, false); tsd_arena_set(tsd, newarena); } static void arena_unbind(tsd_t *tsd, unsigned ind, bool internal) { arena_t *arena; arena = arena_get(tsd_tsdn(tsd), ind, false); arena_nthreads_dec(arena, internal); if (internal) { tsd_iarena_set(tsd, NULL); } else { tsd_arena_set(tsd, NULL); } } arena_tdata_t * arena_tdata_get_hard(tsd_t *tsd, unsigned ind) { arena_tdata_t *tdata, *arenas_tdata_old; arena_tdata_t *arenas_tdata = tsd_arenas_tdata_get(tsd); unsigned narenas_tdata_old, i; unsigned narenas_tdata = tsd_narenas_tdata_get(tsd); unsigned narenas_actual = narenas_total_get(); /* * Dissociate old tdata array (and set up for deallocation upon return) * if it's too small. */ if (arenas_tdata != NULL && narenas_tdata < narenas_actual) { arenas_tdata_old = arenas_tdata; narenas_tdata_old = narenas_tdata; arenas_tdata = NULL; narenas_tdata = 0; tsd_arenas_tdata_set(tsd, arenas_tdata); tsd_narenas_tdata_set(tsd, narenas_tdata); } else { arenas_tdata_old = NULL; narenas_tdata_old = 0; } /* Allocate tdata array if it's missing. */ if (arenas_tdata == NULL) { bool *arenas_tdata_bypassp = tsd_arenas_tdata_bypassp_get(tsd); narenas_tdata = (ind < narenas_actual) ? narenas_actual : ind+1; if (tsd_nominal(tsd) && !*arenas_tdata_bypassp) { *arenas_tdata_bypassp = true; arenas_tdata = (arena_tdata_t *)a0malloc( sizeof(arena_tdata_t) * narenas_tdata); *arenas_tdata_bypassp = false; } if (arenas_tdata == NULL) { tdata = NULL; goto label_return; } assert(tsd_nominal(tsd) && !*arenas_tdata_bypassp); tsd_arenas_tdata_set(tsd, arenas_tdata); tsd_narenas_tdata_set(tsd, narenas_tdata); } /* * Copy to tdata array. 
It's possible that the actual number of arenas * has increased since narenas_total_get() was called above, but that * causes no correctness issues unless two threads concurrently execute * the arenas.create mallctl, which we trust mallctl synchronization to * prevent. */ /* Copy/initialize tickers. */ for (i = 0; i < narenas_actual; i++) { if (i < narenas_tdata_old) { ticker_copy(&arenas_tdata[i].decay_ticker, &arenas_tdata_old[i].decay_ticker); } else { ticker_init(&arenas_tdata[i].decay_ticker, DECAY_NTICKS_PER_UPDATE); } } if (narenas_tdata > narenas_actual) { memset(&arenas_tdata[narenas_actual], 0, sizeof(arena_tdata_t) * (narenas_tdata - narenas_actual)); } /* Read the refreshed tdata array. */ tdata = &arenas_tdata[ind]; label_return: if (arenas_tdata_old != NULL) { a0dalloc(arenas_tdata_old); } return tdata; } /* Slow path, called only by arena_choose(). */ arena_t * arena_choose_hard(tsd_t *tsd, bool internal) { arena_t *ret JEMALLOC_CC_SILENCE_INIT(NULL); if (have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena)) { unsigned choose = percpu_arena_choose(); ret = arena_get(tsd_tsdn(tsd), choose, true); assert(ret != NULL); arena_bind(tsd, arena_ind_get(ret), false); arena_bind(tsd, arena_ind_get(ret), true); return ret; } if (narenas_auto > 1) { unsigned i, j, choose[2], first_null; bool is_new_arena[2]; /* * Determine binding for both non-internal and internal * allocation. * * choose[0]: For application allocation. * choose[1]: For internal metadata allocation. */ for (j = 0; j < 2; j++) { choose[j] = 0; is_new_arena[j] = false; } first_null = narenas_auto; malloc_mutex_lock(tsd_tsdn(tsd), &arenas_lock); assert(arena_get(tsd_tsdn(tsd), 0, false) != NULL); for (i = 1; i < narenas_auto; i++) { if (arena_get(tsd_tsdn(tsd), i, false) != NULL) { /* * Choose the first arena that has the lowest * number of threads assigned to it. */ for (j = 0; j < 2; j++) { if (arena_nthreads_get(arena_get( tsd_tsdn(tsd), i, false), !!j) < arena_nthreads_get(arena_get( tsd_tsdn(tsd), choose[j], false), !!j)) { choose[j] = i; } } } else if (first_null == narenas_auto) { /* * Record the index of the first uninitialized * arena, in case all extant arenas are in use. * * NB: It is possible for there to be * discontinuities in terms of initialized * versus uninitialized arenas, due to the * "thread.arena" mallctl. */ first_null = i; } } for (j = 0; j < 2; j++) { if (arena_nthreads_get(arena_get(tsd_tsdn(tsd), choose[j], false), !!j) == 0 || first_null == narenas_auto) { /* * Use an unloaded arena, or the least loaded * arena if all arenas are already initialized. */ if (!!j == internal) { ret = arena_get(tsd_tsdn(tsd), choose[j], false); } } else { arena_t *arena; /* Initialize a new arena. 
*/ choose[j] = first_null; arena = arena_init_locked(tsd_tsdn(tsd), choose[j], (extent_hooks_t *)&extent_hooks_default); if (arena == NULL) { malloc_mutex_unlock(tsd_tsdn(tsd), &arenas_lock); return NULL; } is_new_arena[j] = true; if (!!j == internal) { ret = arena; } } arena_bind(tsd, choose[j], !!j); } malloc_mutex_unlock(tsd_tsdn(tsd), &arenas_lock); for (j = 0; j < 2; j++) { if (is_new_arena[j]) { assert(choose[j] > 0); arena_new_create_background_thread( tsd_tsdn(tsd), choose[j]); } } } else { ret = arena_get(tsd_tsdn(tsd), 0, false); arena_bind(tsd, 0, false); arena_bind(tsd, 0, true); } return ret; } void iarena_cleanup(tsd_t *tsd) { arena_t *iarena; iarena = tsd_iarena_get(tsd); if (iarena != NULL) { arena_unbind(tsd, arena_ind_get(iarena), true); } } void arena_cleanup(tsd_t *tsd) { arena_t *arena; arena = tsd_arena_get(tsd); if (arena != NULL) { arena_unbind(tsd, arena_ind_get(arena), false); } } void arenas_tdata_cleanup(tsd_t *tsd) { arena_tdata_t *arenas_tdata; /* Prevent tsd->arenas_tdata from being (re)created. */ *tsd_arenas_tdata_bypassp_get(tsd) = true; arenas_tdata = tsd_arenas_tdata_get(tsd); if (arenas_tdata != NULL) { tsd_arenas_tdata_set(tsd, NULL); a0dalloc(arenas_tdata); } } static void stats_print_atexit(void) { if (config_stats) { tsdn_t *tsdn; unsigned narenas, i; tsdn = tsdn_fetch(); /* * Merge stats from extant threads. This is racy, since * individual threads do not lock when recording tcache stats * events. As a consequence, the final stats may be slightly * out of date by the time they are reported, if other threads * continue to allocate. */ for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { arena_t *arena = arena_get(tsdn, i, false); if (arena != NULL) { tcache_t *tcache; malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx); ql_foreach(tcache, &arena->tcache_ql, link) { tcache_stats_merge(tsdn, tcache, arena); } malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx); } } } je_malloc_stats_print(NULL, NULL, opt_stats_print_opts); } /* * Ensure that we don't hold any locks upon entry to or exit from allocator * code (in a "broad" sense that doesn't count a reentrant allocation as an * entrance or exit). */ JEMALLOC_ALWAYS_INLINE void check_entry_exit_locking(tsdn_t *tsdn) { if (!config_debug) { return; } if (tsdn_null(tsdn)) { return; } tsd_t *tsd = tsdn_tsd(tsdn); /* * It's possible we hold locks at entry/exit if we're in a nested * allocation. */ int8_t reentrancy_level = tsd_reentrancy_level_get(tsd); if (reentrancy_level != 0) { return; } witness_assert_lockless(tsdn_witness_tsdp_get(tsdn)); } /* * End miscellaneous support functions. */ /******************************************************************************/ /* * Begin initialization functions. */ static char * jemalloc_secure_getenv(const char *name) { #ifdef JEMALLOC_HAVE_SECURE_GETENV return secure_getenv(name); #else # ifdef JEMALLOC_HAVE_ISSETUGID if (issetugid() != 0) { return NULL; } # endif return getenv(name); #endif } static unsigned malloc_ncpus(void) { long result; #ifdef _WIN32 SYSTEM_INFO si; GetSystemInfo(&si); result = si.dwNumberOfProcessors; #elif defined(JEMALLOC_GLIBC_MALLOC_HOOK) && defined(CPU_COUNT) /* * glibc >= 2.6 has the CPU_COUNT macro. * * glibc's sysconf() uses isspace(). glibc allocates for the first time * *before* setting up the isspace tables. Therefore we need a * different method to get the number of CPUs. 
*/ { cpu_set_t set; pthread_getaffinity_np(pthread_self(), sizeof(set), &set); result = CPU_COUNT(&set); } #else result = sysconf(_SC_NPROCESSORS_ONLN); #endif return ((result == -1) ? 1 : (unsigned)result); } static void init_opt_stats_print_opts(const char *v, size_t vlen) { size_t opts_len = strlen(opt_stats_print_opts); assert(opts_len <= stats_print_tot_num_options); for (size_t i = 0; i < vlen; i++) { switch (v[i]) { #define OPTION(o, v, d, s) case o: break; STATS_PRINT_OPTIONS #undef OPTION default: continue; } if (strchr(opt_stats_print_opts, v[i]) != NULL) { /* Ignore repeated. */ continue; } opt_stats_print_opts[opts_len++] = v[i]; opt_stats_print_opts[opts_len] = '\0'; assert(opts_len <= stats_print_tot_num_options); } assert(opts_len == strlen(opt_stats_print_opts)); } static bool malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p, char const **v_p, size_t *vlen_p) { bool accept; const char *opts = *opts_p; *k_p = opts; for (accept = false; !accept;) { switch (*opts) { case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '_': opts++; break; case ':': opts++; *klen_p = (uintptr_t)opts - 1 - (uintptr_t)*k_p; *v_p = opts; accept = true; break; case '\0': if (opts != *opts_p) { malloc_write(": Conf string ends " "with key\n"); } return true; default: malloc_write(": Malformed conf string\n"); return true; } } for (accept = false; !accept;) { switch (*opts) { case ',': opts++; /* * Look ahead one character here, because the next time * this function is called, it will assume that end of * input has been cleanly reached if no input remains, * but we have optimistically already consumed the * comma if one exists. */ if (*opts == '\0') { malloc_write(": Conf string ends " "with comma\n"); } *vlen_p = (uintptr_t)opts - 1 - (uintptr_t)*v_p; accept = true; break; case '\0': *vlen_p = (uintptr_t)opts - (uintptr_t)*v_p; accept = true; break; default: opts++; break; } } *opts_p = opts; return false; } static void malloc_abort_invalid_conf(void) { assert(opt_abort_conf); malloc_printf(": Abort (abort_conf:true) on invalid conf " "value (see above).\n"); abort(); } static void malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v, size_t vlen) { malloc_printf(": %s: %.*s:%.*s\n", msg, (int)klen, k, (int)vlen, v); /* If abort_conf is set, error out after processing all options. */ had_conf_error = true; } static void malloc_slow_flag_init(void) { /* * Combine the runtime options into malloc_slow for fast path. Called * after processing all the options. */ malloc_slow_flags |= (opt_junk_alloc ? flag_opt_junk_alloc : 0) | (opt_junk_free ? flag_opt_junk_free : 0) | (opt_zero ? flag_opt_zero : 0) | (opt_utrace ? flag_opt_utrace : 0) | (opt_xmalloc ? flag_opt_xmalloc : 0); malloc_slow = (malloc_slow_flags != 0); } static void malloc_conf_init(void) { unsigned i; char buf[PATH_MAX + 1]; const char *opts, *k, *v; size_t klen, vlen; for (i = 0; i < 4; i++) { /* Get runtime configuration. 
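		 * The four passes below are processed in order: (0) the string
		 * baked in at configure time (config_malloc_conf), (1) the
		 * je_malloc_conf symbol linked into the program, (2) the name
		 * of the /etc/malloc.conf symbolic link, and (3) the
		 * MALLOC_CONF environment variable (JEMALLOC_CPREFIX-prefixed
		 * when a prefix is configured).  Each pass simply re-assigns
		 * the opt_* variables it recognizes, so when a key appears in
		 * several sources the later one generally wins; e.g. running
		 *
		 *	MALLOC_CONF="narenas:4,dirty_decay_ms:5000" ./prog
		 *
		 * overrides a compiled-in "narenas:1".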
*/ switch (i) { case 0: opts = config_malloc_conf; break; case 1: if (je_malloc_conf != NULL) { /* * Use options that were compiled into the * program. */ opts = je_malloc_conf; } else { /* No configuration specified. */ buf[0] = '\0'; opts = buf; } break; case 2: { ssize_t linklen = 0; #ifndef _WIN32 int saved_errno = errno; const char *linkname = # ifdef JEMALLOC_PREFIX "/etc/"JEMALLOC_PREFIX"malloc.conf" # else "/etc/malloc.conf" # endif ; /* * Try to use the contents of the "/etc/malloc.conf" * symbolic link's name. */ linklen = readlink(linkname, buf, sizeof(buf) - 1); if (linklen == -1) { /* No configuration specified. */ linklen = 0; /* Restore errno. */ set_errno(saved_errno); } #endif buf[linklen] = '\0'; opts = buf; break; } case 3: { const char *envname = #ifdef JEMALLOC_PREFIX JEMALLOC_CPREFIX"MALLOC_CONF" #else "MALLOC_CONF" #endif ; if ((opts = jemalloc_secure_getenv(envname)) != NULL) { /* * Do nothing; opts is already initialized to * the value of the MALLOC_CONF environment * variable. */ } else { /* No configuration specified. */ buf[0] = '\0'; opts = buf; } break; } default: not_reached(); buf[0] = '\0'; opts = buf; } while (*opts != '\0' && !malloc_conf_next(&opts, &k, &klen, &v, &vlen)) { #define CONF_MATCH(n) \ (sizeof(n)-1 == klen && strncmp(n, k, klen) == 0) #define CONF_MATCH_VALUE(n) \ (sizeof(n)-1 == vlen && strncmp(n, v, vlen) == 0) #define CONF_HANDLE_BOOL(o, n) \ if (CONF_MATCH(n)) { \ if (CONF_MATCH_VALUE("true")) { \ o = true; \ } else if (CONF_MATCH_VALUE("false")) { \ o = false; \ } else { \ malloc_conf_error( \ "Invalid conf value", \ k, klen, v, vlen); \ } \ continue; \ } #define CONF_MIN_no(um, min) false #define CONF_MIN_yes(um, min) ((um) < (min)) #define CONF_MAX_no(um, max) false #define CONF_MAX_yes(um, max) ((um) > (max)) #define CONF_HANDLE_T_U(t, o, n, min, max, check_min, check_max, clip) \ if (CONF_MATCH(n)) { \ uintmax_t um; \ char *end; \ \ set_errno(0); \ um = malloc_strtoumax(v, &end, 0); \ if (get_errno() != 0 || (uintptr_t)end -\ (uintptr_t)v != vlen) { \ malloc_conf_error( \ "Invalid conf value", \ k, klen, v, vlen); \ } else if (clip) { \ if (CONF_MIN_##check_min(um, \ (t)(min))) { \ o = (t)(min); \ } else if ( \ CONF_MAX_##check_max(um, \ (t)(max))) { \ o = (t)(max); \ } else { \ o = (t)um; \ } \ } else { \ if (CONF_MIN_##check_min(um, \ (t)(min)) || \ CONF_MAX_##check_max(um, \ (t)(max))) { \ malloc_conf_error( \ "Out-of-range " \ "conf value", \ k, klen, v, vlen); \ } else { \ o = (t)um; \ } \ } \ continue; \ } #define CONF_HANDLE_UNSIGNED(o, n, min, max, check_min, check_max, \ clip) \ CONF_HANDLE_T_U(unsigned, o, n, min, max, \ check_min, check_max, clip) #define CONF_HANDLE_SIZE_T(o, n, min, max, check_min, check_max, clip) \ CONF_HANDLE_T_U(size_t, o, n, min, max, \ check_min, check_max, clip) #define CONF_HANDLE_SSIZE_T(o, n, min, max) \ if (CONF_MATCH(n)) { \ long l; \ char *end; \ \ set_errno(0); \ l = strtol(v, &end, 0); \ if (get_errno() != 0 || (uintptr_t)end -\ (uintptr_t)v != vlen) { \ malloc_conf_error( \ "Invalid conf value", \ k, klen, v, vlen); \ } else if (l < (ssize_t)(min) || l > \ (ssize_t)(max)) { \ malloc_conf_error( \ "Out-of-range conf value", \ k, klen, v, vlen); \ } else { \ o = l; \ } \ continue; \ } #define CONF_HANDLE_CHAR_P(o, n, d) \ if (CONF_MATCH(n)) { \ size_t cpylen = (vlen <= \ sizeof(o)-1) ? 
vlen : \ sizeof(o)-1; \ strncpy(o, v, cpylen); \ o[cpylen] = '\0'; \ continue; \ } CONF_HANDLE_BOOL(opt_abort, "abort") CONF_HANDLE_BOOL(opt_abort_conf, "abort_conf") if (strncmp("metadata_thp", k, klen) == 0) { int i; bool match = false; for (i = 0; i < metadata_thp_mode_limit; i++) { if (strncmp(metadata_thp_mode_names[i], v, vlen) == 0) { opt_metadata_thp = i; match = true; break; } } if (!match) { malloc_conf_error("Invalid conf value", k, klen, v, vlen); } continue; } CONF_HANDLE_BOOL(opt_retain, "retain") if (strncmp("dss", k, klen) == 0) { int i; bool match = false; for (i = 0; i < dss_prec_limit; i++) { if (strncmp(dss_prec_names[i], v, vlen) == 0) { if (extent_dss_prec_set(i)) { malloc_conf_error( "Error setting dss", k, klen, v, vlen); } else { opt_dss = dss_prec_names[i]; match = true; break; } } } if (!match) { malloc_conf_error("Invalid conf value", k, klen, v, vlen); } continue; } CONF_HANDLE_UNSIGNED(opt_narenas, "narenas", 1, UINT_MAX, yes, no, false) CONF_HANDLE_SSIZE_T(opt_dirty_decay_ms, "dirty_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) < QU(SSIZE_MAX) ? NSTIME_SEC_MAX * KQU(1000) : SSIZE_MAX); CONF_HANDLE_SSIZE_T(opt_muzzy_decay_ms, "muzzy_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) < QU(SSIZE_MAX) ? NSTIME_SEC_MAX * KQU(1000) : SSIZE_MAX); CONF_HANDLE_BOOL(opt_stats_print, "stats_print") if (CONF_MATCH("stats_print_opts")) { init_opt_stats_print_opts(v, vlen); continue; } if (config_fill) { if (CONF_MATCH("junk")) { if (CONF_MATCH_VALUE("true")) { opt_junk = "true"; opt_junk_alloc = opt_junk_free = true; } else if (CONF_MATCH_VALUE("false")) { opt_junk = "false"; opt_junk_alloc = opt_junk_free = false; } else if (CONF_MATCH_VALUE("alloc")) { opt_junk = "alloc"; opt_junk_alloc = true; opt_junk_free = false; } else if (CONF_MATCH_VALUE("free")) { opt_junk = "free"; opt_junk_alloc = false; opt_junk_free = true; } else { malloc_conf_error( "Invalid conf value", k, klen, v, vlen); } continue; } CONF_HANDLE_BOOL(opt_zero, "zero") } if (config_utrace) { CONF_HANDLE_BOOL(opt_utrace, "utrace") } if (config_xmalloc) { CONF_HANDLE_BOOL(opt_xmalloc, "xmalloc") } CONF_HANDLE_BOOL(opt_tcache, "tcache") CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, "lg_extent_max_active_fit", 0, (sizeof(size_t) << 3), yes, yes, false) CONF_HANDLE_SSIZE_T(opt_lg_tcache_max, "lg_tcache_max", -1, (sizeof(size_t) << 3) - 1) if (strncmp("percpu_arena", k, klen) == 0) { bool match = false; for (int i = percpu_arena_mode_names_base; i < percpu_arena_mode_names_limit; i++) { if (strncmp(percpu_arena_mode_names[i], v, vlen) == 0) { if (!have_percpu_arena) { malloc_conf_error( "No getcpu support", k, klen, v, vlen); } opt_percpu_arena = i; match = true; break; } } if (!match) { malloc_conf_error("Invalid conf value", k, klen, v, vlen); } continue; } CONF_HANDLE_BOOL(opt_background_thread, "background_thread"); CONF_HANDLE_SIZE_T(opt_max_background_threads, "max_background_threads", 1, opt_max_background_threads, yes, yes, true); if (config_prof) { CONF_HANDLE_BOOL(opt_prof, "prof") CONF_HANDLE_CHAR_P(opt_prof_prefix, "prof_prefix", "jeprof") CONF_HANDLE_BOOL(opt_prof_active, "prof_active") CONF_HANDLE_BOOL(opt_prof_thread_active_init, "prof_thread_active_init") CONF_HANDLE_SIZE_T(opt_lg_prof_sample, "lg_prof_sample", 0, (sizeof(uint64_t) << 3) - 1, no, yes, true) CONF_HANDLE_BOOL(opt_prof_accum, "prof_accum") CONF_HANDLE_SSIZE_T(opt_lg_prof_interval, "lg_prof_interval", -1, (sizeof(uint64_t) << 3) - 1) CONF_HANDLE_BOOL(opt_prof_gdump, "prof_gdump") CONF_HANDLE_BOOL(opt_prof_final, "prof_final") 
CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak") } if (config_log) { if (CONF_MATCH("log")) { size_t cpylen = ( vlen <= sizeof(log_var_names) ? vlen : sizeof(log_var_names) - 1); strncpy(log_var_names, v, cpylen); log_var_names[cpylen] = '\0'; continue; } } if (CONF_MATCH("thp")) { bool match = false; for (int i = 0; i < thp_mode_names_limit; i++) { if (strncmp(thp_mode_names[i],v, vlen) == 0) { if (!have_madvise_huge) { malloc_conf_error( "No THP support", k, klen, v, vlen); } opt_thp = i; match = true; break; } } if (!match) { malloc_conf_error("Invalid conf value", k, klen, v, vlen); } continue; } malloc_conf_error("Invalid conf pair", k, klen, v, vlen); #undef CONF_MATCH #undef CONF_MATCH_VALUE #undef CONF_HANDLE_BOOL #undef CONF_MIN_no #undef CONF_MIN_yes #undef CONF_MAX_no #undef CONF_MAX_yes #undef CONF_HANDLE_T_U #undef CONF_HANDLE_UNSIGNED #undef CONF_HANDLE_SIZE_T #undef CONF_HANDLE_SSIZE_T #undef CONF_HANDLE_CHAR_P } if (opt_abort_conf && had_conf_error) { malloc_abort_invalid_conf(); } } atomic_store_b(&log_init_done, true, ATOMIC_RELEASE); } static bool malloc_init_hard_needed(void) { if (malloc_initialized() || (IS_INITIALIZER && malloc_init_state == malloc_init_recursible)) { /* * Another thread initialized the allocator before this one * acquired init_lock, or this thread is the initializing * thread, and it is recursively allocating. */ return false; } #ifdef JEMALLOC_THREADED_INIT if (malloc_initializer != NO_INITIALIZER && !IS_INITIALIZER) { /* Busy-wait until the initializing thread completes. */ spin_t spinner = SPIN_INITIALIZER; do { malloc_mutex_unlock(TSDN_NULL, &init_lock); spin_adaptive(&spinner); malloc_mutex_lock(TSDN_NULL, &init_lock); } while (!malloc_initialized()); return false; } #endif return true; } static bool malloc_init_hard_a0_locked() { malloc_initializer = INITIALIZER; if (config_prof) { prof_boot0(); } malloc_conf_init(); if (opt_stats_print) { /* Print statistics at exit. */ if (atexit(stats_print_atexit) != 0) { malloc_write(": Error in atexit()\n"); if (opt_abort) { abort(); } } } if (pages_boot()) { return true; } if (base_boot(TSDN_NULL)) { return true; } if (extent_boot()) { return true; } if (ctl_boot()) { return true; } if (config_prof) { prof_boot1(); } arena_boot(); if (tcache_boot(TSDN_NULL)) { return true; } if (malloc_mutex_init(&arenas_lock, "arenas", WITNESS_RANK_ARENAS, malloc_mutex_rank_exclusive)) { return true; } /* * Create enough scaffolding to allow recursive allocation in * malloc_ncpus(). */ narenas_auto = 1; memset(arenas, 0, sizeof(arena_t *) * narenas_auto); /* * Initialize one arena here. The rest are lazily created in * arena_choose_hard(). */ if (arena_init(TSDN_NULL, 0, (extent_hooks_t *)&extent_hooks_default) == NULL) { return true; } a0 = arena_get(TSDN_NULL, 0, false); malloc_init_state = malloc_init_a0_initialized; return false; } static bool malloc_init_hard_a0(void) { bool ret; malloc_mutex_lock(TSDN_NULL, &init_lock); ret = malloc_init_hard_a0_locked(); malloc_mutex_unlock(TSDN_NULL, &init_lock); return ret; } /* Initialize data structures which may trigger recursive allocation. */ static bool malloc_init_hard_recursible(void) { malloc_init_state = malloc_init_recursible; ncpus = malloc_ncpus(); #if (defined(JEMALLOC_HAVE_PTHREAD_ATFORK) && !defined(JEMALLOC_MUTEX_INIT_CB) \ && !defined(JEMALLOC_ZONE) && !defined(_WIN32) && \ !defined(__native_client__)) /* LinuxThreads' pthread_atfork() allocates. 
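	 * Because that allocation can recurse into the allocator, the
	 * handlers are registered here in malloc_init_hard_recursible(),
	 * after malloc_init_hard_a0() has set up arena 0 and
	 * malloc_init_state has been switched to malloc_init_recursible, so
	 * the nested request can be satisfied.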
*/ if (pthread_atfork(jemalloc_prefork, jemalloc_postfork_parent, jemalloc_postfork_child) != 0) { malloc_write(": Error in pthread_atfork()\n"); if (opt_abort) { abort(); } return true; } #endif if (background_thread_boot0()) { return true; } return false; } static unsigned malloc_narenas_default(void) { assert(ncpus > 0); /* * For SMP systems, create more than one arena per CPU by * default. */ if (ncpus > 1) { return ncpus << 2; } else { return 1; } } static percpu_arena_mode_t percpu_arena_as_initialized(percpu_arena_mode_t mode) { assert(!malloc_initialized()); assert(mode <= percpu_arena_disabled); if (mode != percpu_arena_disabled) { mode += percpu_arena_mode_enabled_base; } return mode; } static bool malloc_init_narenas(void) { assert(ncpus > 0); if (opt_percpu_arena != percpu_arena_disabled) { if (!have_percpu_arena || malloc_getcpu() < 0) { opt_percpu_arena = percpu_arena_disabled; malloc_printf(": perCPU arena getcpu() not " "available. Setting narenas to %u.\n", opt_narenas ? opt_narenas : malloc_narenas_default()); if (opt_abort) { abort(); } } else { if (ncpus >= MALLOCX_ARENA_LIMIT) { malloc_printf(": narenas w/ percpu" "arena beyond limit (%d)\n", ncpus); if (opt_abort) { abort(); } return true; } /* NB: opt_percpu_arena isn't fully initialized yet. */ if (percpu_arena_as_initialized(opt_percpu_arena) == per_phycpu_arena && ncpus % 2 != 0) { malloc_printf(": invalid " "configuration -- per physical CPU arena " "with odd number (%u) of CPUs (no hyper " "threading?).\n", ncpus); if (opt_abort) abort(); } unsigned n = percpu_arena_ind_limit( percpu_arena_as_initialized(opt_percpu_arena)); if (opt_narenas < n) { /* * If narenas is specified with percpu_arena * enabled, actual narenas is set as the greater * of the two. percpu_arena_choose will be free * to use any of the arenas based on CPU * id. This is conservative (at a small cost) * but ensures correctness. * * If for some reason the ncpus determined at * boot is not the actual number (e.g. because * of affinity setting from numactl), reserving * narenas this way provides a workaround for * percpu_arena. */ opt_narenas = n; } } } if (opt_narenas == 0) { opt_narenas = malloc_narenas_default(); } assert(opt_narenas > 0); narenas_auto = opt_narenas; /* * Limit the number of arenas to the indexing range of MALLOCX_ARENA(). 
*/ if (narenas_auto >= MALLOCX_ARENA_LIMIT) { narenas_auto = MALLOCX_ARENA_LIMIT - 1; malloc_printf(": Reducing narenas to limit (%d)\n", narenas_auto); } narenas_total_set(narenas_auto); return false; } static void malloc_init_percpu(void) { opt_percpu_arena = percpu_arena_as_initialized(opt_percpu_arena); } static bool malloc_init_hard_finish(void) { if (malloc_mutex_boot()) { return true; } malloc_init_state = malloc_init_initialized; malloc_slow_flag_init(); return false; } static void malloc_init_hard_cleanup(tsdn_t *tsdn, bool reentrancy_set) { malloc_mutex_assert_owner(tsdn, &init_lock); malloc_mutex_unlock(tsdn, &init_lock); if (reentrancy_set) { assert(!tsdn_null(tsdn)); tsd_t *tsd = tsdn_tsd(tsdn); assert(tsd_reentrancy_level_get(tsd) > 0); post_reentrancy(tsd); } } static bool malloc_init_hard(void) { tsd_t *tsd; #if defined(_WIN32) && _WIN32_WINNT < 0x0600 _init_init_lock(); #endif malloc_mutex_lock(TSDN_NULL, &init_lock); #define UNLOCK_RETURN(tsdn, ret, reentrancy) \ malloc_init_hard_cleanup(tsdn, reentrancy); \ return ret; if (!malloc_init_hard_needed()) { UNLOCK_RETURN(TSDN_NULL, false, false) } if (malloc_init_state != malloc_init_a0_initialized && malloc_init_hard_a0_locked()) { UNLOCK_RETURN(TSDN_NULL, true, false) } malloc_mutex_unlock(TSDN_NULL, &init_lock); /* Recursive allocation relies on functional tsd. */ tsd = malloc_tsd_boot0(); if (tsd == NULL) { return true; } if (malloc_init_hard_recursible()) { return true; } malloc_mutex_lock(tsd_tsdn(tsd), &init_lock); /* Set reentrancy level to 1 during init. */ pre_reentrancy(tsd, NULL); /* Initialize narenas before prof_boot2 (for allocation). */ if (malloc_init_narenas() || background_thread_boot1(tsd_tsdn(tsd))) { UNLOCK_RETURN(tsd_tsdn(tsd), true, true) } if (config_prof && prof_boot2(tsd)) { UNLOCK_RETURN(tsd_tsdn(tsd), true, true) } malloc_init_percpu(); if (malloc_init_hard_finish()) { UNLOCK_RETURN(tsd_tsdn(tsd), true, true) } post_reentrancy(tsd); malloc_mutex_unlock(tsd_tsdn(tsd), &init_lock); witness_assert_lockless(witness_tsd_tsdn( tsd_witness_tsdp_get_unsafe(tsd))); malloc_tsd_boot1(); /* Update TSD after tsd_boot1. */ tsd = tsd_fetch(); if (opt_background_thread) { assert(have_background_thread); /* * Need to finish init & unlock first before creating background * threads (pthread_create depends on malloc). ctl_init (which * sets isthreaded) needs to be called without holding any lock. */ background_thread_ctl_init(tsd_tsdn(tsd)); malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); bool err = background_thread_create(tsd, 0); malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); if (err) { return true; } } #undef UNLOCK_RETURN return false; } /* * End initialization functions. */ /******************************************************************************/ /* * Begin allocation-path internal functions and data structures. */ /* * Settings determined by the documented behavior of the allocation functions. */ typedef struct static_opts_s static_opts_t; struct static_opts_s { /* Whether or not allocation size may overflow. */ bool may_overflow; /* Whether or not allocations of size 0 should be treated as size 1. */ bool bump_empty_alloc; /* * Whether to assert that allocations are not of size 0 (after any * bumping). */ bool assert_nonempty_alloc; /* * Whether or not to modify the 'result' argument to malloc in case of * error. */ bool null_out_result_on_error; /* Whether to set errno when we encounter an error condition. 
*/ bool set_errno_on_error; /* * The minimum valid alignment for functions requesting aligned storage. */ size_t min_alignment; /* The error string to use if we oom. */ const char *oom_string; /* The error string to use if the passed-in alignment is invalid. */ const char *invalid_alignment_string; /* * False if we're configured to skip some time-consuming operations. * * This isn't really a malloc "behavior", but it acts as a useful * summary of several other static (or at least, static after program * initialization) options. */ bool slow; }; JEMALLOC_ALWAYS_INLINE void static_opts_init(static_opts_t *static_opts) { static_opts->may_overflow = false; static_opts->bump_empty_alloc = false; static_opts->assert_nonempty_alloc = false; static_opts->null_out_result_on_error = false; static_opts->set_errno_on_error = false; static_opts->min_alignment = 0; static_opts->oom_string = ""; static_opts->invalid_alignment_string = ""; static_opts->slow = false; } /* * These correspond to the macros in jemalloc/jemalloc_macros.h. Broadly, we * should have one constant here per magic value there. Note however that the * representations need not be related. */ #define TCACHE_IND_NONE ((unsigned)-1) #define TCACHE_IND_AUTOMATIC ((unsigned)-2) #define ARENA_IND_AUTOMATIC ((unsigned)-1) typedef struct dynamic_opts_s dynamic_opts_t; struct dynamic_opts_s { void **result; size_t num_items; size_t item_size; size_t alignment; bool zero; unsigned tcache_ind; unsigned arena_ind; }; JEMALLOC_ALWAYS_INLINE void dynamic_opts_init(dynamic_opts_t *dynamic_opts) { dynamic_opts->result = NULL; dynamic_opts->num_items = 0; dynamic_opts->item_size = 0; dynamic_opts->alignment = 0; dynamic_opts->zero = false; dynamic_opts->tcache_ind = TCACHE_IND_AUTOMATIC; dynamic_opts->arena_ind = ARENA_IND_AUTOMATIC; } /* ind is ignored if dopts->alignment > 0. */ JEMALLOC_ALWAYS_INLINE void * imalloc_no_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd, size_t size, size_t usize, szind_t ind) { tcache_t *tcache; arena_t *arena; /* Fill in the tcache. */ if (dopts->tcache_ind == TCACHE_IND_AUTOMATIC) { if (likely(!sopts->slow)) { /* Getting tcache ptr unconditionally. */ tcache = tsd_tcachep_get(tsd); assert(tcache == tcache_get(tsd)); } else { tcache = tcache_get(tsd); } } else if (dopts->tcache_ind == TCACHE_IND_NONE) { tcache = NULL; } else { tcache = tcaches_get(tsd, dopts->tcache_ind); } /* Fill in the arena. */ if (dopts->arena_ind == ARENA_IND_AUTOMATIC) { /* * In case of automatic arena management, we defer arena * computation until as late as we can, hoping to fill the * allocation out of the tcache. */ arena = NULL; } else { arena = arena_get(tsd_tsdn(tsd), dopts->arena_ind, true); } if (unlikely(dopts->alignment != 0)) { return ipalloct(tsd_tsdn(tsd), usize, dopts->alignment, dopts->zero, tcache, arena); } return iallocztm(tsd_tsdn(tsd), size, ind, dopts->zero, tcache, false, arena, sopts->slow); } JEMALLOC_ALWAYS_INLINE void * imalloc_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd, size_t usize, szind_t ind) { void *ret; /* * For small allocations, sampling bumps the usize. If so, we allocate * from the ind_large bucket. */ szind_t ind_large; size_t bumped_usize = usize; if (usize <= SMALL_MAXCLASS) { assert(((dopts->alignment == 0) ? 
sz_s2u(LARGE_MINCLASS) : sz_sa2u(LARGE_MINCLASS, dopts->alignment)) == LARGE_MINCLASS); ind_large = sz_size2index(LARGE_MINCLASS); bumped_usize = sz_s2u(LARGE_MINCLASS); ret = imalloc_no_sample(sopts, dopts, tsd, bumped_usize, bumped_usize, ind_large); if (unlikely(ret == NULL)) { return NULL; } arena_prof_promote(tsd_tsdn(tsd), ret, usize); } else { ret = imalloc_no_sample(sopts, dopts, tsd, usize, usize, ind); } return ret; } /* * Returns true if the allocation will overflow, and false otherwise. Sets * *size to the product either way. */ JEMALLOC_ALWAYS_INLINE bool compute_size_with_overflow(bool may_overflow, dynamic_opts_t *dopts, size_t *size) { /* * This function is just num_items * item_size, except that we may have * to check for overflow. */ if (!may_overflow) { assert(dopts->num_items == 1); *size = dopts->item_size; return false; } /* A size_t with its high-half bits all set to 1. */ static const size_t high_bits = SIZE_T_MAX << (sizeof(size_t) * 8 / 2); *size = dopts->item_size * dopts->num_items; if (unlikely(*size == 0)) { return (dopts->num_items != 0 && dopts->item_size != 0); } /* * We got a non-zero size, but we don't know if we overflowed to get * there. To avoid having to do a divide, we'll be clever and note that * if both A and B can be represented in N/2 bits, then their product * can be represented in N bits (without the possibility of overflow). */ if (likely((high_bits & (dopts->num_items | dopts->item_size)) == 0)) { return false; } if (likely(*size / dopts->item_size == dopts->num_items)) { return false; } return true; } JEMALLOC_ALWAYS_INLINE int imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { /* Where the actual allocated memory will live. */ void *allocation = NULL; /* Filled in by compute_size_with_overflow below. */ size_t size = 0; /* * For unaligned allocations, we need only ind. For aligned * allocations, or in case of stats or profiling we need usize. * * These are actually dead stores, in that their values are reset before * any branch on their value is taken. Sometimes though, it's * convenient to pass them as arguments before this point. To avoid * undefined behavior then, we initialize them with dummy stores. */ szind_t ind = 0; size_t usize = 0; /* Reentrancy is only checked on slow path. */ int8_t reentrancy_level; /* Compute the amount of memory the user wants. */ if (unlikely(compute_size_with_overflow(sopts->may_overflow, dopts, &size))) { goto label_oom; } /* Validate the user input. */ if (sopts->bump_empty_alloc) { if (unlikely(size == 0)) { size = 1; } } if (sopts->assert_nonempty_alloc) { assert (size != 0); } if (unlikely(dopts->alignment < sopts->min_alignment || (dopts->alignment & (dopts->alignment - 1)) != 0)) { goto label_invalid_alignment; } /* This is the beginning of the "core" algorithm. */ if (dopts->alignment == 0) { ind = sz_size2index(size); if (unlikely(ind >= NSIZES)) { goto label_oom; } if (config_stats || (config_prof && opt_prof)) { usize = sz_index2size(ind); assert(usize > 0 && usize <= LARGE_MAXCLASS); } } else { usize = sz_sa2u(size, dopts->alignment); if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { goto label_oom; } } check_entry_exit_locking(tsd_tsdn(tsd)); /* * If we need to handle reentrancy, we can do it out of a * known-initialized arena (i.e. arena 0). */ reentrancy_level = tsd_reentrancy_level_get(tsd); if (sopts->slow && unlikely(reentrancy_level > 0)) { /* * We should never specify particular arenas or tcaches from * within our internal allocations. 
*/ assert(dopts->tcache_ind == TCACHE_IND_AUTOMATIC || dopts->tcache_ind == TCACHE_IND_NONE); assert(dopts->arena_ind == ARENA_IND_AUTOMATIC); dopts->tcache_ind = TCACHE_IND_NONE; /* We know that arena 0 has already been initialized. */ dopts->arena_ind = 0; } /* If profiling is on, get our profiling context. */ if (config_prof && opt_prof) { /* * Note that if we're going down this path, usize must have been * initialized in the previous if statement. */ prof_tctx_t *tctx = prof_alloc_prep( tsd, usize, prof_active_get_unlocked(), true); alloc_ctx_t alloc_ctx; if (likely((uintptr_t)tctx == (uintptr_t)1U)) { alloc_ctx.slab = (usize <= SMALL_MAXCLASS); allocation = imalloc_no_sample( sopts, dopts, tsd, usize, usize, ind); } else if ((uintptr_t)tctx > (uintptr_t)1U) { /* * Note that ind might still be 0 here. This is fine; * imalloc_sample ignores ind if dopts->alignment > 0. */ allocation = imalloc_sample( sopts, dopts, tsd, usize, ind); alloc_ctx.slab = false; } else { allocation = NULL; } if (unlikely(allocation == NULL)) { prof_alloc_rollback(tsd, tctx, true); goto label_oom; } prof_malloc(tsd_tsdn(tsd), allocation, usize, &alloc_ctx, tctx); } else { /* * If dopts->alignment > 0, then ind is still 0, but usize was * computed in the previous if statement. Down the positive * alignment path, imalloc_no_sample ignores ind and size * (relying only on usize). */ allocation = imalloc_no_sample(sopts, dopts, tsd, size, usize, ind); if (unlikely(allocation == NULL)) { goto label_oom; } } /* * Allocation has been done at this point. We still have some * post-allocation work to do though. */ assert(dopts->alignment == 0 || ((uintptr_t)allocation & (dopts->alignment - 1)) == ZU(0)); if (config_stats) { assert(usize == isalloc(tsd_tsdn(tsd), allocation)); *tsd_thread_allocatedp_get(tsd) += usize; } if (sopts->slow) { UTRACE(0, size, allocation); } /* Success! */ check_entry_exit_locking(tsd_tsdn(tsd)); *dopts->result = allocation; return 0; label_oom: if (unlikely(sopts->slow) && config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(sopts->oom_string); abort(); } if (sopts->slow) { UTRACE(NULL, size, NULL); } check_entry_exit_locking(tsd_tsdn(tsd)); if (sopts->set_errno_on_error) { set_errno(ENOMEM); } if (sopts->null_out_result_on_error) { *dopts->result = NULL; } return ENOMEM; /* * This label is only jumped to by one goto; we move it out of line * anyways to avoid obscuring the non-error paths, and for symmetry with * the oom case. */ label_invalid_alignment: if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(sopts->invalid_alignment_string); abort(); } if (sopts->set_errno_on_error) { set_errno(EINVAL); } if (sopts->slow) { UTRACE(NULL, size, NULL); } check_entry_exit_locking(tsd_tsdn(tsd)); if (sopts->null_out_result_on_error) { *dopts->result = NULL; } return EINVAL; } /* Returns the errno-style error code of the allocation. */ JEMALLOC_ALWAYS_INLINE int imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) { if (unlikely(!malloc_initialized()) && unlikely(malloc_init())) { if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(sopts->oom_string); abort(); } UTRACE(NULL, dopts->num_items * dopts->item_size, NULL); set_errno(ENOMEM); *dopts->result = NULL; return ENOMEM; } /* We always need the tsd. Let's grab it right away. */ tsd_t *tsd = tsd_fetch(); assert(tsd); if (likely(tsd_fast(tsd))) { /* Fast and common path. 
*/ tsd_assert_fast(tsd); sopts->slow = false; return imalloc_body(sopts, dopts, tsd); } else { sopts->slow = true; return imalloc_body(sopts, dopts, tsd); } } /******************************************************************************/ /* * Begin malloc(3)-compatible functions. */ JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) je_malloc(size_t size) { void *ret; static_opts_t sopts; dynamic_opts_t dopts; LOG("core.malloc.entry", "size: %zu", size); static_opts_init(&sopts); dynamic_opts_init(&dopts); sopts.bump_empty_alloc = true; sopts.null_out_result_on_error = true; sopts.set_errno_on_error = true; sopts.oom_string = ": Error in malloc(): out of memory\n"; dopts.result = &ret; dopts.num_items = 1; dopts.item_size = size; imalloc(&sopts, &dopts); LOG("core.malloc.exit", "result: %p", ret); return ret; } JEMALLOC_EXPORT int JEMALLOC_NOTHROW JEMALLOC_ATTR(nonnull(1)) je_posix_memalign(void **memptr, size_t alignment, size_t size) { int ret; static_opts_t sopts; dynamic_opts_t dopts; LOG("core.posix_memalign.entry", "mem ptr: %p, alignment: %zu, " "size: %zu", memptr, alignment, size); static_opts_init(&sopts); dynamic_opts_init(&dopts); sopts.bump_empty_alloc = true; sopts.min_alignment = sizeof(void *); sopts.oom_string = ": Error allocating aligned memory: out of memory\n"; sopts.invalid_alignment_string = ": Error allocating aligned memory: invalid alignment\n"; dopts.result = memptr; dopts.num_items = 1; dopts.item_size = size; dopts.alignment = alignment; ret = imalloc(&sopts, &dopts); LOG("core.posix_memalign.exit", "result: %d, alloc ptr: %p", ret, *memptr); return ret; } JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2) je_aligned_alloc(size_t alignment, size_t size) { void *ret; static_opts_t sopts; dynamic_opts_t dopts; LOG("core.aligned_alloc.entry", "alignment: %zu, size: %zu\n", alignment, size); static_opts_init(&sopts); dynamic_opts_init(&dopts); sopts.bump_empty_alloc = true; sopts.null_out_result_on_error = true; sopts.set_errno_on_error = true; sopts.min_alignment = 1; sopts.oom_string = ": Error allocating aligned memory: out of memory\n"; sopts.invalid_alignment_string = ": Error allocating aligned memory: invalid alignment\n"; dopts.result = &ret; dopts.num_items = 1; dopts.item_size = size; dopts.alignment = alignment; imalloc(&sopts, &dopts); LOG("core.aligned_alloc.exit", "result: %p", ret); return ret; } JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2) je_calloc(size_t num, size_t size) { void *ret; static_opts_t sopts; dynamic_opts_t dopts; LOG("core.calloc.entry", "num: %zu, size: %zu\n", num, size); static_opts_init(&sopts); dynamic_opts_init(&dopts); sopts.may_overflow = true; sopts.bump_empty_alloc = true; sopts.null_out_result_on_error = true; sopts.set_errno_on_error = true; sopts.oom_string = ": Error in calloc(): out of memory\n"; dopts.result = &ret; dopts.num_items = num; dopts.item_size = size; dopts.zero = true; imalloc(&sopts, &dopts); LOG("core.calloc.exit", "result: %p", ret); return ret; } static void * irealloc_prof_sample(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize, prof_tctx_t *tctx) { void *p; if (tctx == NULL) { return NULL; } if (usize <= SMALL_MAXCLASS) { p = iralloc(tsd, old_ptr, old_usize, LARGE_MINCLASS, 0, false); if (p == NULL) { return NULL; } arena_prof_promote(tsd_tsdn(tsd), 
p, usize); } else { p = iralloc(tsd, old_ptr, old_usize, usize, 0, false); } return p; } JEMALLOC_ALWAYS_INLINE void * irealloc_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t usize, alloc_ctx_t *alloc_ctx) { void *p; bool prof_active; prof_tctx_t *old_tctx, *tctx; prof_active = prof_active_get_unlocked(); old_tctx = prof_tctx_get(tsd_tsdn(tsd), old_ptr, alloc_ctx); tctx = prof_alloc_prep(tsd, usize, prof_active, true); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { p = irealloc_prof_sample(tsd, old_ptr, old_usize, usize, tctx); } else { p = iralloc(tsd, old_ptr, old_usize, usize, 0, false); } if (unlikely(p == NULL)) { prof_alloc_rollback(tsd, tctx, true); return NULL; } prof_realloc(tsd, p, usize, tctx, prof_active, true, old_ptr, old_usize, old_tctx); return p; } JEMALLOC_ALWAYS_INLINE void ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) { if (!slow_path) { tsd_assert_fast(tsd); } check_entry_exit_locking(tsd_tsdn(tsd)); if (tsd_reentrancy_level_get(tsd) != 0) { assert(slow_path); } assert(ptr != NULL); assert(malloc_initialized() || IS_INITIALIZER); alloc_ctx_t alloc_ctx; rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); assert(alloc_ctx.szind != NSIZES); size_t usize; if (config_prof && opt_prof) { usize = sz_index2size(alloc_ctx.szind); prof_free(tsd, ptr, usize, &alloc_ctx); } else if (config_stats) { usize = sz_index2size(alloc_ctx.szind); } if (config_stats) { *tsd_thread_deallocatedp_get(tsd) += usize; } if (likely(!slow_path)) { idalloctm(tsd_tsdn(tsd), ptr, tcache, &alloc_ctx, false, false); } else { idalloctm(tsd_tsdn(tsd), ptr, tcache, &alloc_ctx, false, true); } } JEMALLOC_ALWAYS_INLINE void isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) { if (!slow_path) { tsd_assert_fast(tsd); } check_entry_exit_locking(tsd_tsdn(tsd)); if (tsd_reentrancy_level_get(tsd) != 0) { assert(slow_path); } assert(ptr != NULL); assert(malloc_initialized() || IS_INITIALIZER); alloc_ctx_t alloc_ctx, *ctx; if (!config_cache_oblivious && ((uintptr_t)ptr & PAGE_MASK) != 0) { /* * When cache_oblivious is disabled and ptr is not page aligned, * the allocation was not sampled -- usize can be used to * determine szind directly. 
*/ alloc_ctx.szind = sz_size2index(usize); alloc_ctx.slab = true; ctx = &alloc_ctx; if (config_debug) { alloc_ctx_t dbg_ctx; rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &dbg_ctx.szind, &dbg_ctx.slab); assert(dbg_ctx.szind == alloc_ctx.szind); assert(dbg_ctx.slab == alloc_ctx.slab); } } else if (config_prof && opt_prof) { rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); assert(alloc_ctx.szind == sz_size2index(usize)); ctx = &alloc_ctx; } else { ctx = NULL; } if (config_prof && opt_prof) { prof_free(tsd, ptr, usize, ctx); } if (config_stats) { *tsd_thread_deallocatedp_get(tsd) += usize; } if (likely(!slow_path)) { isdalloct(tsd_tsdn(tsd), ptr, usize, tcache, ctx, false); } else { isdalloct(tsd_tsdn(tsd), ptr, usize, tcache, ctx, true); } } JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * JEMALLOC_ALLOC_SIZE(2) je_realloc(void *ptr, size_t size) { void *ret; tsdn_t *tsdn JEMALLOC_CC_SILENCE_INIT(NULL); size_t usize JEMALLOC_CC_SILENCE_INIT(0); size_t old_usize = 0; LOG("core.realloc.entry", "ptr: %p, size: %zu\n", ptr, size); if (unlikely(size == 0)) { - if (ptr != NULL) { - /* realloc(ptr, 0) is equivalent to free(ptr). */ - UTRACE(ptr, 0, 0); - tcache_t *tcache; - tsd_t *tsd = tsd_fetch(); - if (tsd_reentrancy_level_get(tsd) == 0) { - tcache = tcache_get(tsd); - } else { - tcache = NULL; - } - ifree(tsd, ptr, tcache, true); - - LOG("core.realloc.exit", "result: %p", NULL); - return NULL; - } size = 1; } if (likely(ptr != NULL)) { assert(malloc_initialized() || IS_INITIALIZER); tsd_t *tsd = tsd_fetch(); check_entry_exit_locking(tsd_tsdn(tsd)); alloc_ctx_t alloc_ctx; rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); assert(alloc_ctx.szind != NSIZES); old_usize = sz_index2size(alloc_ctx.szind); assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); if (config_prof && opt_prof) { usize = sz_s2u(size); ret = unlikely(usize == 0 || usize > LARGE_MAXCLASS) ? NULL : irealloc_prof(tsd, ptr, old_usize, usize, &alloc_ctx); } else { if (config_stats) { usize = sz_s2u(size); } ret = iralloc(tsd, ptr, old_usize, size, 0, false); } tsdn = tsd_tsdn(tsd); } else { /* realloc(NULL, size) is equivalent to malloc(size). */ void *ret = je_malloc(size); LOG("core.realloc.exit", "result: %p", ret); return ret; } if (unlikely(ret == NULL)) { if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(": Error in realloc(): " "out of memory\n"); abort(); } set_errno(ENOMEM); } if (config_stats && likely(ret != NULL)) { tsd_t *tsd; assert(usize == isalloc(tsdn, ret)); tsd = tsdn_tsd(tsdn); *tsd_thread_allocatedp_get(tsd) += usize; *tsd_thread_deallocatedp_get(tsd) += old_usize; } UTRACE(ptr, size, ret); check_entry_exit_locking(tsdn); LOG("core.realloc.exit", "result: %p", ret); return ret; } JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_free(void *ptr) { LOG("core.free.entry", "ptr: %p", ptr); UTRACE(ptr, 0, 0); if (likely(ptr != NULL)) { /* * We avoid setting up tsd fully (e.g. tcache, arena binding) * based on only free() calls -- other activities trigger the * minimal to full transition. 
This is because free() may * happen during thread shutdown after tls deallocation: if a * thread never had any malloc activities until then, a * fully-setup tsd won't be destructed properly. */ tsd_t *tsd = tsd_fetch_min(); check_entry_exit_locking(tsd_tsdn(tsd)); tcache_t *tcache; if (likely(tsd_fast(tsd))) { tsd_assert_fast(tsd); /* Unconditionally get tcache ptr on fast path. */ tcache = tsd_tcachep_get(tsd); ifree(tsd, ptr, tcache, false); } else { if (likely(tsd_reentrancy_level_get(tsd) == 0)) { tcache = tcache_get(tsd); } else { tcache = NULL; } ifree(tsd, ptr, tcache, true); } check_entry_exit_locking(tsd_tsdn(tsd)); } LOG("core.free.exit", ""); } /* * End malloc(3)-compatible functions. */ /******************************************************************************/ /* * Begin non-standard override functions. */ #ifdef JEMALLOC_OVERRIDE_MEMALIGN JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) je_memalign(size_t alignment, size_t size) { void *ret; static_opts_t sopts; dynamic_opts_t dopts; LOG("core.memalign.entry", "alignment: %zu, size: %zu\n", alignment, size); static_opts_init(&sopts); dynamic_opts_init(&dopts); sopts.bump_empty_alloc = true; sopts.min_alignment = 1; sopts.oom_string = ": Error allocating aligned memory: out of memory\n"; sopts.invalid_alignment_string = ": Error allocating aligned memory: invalid alignment\n"; sopts.null_out_result_on_error = true; dopts.result = &ret; dopts.num_items = 1; dopts.item_size = size; dopts.alignment = alignment; imalloc(&sopts, &dopts); LOG("core.memalign.exit", "result: %p", ret); return ret; } #endif #ifdef JEMALLOC_OVERRIDE_VALLOC JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) je_valloc(size_t size) { void *ret; static_opts_t sopts; dynamic_opts_t dopts; LOG("core.valloc.entry", "size: %zu\n", size); static_opts_init(&sopts); dynamic_opts_init(&dopts); sopts.bump_empty_alloc = true; sopts.null_out_result_on_error = true; sopts.min_alignment = PAGE; sopts.oom_string = ": Error allocating aligned memory: out of memory\n"; sopts.invalid_alignment_string = ": Error allocating aligned memory: invalid alignment\n"; dopts.result = &ret; dopts.num_items = 1; dopts.item_size = size; dopts.alignment = PAGE; imalloc(&sopts, &dopts); LOG("core.valloc.exit", "result: %p\n", ret); return ret; } #endif #if defined(JEMALLOC_IS_MALLOC) && defined(JEMALLOC_GLIBC_MALLOC_HOOK) /* * glibc provides the RTLD_DEEPBIND flag for dlopen which can make it possible * to inconsistently reference libc's malloc(3)-compatible functions * (https://bugzilla.mozilla.org/show_bug.cgi?id=493541). * * These definitions interpose hooks in glibc. The functions are actually * passed an extra argument for the caller return address, which will be * ignored. */ JEMALLOC_EXPORT void (*__free_hook)(void *ptr) = je_free; JEMALLOC_EXPORT void *(*__malloc_hook)(size_t size) = je_malloc; JEMALLOC_EXPORT void *(*__realloc_hook)(void *ptr, size_t size) = je_realloc; # ifdef JEMALLOC_GLIBC_MEMALIGN_HOOK JEMALLOC_EXPORT void *(*__memalign_hook)(size_t alignment, size_t size) = je_memalign; # endif # ifdef CPU_COUNT /* * To enable static linking with glibc, the libc specific malloc interface must * be implemented also, so none of glibc's malloc.o functions are added to the * link. */ # define ALIAS(je_fn) __attribute__((alias (#je_fn), used)) /* To force macro expansion of je_ prefix before stringification. 
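 * ALIAS() stringifies its argument with the # operator, and a macro argument
 * that appears next to # is not itself macro-expanded.  Going through the
 * extra PREALIAS() level expands je_calloc and friends (which may themselves
 * be renaming macros) before ALIAS() stringifies the result, so the alias
 * attribute ends up naming the real, possibly prefixed, symbol.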
*/ # define PREALIAS(je_fn) ALIAS(je_fn) # ifdef JEMALLOC_OVERRIDE___LIBC_CALLOC void *__libc_calloc(size_t n, size_t size) PREALIAS(je_calloc); # endif # ifdef JEMALLOC_OVERRIDE___LIBC_FREE void __libc_free(void* ptr) PREALIAS(je_free); # endif # ifdef JEMALLOC_OVERRIDE___LIBC_MALLOC void *__libc_malloc(size_t size) PREALIAS(je_malloc); # endif # ifdef JEMALLOC_OVERRIDE___LIBC_MEMALIGN void *__libc_memalign(size_t align, size_t s) PREALIAS(je_memalign); # endif # ifdef JEMALLOC_OVERRIDE___LIBC_REALLOC void *__libc_realloc(void* ptr, size_t size) PREALIAS(je_realloc); # endif # ifdef JEMALLOC_OVERRIDE___LIBC_VALLOC void *__libc_valloc(size_t size) PREALIAS(je_valloc); # endif # ifdef JEMALLOC_OVERRIDE___POSIX_MEMALIGN int __posix_memalign(void** r, size_t a, size_t s) PREALIAS(je_posix_memalign); # endif # undef PREALIAS # undef ALIAS # endif #endif /* * End non-standard override functions. */ /******************************************************************************/ /* * Begin non-standard functions. */ JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) je_mallocx(size_t size, int flags) { void *ret; static_opts_t sopts; dynamic_opts_t dopts; LOG("core.mallocx.entry", "size: %zu, flags: %d", size, flags); static_opts_init(&sopts); dynamic_opts_init(&dopts); sopts.assert_nonempty_alloc = true; sopts.null_out_result_on_error = true; sopts.oom_string = ": Error in mallocx(): out of memory\n"; dopts.result = &ret; dopts.num_items = 1; dopts.item_size = size; if (unlikely(flags != 0)) { if ((flags & MALLOCX_LG_ALIGN_MASK) != 0) { dopts.alignment = MALLOCX_ALIGN_GET_SPECIFIED(flags); } dopts.zero = MALLOCX_ZERO_GET(flags); if ((flags & MALLOCX_TCACHE_MASK) != 0) { if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE) { dopts.tcache_ind = TCACHE_IND_NONE; } else { dopts.tcache_ind = MALLOCX_TCACHE_GET(flags); } } else { dopts.tcache_ind = TCACHE_IND_AUTOMATIC; } if ((flags & MALLOCX_ARENA_MASK) != 0) dopts.arena_ind = MALLOCX_ARENA_GET(flags); } imalloc(&sopts, &dopts); LOG("core.mallocx.exit", "result: %p", ret); return ret; } static void * irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize, size_t usize, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena, prof_tctx_t *tctx) { void *p; if (tctx == NULL) { return NULL; } if (usize <= SMALL_MAXCLASS) { p = iralloct(tsdn, old_ptr, old_usize, LARGE_MINCLASS, alignment, zero, tcache, arena); if (p == NULL) { return NULL; } arena_prof_promote(tsdn, p, usize); } else { p = iralloct(tsdn, old_ptr, old_usize, usize, alignment, zero, tcache, arena); } return p; } JEMALLOC_ALWAYS_INLINE void * irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size, size_t alignment, size_t *usize, bool zero, tcache_t *tcache, arena_t *arena, alloc_ctx_t *alloc_ctx) { void *p; bool prof_active; prof_tctx_t *old_tctx, *tctx; prof_active = prof_active_get_unlocked(); old_tctx = prof_tctx_get(tsd_tsdn(tsd), old_ptr, alloc_ctx); tctx = prof_alloc_prep(tsd, *usize, prof_active, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { p = irallocx_prof_sample(tsd_tsdn(tsd), old_ptr, old_usize, *usize, alignment, zero, tcache, arena, tctx); } else { p = iralloct(tsd_tsdn(tsd), old_ptr, old_usize, size, alignment, zero, tcache, arena); } if (unlikely(p == NULL)) { prof_alloc_rollback(tsd, tctx, false); return NULL; } if (p == old_ptr && alignment != 0) { /* * The allocation did not move, so it is possible that the size * class is smaller than would 
guarantee the requested * alignment, and that the alignment constraint was * serendipitously satisfied. Additionally, old_usize may not * be the same as the current usize because of in-place large * reallocation. Therefore, query the actual value of usize. */ *usize = isalloc(tsd_tsdn(tsd), p); } prof_realloc(tsd, p, *usize, tctx, prof_active, false, old_ptr, old_usize, old_tctx); return p; } JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * JEMALLOC_ALLOC_SIZE(2) je_rallocx(void *ptr, size_t size, int flags) { void *p; tsd_t *tsd; size_t usize; size_t old_usize; size_t alignment = MALLOCX_ALIGN_GET(flags); bool zero = flags & MALLOCX_ZERO; arena_t *arena; tcache_t *tcache; LOG("core.rallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr, size, flags); assert(ptr != NULL); assert(size != 0); assert(malloc_initialized() || IS_INITIALIZER); tsd = tsd_fetch(); check_entry_exit_locking(tsd_tsdn(tsd)); if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { unsigned arena_ind = MALLOCX_ARENA_GET(flags); arena = arena_get(tsd_tsdn(tsd), arena_ind, true); if (unlikely(arena == NULL)) { goto label_oom; } } else { arena = NULL; } if (unlikely((flags & MALLOCX_TCACHE_MASK) != 0)) { if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE) { tcache = NULL; } else { tcache = tcaches_get(tsd, MALLOCX_TCACHE_GET(flags)); } } else { tcache = tcache_get(tsd); } alloc_ctx_t alloc_ctx; rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); assert(alloc_ctx.szind != NSIZES); old_usize = sz_index2size(alloc_ctx.szind); assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); if (config_prof && opt_prof) { usize = (alignment == 0) ? sz_s2u(size) : sz_sa2u(size, alignment); if (unlikely(usize == 0 || usize > LARGE_MAXCLASS)) { goto label_oom; } p = irallocx_prof(tsd, ptr, old_usize, size, alignment, &usize, zero, tcache, arena, &alloc_ctx); if (unlikely(p == NULL)) { goto label_oom; } } else { p = iralloct(tsd_tsdn(tsd), ptr, old_usize, size, alignment, zero, tcache, arena); if (unlikely(p == NULL)) { goto label_oom; } if (config_stats) { usize = isalloc(tsd_tsdn(tsd), p); } } assert(alignment == 0 || ((uintptr_t)p & (alignment - 1)) == ZU(0)); if (config_stats) { *tsd_thread_allocatedp_get(tsd) += usize; *tsd_thread_deallocatedp_get(tsd) += old_usize; } UTRACE(ptr, size, p); check_entry_exit_locking(tsd_tsdn(tsd)); LOG("core.rallocx.exit", "result: %p", p); return p; label_oom: if (config_xmalloc && unlikely(opt_xmalloc)) { malloc_write(": Error in rallocx(): out of memory\n"); abort(); } UTRACE(ptr, size, 0); check_entry_exit_locking(tsd_tsdn(tsd)); LOG("core.rallocx.exit", "result: %p", NULL); return NULL; } JEMALLOC_ALWAYS_INLINE size_t ixallocx_helper(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size, size_t extra, size_t alignment, bool zero) { size_t usize; if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero)) { return old_usize; } usize = isalloc(tsdn, ptr); return usize; } static size_t ixallocx_prof_sample(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size, size_t extra, size_t alignment, bool zero, prof_tctx_t *tctx) { size_t usize; if (tctx == NULL) { return old_usize; } usize = ixallocx_helper(tsdn, ptr, old_usize, size, extra, alignment, zero); return usize; } JEMALLOC_ALWAYS_INLINE size_t ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, size_t extra, size_t alignment, bool zero, alloc_ctx_t *alloc_ctx) { size_t usize_max, usize; 
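	/*
	 * prof_alloc_prep() returns (prof_tctx_t *)(uintptr_t)1U as a sentinel
	 * meaning "do not sample this allocation"; any other value is a live
	 * tctx, so the comparisons against 1U here and in irallocx_prof()
	 * select between the sampled and plain (re)allocation paths.
	 */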
bool prof_active; prof_tctx_t *old_tctx, *tctx; prof_active = prof_active_get_unlocked(); old_tctx = prof_tctx_get(tsd_tsdn(tsd), ptr, alloc_ctx); /* * usize isn't knowable before ixalloc() returns when extra is non-zero. * Therefore, compute its maximum possible value and use that in * prof_alloc_prep() to decide whether to capture a backtrace. * prof_realloc() will use the actual usize to decide whether to sample. */ if (alignment == 0) { usize_max = sz_s2u(size+extra); assert(usize_max > 0 && usize_max <= LARGE_MAXCLASS); } else { usize_max = sz_sa2u(size+extra, alignment); if (unlikely(usize_max == 0 || usize_max > LARGE_MAXCLASS)) { /* * usize_max is out of range, and chances are that * allocation will fail, but use the maximum possible * value and carry on with prof_alloc_prep(), just in * case allocation succeeds. */ usize_max = LARGE_MAXCLASS; } } tctx = prof_alloc_prep(tsd, usize_max, prof_active, false); if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { usize = ixallocx_prof_sample(tsd_tsdn(tsd), ptr, old_usize, size, extra, alignment, zero, tctx); } else { usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, extra, alignment, zero); } if (usize == old_usize) { prof_alloc_rollback(tsd, tctx, false); return usize; } prof_realloc(tsd, ptr, usize, tctx, prof_active, false, ptr, old_usize, old_tctx); return usize; } JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, size_t extra, int flags) { tsd_t *tsd; size_t usize, old_usize; size_t alignment = MALLOCX_ALIGN_GET(flags); bool zero = flags & MALLOCX_ZERO; LOG("core.xallocx.entry", "ptr: %p, size: %zu, extra: %zu, " "flags: %d", ptr, size, extra, flags); assert(ptr != NULL); assert(size != 0); assert(SIZE_T_MAX - size >= extra); assert(malloc_initialized() || IS_INITIALIZER); tsd = tsd_fetch(); check_entry_exit_locking(tsd_tsdn(tsd)); alloc_ctx_t alloc_ctx; rtree_ctx_t *rtree_ctx = tsd_rtree_ctx(tsd); rtree_szind_slab_read(tsd_tsdn(tsd), &extents_rtree, rtree_ctx, (uintptr_t)ptr, true, &alloc_ctx.szind, &alloc_ctx.slab); assert(alloc_ctx.szind != NSIZES); old_usize = sz_index2size(alloc_ctx.szind); assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); /* * The API explicitly absolves itself of protecting against (size + * extra) numerical overflow, but we may need to clamp extra to avoid * exceeding LARGE_MAXCLASS. * * Ordinarily, size limit checking is handled deeper down, but here we * have to check as part of (size + extra) clamping, since we need the * clamped value in the above helper functions. 
*/ if (unlikely(size > LARGE_MAXCLASS)) { usize = old_usize; goto label_not_resized; } if (unlikely(LARGE_MAXCLASS - size < extra)) { extra = LARGE_MAXCLASS - size; } if (config_prof && opt_prof) { usize = ixallocx_prof(tsd, ptr, old_usize, size, extra, alignment, zero, &alloc_ctx); } else { usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, extra, alignment, zero); } if (unlikely(usize == old_usize)) { goto label_not_resized; } if (config_stats) { *tsd_thread_allocatedp_get(tsd) += usize; *tsd_thread_deallocatedp_get(tsd) += old_usize; } label_not_resized: UTRACE(ptr, size, ptr); check_entry_exit_locking(tsd_tsdn(tsd)); LOG("core.xallocx.exit", "result: %zu", usize); return usize; } JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW JEMALLOC_ATTR(pure) je_sallocx(const void *ptr, UNUSED int flags) { size_t usize; tsdn_t *tsdn; LOG("core.sallocx.entry", "ptr: %p, flags: %d", ptr, flags); assert(malloc_initialized() || IS_INITIALIZER); assert(ptr != NULL); tsdn = tsdn_fetch(); check_entry_exit_locking(tsdn); if (config_debug || force_ivsalloc) { usize = ivsalloc(tsdn, ptr); assert(force_ivsalloc || usize != 0); } else { usize = isalloc(tsdn, ptr); } check_entry_exit_locking(tsdn); LOG("core.sallocx.exit", "result: %zu", usize); return usize; } JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags) { LOG("core.dallocx.entry", "ptr: %p, flags: %d", ptr, flags); assert(ptr != NULL); assert(malloc_initialized() || IS_INITIALIZER); tsd_t *tsd = tsd_fetch(); bool fast = tsd_fast(tsd); check_entry_exit_locking(tsd_tsdn(tsd)); tcache_t *tcache; if (unlikely((flags & MALLOCX_TCACHE_MASK) != 0)) { /* Not allowed to be reentrant and specify a custom tcache. */ assert(tsd_reentrancy_level_get(tsd) == 0); if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE) { tcache = NULL; } else { tcache = tcaches_get(tsd, MALLOCX_TCACHE_GET(flags)); } } else { if (likely(fast)) { tcache = tsd_tcachep_get(tsd); assert(tcache == tcache_get(tsd)); } else { if (likely(tsd_reentrancy_level_get(tsd) == 0)) { tcache = tcache_get(tsd); } else { tcache = NULL; } } } UTRACE(ptr, 0, 0); if (likely(fast)) { tsd_assert_fast(tsd); ifree(tsd, ptr, tcache, false); } else { ifree(tsd, ptr, tcache, true); } check_entry_exit_locking(tsd_tsdn(tsd)); LOG("core.dallocx.exit", ""); } JEMALLOC_ALWAYS_INLINE size_t inallocx(tsdn_t *tsdn, size_t size, int flags) { check_entry_exit_locking(tsdn); size_t usize; if (likely((flags & MALLOCX_LG_ALIGN_MASK) == 0)) { usize = sz_s2u(size); } else { usize = sz_sa2u(size, MALLOCX_ALIGN_GET_SPECIFIED(flags)); } check_entry_exit_locking(tsdn); return usize; } JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void *ptr, size_t size, int flags) { assert(ptr != NULL); assert(malloc_initialized() || IS_INITIALIZER); LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr, size, flags); tsd_t *tsd = tsd_fetch(); bool fast = tsd_fast(tsd); size_t usize = inallocx(tsd_tsdn(tsd), size, flags); assert(usize == isalloc(tsd_tsdn(tsd), ptr)); check_entry_exit_locking(tsd_tsdn(tsd)); tcache_t *tcache; if (unlikely((flags & MALLOCX_TCACHE_MASK) != 0)) { /* Not allowed to be reentrant and specify a custom tcache. 
*/ assert(tsd_reentrancy_level_get(tsd) == 0); if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE) { tcache = NULL; } else { tcache = tcaches_get(tsd, MALLOCX_TCACHE_GET(flags)); } } else { if (likely(fast)) { tcache = tsd_tcachep_get(tsd); assert(tcache == tcache_get(tsd)); } else { if (likely(tsd_reentrancy_level_get(tsd) == 0)) { tcache = tcache_get(tsd); } else { tcache = NULL; } } } UTRACE(ptr, 0, 0); if (likely(fast)) { tsd_assert_fast(tsd); isfree(tsd, ptr, usize, tcache, false); } else { isfree(tsd, ptr, usize, tcache, true); } check_entry_exit_locking(tsd_tsdn(tsd)); LOG("core.sdallocx.exit", ""); } JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW JEMALLOC_ATTR(pure) je_nallocx(size_t size, int flags) { size_t usize; tsdn_t *tsdn; assert(size != 0); if (unlikely(malloc_init())) { LOG("core.nallocx.exit", "result: %zu", ZU(0)); return 0; } tsdn = tsdn_fetch(); check_entry_exit_locking(tsdn); usize = inallocx(tsdn, size, flags); if (unlikely(usize > LARGE_MAXCLASS)) { LOG("core.nallocx.exit", "result: %zu", ZU(0)); return 0; } check_entry_exit_locking(tsdn); LOG("core.nallocx.exit", "result: %zu", usize); return usize; } JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; tsd_t *tsd; LOG("core.mallctl.entry", "name: %s", name); if (unlikely(malloc_init())) { LOG("core.mallctl.exit", "result: %d", EAGAIN); return EAGAIN; } tsd = tsd_fetch(); check_entry_exit_locking(tsd_tsdn(tsd)); ret = ctl_byname(tsd, name, oldp, oldlenp, newp, newlen); check_entry_exit_locking(tsd_tsdn(tsd)); LOG("core.mallctl.exit", "result: %d", ret); return ret; } JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) { int ret; LOG("core.mallctlnametomib.entry", "name: %s", name); if (unlikely(malloc_init())) { LOG("core.mallctlnametomib.exit", "result: %d", EAGAIN); return EAGAIN; } tsd_t *tsd = tsd_fetch(); check_entry_exit_locking(tsd_tsdn(tsd)); ret = ctl_nametomib(tsd, name, mibp, miblenp); check_entry_exit_locking(tsd_tsdn(tsd)); LOG("core.mallctlnametomib.exit", "result: %d", ret); return ret; } JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { int ret; tsd_t *tsd; LOG("core.mallctlbymib.entry", ""); if (unlikely(malloc_init())) { LOG("core.mallctlbymib.exit", "result: %d", EAGAIN); return EAGAIN; } tsd = tsd_fetch(); check_entry_exit_locking(tsd_tsdn(tsd)); ret = ctl_bymib(tsd, mib, miblen, oldp, oldlenp, newp, newlen); check_entry_exit_locking(tsd_tsdn(tsd)); LOG("core.mallctlbymib.exit", "result: %d", ret); return ret; } JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque, const char *opts) { tsdn_t *tsdn; LOG("core.malloc_stats_print.entry", ""); tsdn = tsdn_fetch(); check_entry_exit_locking(tsdn); stats_print(write_cb, cbopaque, opts); check_entry_exit_locking(tsdn); LOG("core.malloc_stats_print.exit", ""); } JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) { size_t ret; tsdn_t *tsdn; LOG("core.malloc_usable_size.entry", "ptr: %p", ptr); assert(malloc_initialized() || IS_INITIALIZER); tsdn = tsdn_fetch(); check_entry_exit_locking(tsdn); if (unlikely(ptr == NULL)) { ret = 0; } else { if (config_debug || force_ivsalloc) { ret = ivsalloc(tsdn, ptr); assert(force_ivsalloc || ret != 0); } else { ret = isalloc(tsdn, ptr); } } check_entry_exit_locking(tsdn); 
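	/*
	 * ivsalloc() validates ptr via the extent radix tree and returns 0 for
	 * pointers jemalloc does not own, whereas isalloc() assumes a valid
	 * allocation; hence the config_debug/force_ivsalloc branch above.
	 */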
LOG("core.malloc_usable_size.exit", "result: %zu", ret); return ret; } /* * End non-standard functions. */ /******************************************************************************/ /* * Begin compatibility functions. */ #define ALLOCM_LG_ALIGN(la) (la) #define ALLOCM_ALIGN(a) (ffsl(a)-1) #define ALLOCM_ZERO ((int)0x40) #define ALLOCM_NO_MOVE ((int)0x80) #define ALLOCM_SUCCESS 0 #define ALLOCM_ERR_OOM 1 #define ALLOCM_ERR_NOT_MOVED 2 int je_allocm(void **ptr, size_t *rsize, size_t size, int flags) { assert(ptr != NULL); void *p = je_mallocx(size, flags); if (p == NULL) { return (ALLOCM_ERR_OOM); } if (rsize != NULL) { *rsize = isalloc(tsdn_fetch(), p); } *ptr = p; return ALLOCM_SUCCESS; } int je_rallocm(void **ptr, size_t *rsize, size_t size, size_t extra, int flags) { assert(ptr != NULL); assert(*ptr != NULL); assert(size != 0); assert(SIZE_T_MAX - size >= extra); int ret; bool no_move = flags & ALLOCM_NO_MOVE; if (no_move) { size_t usize = je_xallocx(*ptr, size, extra, flags); ret = (usize >= size) ? ALLOCM_SUCCESS : ALLOCM_ERR_NOT_MOVED; if (rsize != NULL) { *rsize = usize; } } else { void *p = je_rallocx(*ptr, size+extra, flags); if (p != NULL) { *ptr = p; ret = ALLOCM_SUCCESS; } else { ret = ALLOCM_ERR_OOM; } if (rsize != NULL) { *rsize = isalloc(tsdn_fetch(), *ptr); } } return ret; } int je_sallocm(const void *ptr, size_t *rsize, int flags) { assert(rsize != NULL); *rsize = je_sallocx(ptr, flags); return ALLOCM_SUCCESS; } int je_dallocm(void *ptr, int flags) { je_dallocx(ptr, flags); return ALLOCM_SUCCESS; } int je_nallocm(size_t *rsize, size_t size, int flags) { size_t usize = je_nallocx(size, flags); if (usize == 0) { return ALLOCM_ERR_OOM; } if (rsize != NULL) { *rsize = usize; } return ALLOCM_SUCCESS; } #undef ALLOCM_LG_ALIGN #undef ALLOCM_ALIGN #undef ALLOCM_ZERO #undef ALLOCM_NO_MOVE #undef ALLOCM_SUCCESS #undef ALLOCM_ERR_OOM #undef ALLOCM_ERR_NOT_MOVED /* * End compatibility functions. */ /******************************************************************************/ /* * The following functions are used by threading libraries for protection of * malloc during fork(). */ /* * If an application creates a thread before doing any allocation in the main * thread, then calls fork(2) in the main thread followed by memory allocation * in the child process, a race can occur that results in deadlock within the * child: the main thread may have forked while the created thread had * partially initialized the allocator. Ordinarily jemalloc prevents * fork/malloc races via the following functions it registers during * initialization using pthread_atfork(), but of course that does no good if * the allocator isn't fully initialized at fork time. The following library * constructor is a partial solution to this problem. It may still be possible * to trigger the deadlock described above, but doing so would involve forking * via a library constructor that runs before jemalloc's runs. */ #ifndef JEMALLOC_JET JEMALLOC_ATTR(constructor) static void jemalloc_constructor(void) { malloc_init(); } #endif #ifndef JEMALLOC_MUTEX_INIT_CB void jemalloc_prefork(void) #else JEMALLOC_EXPORT void _malloc_prefork(void) #endif { tsd_t *tsd; unsigned i, j, narenas; arena_t *arena; #ifdef JEMALLOC_MUTEX_INIT_CB if (!malloc_initialized()) { return; } #endif assert(malloc_initialized()); tsd = tsd_fetch(); narenas = narenas_total_get(); witness_prefork(tsd_witness_tsdp_get(tsd)); /* Acquire all mutexes in a safe order. 
*/ ctl_prefork(tsd_tsdn(tsd)); tcache_prefork(tsd_tsdn(tsd)); malloc_mutex_prefork(tsd_tsdn(tsd), &arenas_lock); if (have_background_thread) { background_thread_prefork0(tsd_tsdn(tsd)); } prof_prefork0(tsd_tsdn(tsd)); if (have_background_thread) { background_thread_prefork1(tsd_tsdn(tsd)); } /* Break arena prefork into stages to preserve lock order. */ for (i = 0; i < 8; i++) { for (j = 0; j < narenas; j++) { if ((arena = arena_get(tsd_tsdn(tsd), j, false)) != NULL) { switch (i) { case 0: arena_prefork0(tsd_tsdn(tsd), arena); break; case 1: arena_prefork1(tsd_tsdn(tsd), arena); break; case 2: arena_prefork2(tsd_tsdn(tsd), arena); break; case 3: arena_prefork3(tsd_tsdn(tsd), arena); break; case 4: arena_prefork4(tsd_tsdn(tsd), arena); break; case 5: arena_prefork5(tsd_tsdn(tsd), arena); break; case 6: arena_prefork6(tsd_tsdn(tsd), arena); break; case 7: arena_prefork7(tsd_tsdn(tsd), arena); break; default: not_reached(); } } } } prof_prefork1(tsd_tsdn(tsd)); } #ifndef JEMALLOC_MUTEX_INIT_CB void jemalloc_postfork_parent(void) #else JEMALLOC_EXPORT void _malloc_postfork(void) #endif { tsd_t *tsd; unsigned i, narenas; #ifdef JEMALLOC_MUTEX_INIT_CB if (!malloc_initialized()) { return; } #endif assert(malloc_initialized()); tsd = tsd_fetch(); witness_postfork_parent(tsd_witness_tsdp_get(tsd)); /* Release all mutexes, now that fork() has completed. */ for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { arena_t *arena; if ((arena = arena_get(tsd_tsdn(tsd), i, false)) != NULL) { arena_postfork_parent(tsd_tsdn(tsd), arena); } } prof_postfork_parent(tsd_tsdn(tsd)); if (have_background_thread) { background_thread_postfork_parent(tsd_tsdn(tsd)); } malloc_mutex_postfork_parent(tsd_tsdn(tsd), &arenas_lock); tcache_postfork_parent(tsd_tsdn(tsd)); ctl_postfork_parent(tsd_tsdn(tsd)); } void jemalloc_postfork_child(void) { tsd_t *tsd; unsigned i, narenas; assert(malloc_initialized()); tsd = tsd_fetch(); witness_postfork_child(tsd_witness_tsdp_get(tsd)); /* Release all mutexes, now that fork() has completed. */ for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { arena_t *arena; if ((arena = arena_get(tsd_tsdn(tsd), i, false)) != NULL) { arena_postfork_child(tsd_tsdn(tsd), arena); } } prof_postfork_child(tsd_tsdn(tsd)); if (have_background_thread) { background_thread_postfork_child(tsd_tsdn(tsd)); } malloc_mutex_postfork_child(tsd_tsdn(tsd), &arenas_lock); tcache_postfork_child(tsd_tsdn(tsd)); ctl_postfork_child(tsd_tsdn(tsd)); } void _malloc_first_thread(void) { (void)malloc_mutex_first_thread(); } /******************************************************************************/ Index: projects/clang900-import/contrib/netbsd-tests/lib/libc/sys/t_stat.c =================================================================== --- projects/clang900-import/contrib/netbsd-tests/lib/libc/sys/t_stat.c (revision 352536) +++ projects/clang900-import/contrib/netbsd-tests/lib/libc/sys/t_stat.c (revision 352537) @@ -1,419 +1,422 @@ /* $NetBSD: t_stat.c,v 1.5 2017/01/13 20:06:50 christos Exp $ */ /*- * Copyright (c) 2011 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jukka Ruohonen. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __RCSID("$NetBSD: t_stat.c,v 1.5 2017/01/13 20:06:50 christos Exp $"); #include #include #include #include #include #include #include #include #include #include #include #include #include static const char *path = "stat"; ATF_TC_WITH_CLEANUP(stat_chflags); ATF_TC_HEAD(stat_chflags, tc) { atf_tc_set_md_var(tc, "descr", "Test chflags(2) with stat(2)"); } ATF_TC_BODY(stat_chflags, tc) { struct stat sa, sb; int fd; (void)memset(&sa, 0, sizeof(struct stat)); (void)memset(&sb, 0, sizeof(struct stat)); fd = open(path, O_RDONLY | O_CREAT, 0600); ATF_REQUIRE(fd != -1); ATF_REQUIRE(stat(path, &sa) == 0); ATF_REQUIRE(chflags(path, UF_NODUMP) == 0); ATF_REQUIRE(stat(path, &sb) == 0); if (sa.st_flags == sb.st_flags) atf_tc_fail("stat(2) did not detect chflags(2)"); ATF_REQUIRE(close(fd) == 0); ATF_REQUIRE(unlink(path) == 0); } ATF_TC_CLEANUP(stat_chflags, tc) { (void)unlink(path); } ATF_TC(stat_dir); ATF_TC_HEAD(stat_dir, tc) { atf_tc_set_md_var(tc, "descr", "Test stat(2) with directories"); } ATF_TC_BODY(stat_dir, tc) { const short depth = 2; struct stat sa, sb; char *argv[2]; FTSENT *ftse; FTS *fts; int ops; argv[1] = NULL; argv[0] = __UNCONST("/"); ops = FTS_NOCHDIR; ops |= FTS_PHYSICAL; fts = fts_open(argv, ops, NULL); ATF_REQUIRE(fts != NULL); while ((ftse = fts_read(fts)) != NULL) { if (ftse->fts_level < 1) continue; if (ftse->fts_level > depth) { (void)fts_set(fts, ftse, FTS_SKIP); continue; } switch(ftse->fts_info) { case FTS_DP: (void)memset(&sa, 0, sizeof(struct stat)); (void)memset(&sb, 0, sizeof(struct stat)); ATF_REQUIRE(stat(ftse->fts_parent->fts_path,&sa) == 0); ATF_REQUIRE(chdir(ftse->fts_path) == 0); ATF_REQUIRE(stat(".", &sb) == 0); /* * The previous two stat(2) calls * should be for the same directory. */ if (sa.st_dev != sb.st_dev || sa.st_ino != sb.st_ino) atf_tc_fail("inconsistent stat(2)"); /* * Check that fts(3)'s stat(2) * call equals the manual one. 
*/ if (sb.st_ino != ftse->fts_statp->st_ino) atf_tc_fail("stat(2) and fts(3) differ"); break; default: break; } } (void)fts_close(fts); } ATF_TC(stat_err); ATF_TC_HEAD(stat_err, tc) { atf_tc_set_md_var(tc, "descr", "Test errors from the stat(2) family"); } ATF_TC_BODY(stat_err, tc) { char buf[NAME_MAX + 1]; struct stat st; (void)memset(buf, 'x', sizeof(buf)); errno = 0; ATF_REQUIRE_ERRNO(EBADF, fstat(-1, &st) == -1); errno = 0; ATF_REQUIRE_ERRNO(ENAMETOOLONG, stat(buf, &st) == -1); errno = 0; ATF_REQUIRE_ERRNO(ENAMETOOLONG, lstat(buf, &st) == -1); errno = 0; ATF_REQUIRE_ERRNO(EFAULT, stat((void *)-1, &st) == -1); errno = 0; ATF_REQUIRE_ERRNO(EFAULT, lstat((void *)-1, &st) == -1); errno = 0; ATF_REQUIRE_ERRNO(EFAULT, stat("/etc/passwd", (void *)-1) == -1); errno = 0; ATF_REQUIRE_ERRNO(EFAULT, lstat("/etc/passwd", (void *)-1) == -1); errno = 0; ATF_REQUIRE_ERRNO(ENOENT, stat("/a/b/c/d/e/f/g/h/i/j/k", &st) == -1); errno = 0; ATF_REQUIRE_ERRNO(ENOENT, lstat("/a/b/c/d/e/f/g/h/i/j/k", &st) == -1); } ATF_TC_WITH_CLEANUP(stat_mtime); ATF_TC_HEAD(stat_mtime, tc) { atf_tc_set_md_var(tc, "descr", "Test modification times with stat(2)"); } ATF_TC_BODY(stat_mtime, tc) { struct stat sa, sb; int fd[3]; size_t i; for (i = 0; i < __arraycount(fd); i++) { (void)memset(&sa, 0, sizeof(struct stat)); (void)memset(&sb, 0, sizeof(struct stat)); fd[i] = open(path, O_WRONLY | O_CREAT, 0600); ATF_REQUIRE(fd[i] != -1); ATF_REQUIRE(write(fd[i], "X", 1) == 1); ATF_REQUIRE(stat(path, &sa) == 0); (void)sleep(1); ATF_REQUIRE(write(fd[i], "X", 1) == 1); ATF_REQUIRE(stat(path, &sb) == 0); ATF_REQUIRE(close(fd[i]) == 0); ATF_REQUIRE(unlink(path) == 0); if (sa.st_mtime == sb.st_mtime) atf_tc_fail("mtimes did not change"); } } ATF_TC_CLEANUP(stat_mtime, tc) { (void)unlink(path); } ATF_TC_WITH_CLEANUP(stat_perm); ATF_TC_HEAD(stat_perm, tc) { atf_tc_set_md_var(tc, "descr", "Test permissions with stat(2)"); atf_tc_set_md_var(tc, "require.user", "root"); } ATF_TC_BODY(stat_perm, tc) { struct stat sa, sb; gid_t gid; uid_t uid; int fd; (void)memset(&sa, 0, sizeof(struct stat)); (void)memset(&sb, 0, sizeof(struct stat)); uid = getuid(); gid = getgid(); fd = open(path, O_RDONLY | O_CREAT, 0600); ATF_REQUIRE(fd != -1); ATF_REQUIRE(fstat(fd, &sa) == 0); ATF_REQUIRE(stat(path, &sb) == 0); if (gid != sa.st_gid || sa.st_gid != sb.st_gid) atf_tc_fail("invalid GID"); if (uid != sa.st_uid || sa.st_uid != sb.st_uid) atf_tc_fail("invalid UID"); ATF_REQUIRE(close(fd) == 0); ATF_REQUIRE(unlink(path) == 0); } ATF_TC_CLEANUP(stat_perm, tc) { (void)unlink(path); } ATF_TC_WITH_CLEANUP(stat_size); ATF_TC_HEAD(stat_size, tc) { atf_tc_set_md_var(tc, "descr", "Test file sizes with stat(2)"); } ATF_TC_BODY(stat_size, tc) { struct stat sa, sb, sc; const size_t n = 10; size_t i; int fd; fd = open(path, O_WRONLY | O_CREAT, 0600); ATF_REQUIRE(fd >= 0); for (i = 0; i < n; i++) { (void)memset(&sa, 0, sizeof(struct stat)); (void)memset(&sb, 0, sizeof(struct stat)); (void)memset(&sc, 0, sizeof(struct stat)); ATF_REQUIRE(fstat(fd, &sa) == 0); ATF_REQUIRE(write(fd, "X", 1) == 1); ATF_REQUIRE(fstat(fd, &sb) == 0); ATF_REQUIRE(stat(path, &sc) == 0); if (sa.st_size + 1 != sb.st_size) atf_tc_fail("invalid file size"); if (sb.st_size != sc.st_size) atf_tc_fail("stat(2) and fstat(2) mismatch"); } ATF_REQUIRE(close(fd) == 0); ATF_REQUIRE(unlink(path) == 0); } ATF_TC_CLEANUP(stat_size, tc) { (void)unlink(path); } ATF_TC(stat_socket); ATF_TC_HEAD(stat_socket, tc) { atf_tc_set_md_var(tc, "descr", "Test fstat(2) with " "a socket (PR kern/46077)"); } ATF_TC_BODY(stat_socket, 
tc) { struct sockaddr_in addr; struct stat st; uint32_t iaddr; int fd, flags; + if (atf_tc_get_config_var_as_bool_wd(tc, "ci", false)) + atf_tc_skip("https://bugs.freebsd.org/240621"); + (void)memset(&st, 0, sizeof(struct stat)); (void)memset(&addr, 0, sizeof(struct sockaddr_in)); fd = socket(AF_INET, SOCK_STREAM, 0); ATF_REQUIRE(fd >= 0); flags = fcntl(fd, F_GETFL); ATF_REQUIRE(flags != -1); ATF_REQUIRE(fcntl(fd, F_SETFL, flags | O_NONBLOCK) != -1); ATF_REQUIRE(inet_pton(AF_INET, "127.0.0.1", &iaddr) == 1); addr.sin_port = htons(42); addr.sin_family = AF_INET; addr.sin_addr.s_addr = iaddr; errno = 0; ATF_REQUIRE_ERRNO(EINPROGRESS, connect(fd, (struct sockaddr *)&addr, sizeof(struct sockaddr_in)) == -1); errno = 0; if (fstat(fd, &st) != 0 || errno != 0) atf_tc_fail("fstat(2) failed for a EINPROGRESS socket"); (void)close(fd); } ATF_TC_WITH_CLEANUP(stat_symlink); ATF_TC_HEAD(stat_symlink, tc) { atf_tc_set_md_var(tc, "descr", "Test symbolic links with stat(2)"); } ATF_TC_BODY(stat_symlink, tc) { const char *pathlink = "pathlink"; struct stat sa, sb; int fd; (void)memset(&sa, 0, sizeof(struct stat)); (void)memset(&sb, 0, sizeof(struct stat)); fd = open(path, O_WRONLY | O_CREAT, 0600); ATF_REQUIRE(fd >= 0); ATF_REQUIRE(symlink(path, pathlink) == 0); ATF_REQUIRE(stat(pathlink, &sa) == 0); ATF_REQUIRE(lstat(pathlink, &sb) == 0); if (S_ISLNK(sa.st_mode) != 0) atf_tc_fail("stat(2) detected symbolic link"); if (S_ISLNK(sb.st_mode) == 0) atf_tc_fail("lstat(2) did not detect symbolic link"); if (sa.st_mode == sb.st_mode) atf_tc_fail("inconsistencies between stat(2) and lstat(2)"); (void)close(fd); ATF_REQUIRE(unlink(path) == 0); ATF_REQUIRE(unlink(pathlink) == 0); } ATF_TC_CLEANUP(stat_symlink, tc) { (void)unlink(path); } ATF_TP_ADD_TCS(tp) { ATF_TP_ADD_TC(tp, stat_chflags); ATF_TP_ADD_TC(tp, stat_dir); ATF_TP_ADD_TC(tp, stat_err); ATF_TP_ADD_TC(tp, stat_mtime); ATF_TP_ADD_TC(tp, stat_perm); ATF_TP_ADD_TC(tp, stat_size); ATF_TP_ADD_TC(tp, stat_socket); ATF_TP_ADD_TC(tp, stat_symlink); return atf_no_error(); } Index: projects/clang900-import/contrib/netbsd-tests =================================================================== --- projects/clang900-import/contrib/netbsd-tests (revision 352536) +++ projects/clang900-import/contrib/netbsd-tests (revision 352537) Property changes on: projects/clang900-import/contrib/netbsd-tests ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/netbsd-tests:r352308-352536 Index: projects/clang900-import/lib/Makefile =================================================================== --- projects/clang900-import/lib/Makefile (revision 352536) +++ projects/clang900-import/lib/Makefile (revision 352537) @@ -1,215 +1,215 @@ # @(#)Makefile 8.1 (Berkeley) 6/4/93 # $FreeBSD$ .include # The SUBDIR_BOOTSTRAP list is a small set of libraries which are used by many # of the other libraries. These are built first with a .WAIT between them # and the main list to avoid needing a SUBDIR_DEPEND line on every library # naming just these few items. SUBDIR_BOOTSTRAP= \ csu \ .WAIT \ libc \ libc_nonshared \ libcompiler_rt \ ${_libclang_rt} \ ${_libcplusplus} \ ${_libcxxrt} \ libelf \ msun # The main list; please keep these sorted alphabetically. 
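# The .WAIT entries only matter for parallel builds: with SUBDIR_PARALLEL set
# (see the end of this file), they force the bootstrap set to finish before
# the remaining subdirectories start building.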
SUBDIR= ${SUBDIR_BOOTSTRAP} \ .WAIT \ geom \ libalias \ libarchive \ libauditd \ libbegemot \ libblocksruntime \ libbsdstat \ libbsm \ libbz2 \ libcalendar \ libcam \ libcapsicum \ libcasper \ libcompat \ libcrypt \ libdevctl \ libdevinfo \ libdevstat \ libdl \ libdwarf \ libedit \ libelftc \ libevent \ libexecinfo \ libexpat \ libfetch \ libfigpar \ libgeom \ libifconfig \ libipsec \ libjail \ libkiconv \ libkvm \ liblzma \ libmemstat \ libmd \ libmt \ lib80211 \ libnetbsd \ libnv \ libopenbsd \ libopie \ libpam \ libpathconv \ libpcap \ libpjdlog \ ${_libproc} \ libprocstat \ libregex \ librpcsvc \ librss \ librt \ ${_librtld_db} \ libsbuf \ libsmb \ libsqlite3 \ libstdbuf \ libstdthreads \ libsysdecode \ libtacplus \ libthread_db \ libucl \ libufs \ libugidfw \ libulog \ libutil \ ${_libvgl} \ libwrap \ libxo \ liby \ libz \ libzstd \ ncurses # Inter-library dependencies. When the makefile for a library contains LDADD # libraries, those libraries should be listed as build order dependencies here. SUBDIR_DEPEND_geom= libufs -SUBDIR_DEPEND_libarchive= libz libbz2 libexpat liblzma libmd +SUBDIR_DEPEND_libarchive= libz libbz2 libexpat liblzma libmd libzstd SUBDIR_DEPEND_libauditdm= libbsm SUBDIR_DEPEND_libbsnmp= ${_libnetgraph} SUBDIR_DEPEND_libc++:= libcxxrt SUBDIR_DEPEND_libc= libcompiler_rt SUBDIR_DEPEND_libcam= libsbuf SUBDIR_DEPEND_libcasper= libnv SUBDIR_DEPEND_libdevstat= libkvm SUBDIR_DEPEND_libdpv= libfigpar ncurses libutil SUBDIR_DEPEND_libedit= ncurses SUBDIR_DEPEND_libgeom= libexpat libsbuf SUBDIR_DEPEND_librpcsec_gss= libgssapi SUBDIR_DEPEND_libmagic= libz SUBDIR_DEPEND_libmemstat= libkvm SUBDIR_DEPEND_libopie= libmd SUBDIR_DEPEND_libpam= libcrypt libopie ${_libradius} librpcsvc libtacplus libutil ${_libypclnt} ${_libcom_err} SUBDIR_DEPEND_libpjdlog= libutil SUBDIR_DEPEND_libprocstat= libkvm libutil SUBDIR_DEPEND_libradius= libmd SUBDIR_DEPEND_libsmb= libkiconv SUBDIR_DEPEND_libtacplus= libmd SUBDIR_DEPEND_libulog= libmd SUBDIR_DEPEND_libunbound= ${_libldns} SUBDIR_DEPEND_liblzma= ${_libthr} .if ${MK_OFED} != "no" SUBDIR_DEPEND_libpcap= ofed .endif # NB: keep these sorted by MK_* knobs SUBDIR.${MK_ATM}+= libngatm SUBDIR.${MK_BEARSSL}+= libbearssl libsecureboot SUBDIR.${MK_BLACKLIST}+=libblacklist SUBDIR.${MK_BLUETOOTH}+=libbluetooth libsdp SUBDIR.${MK_BSNMP}+= libbsnmp .if !defined(COMPAT_32BIT) && !defined(COMPAT_SOFTFP) SUBDIR.${MK_CLANG}+= clang .endif SUBDIR.${MK_CUSE}+= libcuse SUBDIR.${MK_CXX}+= libdevdctl SUBDIR.${MK_TOOLCHAIN}+=libpe SUBDIR.${MK_DIALOG}+= libdpv SUBDIR.${MK_FILE}+= libmagic SUBDIR.${MK_GPIO}+= libgpio SUBDIR.${MK_GSSAPI}+= libgssapi librpcsec_gss SUBDIR.${MK_ICONV}+= libiconv_modules SUBDIR.${MK_KERBEROS_SUPPORT}+= libcom_err SUBDIR.${MK_LDNS}+= libldns # The libraries under libclang_rt can only be built by clang, and only make # sense to build when clang is enabled at all. Furthermore, they can only be # built for certain architectures. 
.if ${MK_CLANG} != "no" && ${COMPILER_TYPE} == "clang" && \ (${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "arm" || ${MACHINE_CPUARCH} == "i386") _libclang_rt= libclang_rt .endif .if ${MK_LIBCPLUSPLUS} != "no" _libcxxrt= libcxxrt _libcplusplus= libc++ _libcplusplus+= libc++experimental .endif SUBDIR.${MK_EFI}+= libefivar SUBDIR.${MK_GOOGLETEST}+= googletest SUBDIR.${MK_LIBTHR}+= libthr SUBDIR.${MK_LLVM_LIBUNWIND}+= libgcc_eh SUBDIR.${MK_LLVM_LIBUNWIND}+= libgcc_s SUBDIR.${MK_NETGRAPH}+= libnetgraph SUBDIR.${MK_NIS}+= libypclnt .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" _libvgl= libvgl .endif .if ${MACHINE_CPUARCH} == "aarch64" SUBDIR.${MK_PMC}+= libopencsd .endif .if ${MACHINE_CPUARCH} == "amd64" SUBDIR.${MK_PMC}+= libipt SUBDIR.${MK_BHYVE}+= libvmmapi .endif .if ${MACHINE_CPUARCH} != "sparc64" _libproc= libproc _librtld_db= librtld_db .endif SUBDIR.${MK_OPENMP}+= libomp SUBDIR.${MK_OPENSSL}+= libmp SUBDIR.${MK_PMC}+= libpmc libpmcstat SUBDIR.${MK_RADIUS_SUPPORT}+= libradius SUBDIR.${MK_SENDMAIL}+= libmilter libsm libsmdb libsmutil SUBDIR.${MK_TELNET}+= libtelnet SUBDIR.${MK_TESTS_SUPPORT}+= atf SUBDIR.${MK_TESTS}+= tests SUBDIR.${MK_UNBOUND}+= libunbound SUBDIR.${MK_USB}+= libusbhid libusb SUBDIR.${MK_OFED}+= ofed SUBDIR.${MK_VERIEXEC}+= libveriexec SUBDIR.${MK_ZFS}+= libbe .if !make(install) SUBDIR_PARALLEL= .endif .include Index: projects/clang900-import/lib/libarchive/Makefile =================================================================== --- projects/clang900-import/lib/libarchive/Makefile (revision 352536) +++ projects/clang900-import/lib/libarchive/Makefile (revision 352537) @@ -1,426 +1,427 @@ # $FreeBSD$ .include PACKAGE=lib${LIB} _LIBARCHIVEDIR= ${SRCTOP}/contrib/libarchive LIB= archive -LIBADD= z bz2 lzma bsdxml -CFLAGS+= -DHAVE_BZLIB_H=1 -DHAVE_LIBLZMA=1 -DHAVE_LZMA_H=1 +LIBADD= z bz2 lzma bsdxml zstd +CFLAGS+= -DHAVE_BZLIB_H=1 -DHAVE_LIBLZMA=1 -DHAVE_LZMA_H=1 -DHAVE_ZSTD_H=1 -DHAVE_LIBZSTD=1 # FreeBSD SHLIB_MAJOR value is managed as part of the FreeBSD system. # It has no real relation to the libarchive version number. SHLIB_MAJOR= 7 CFLAGS+= -DPLATFORM_CONFIG_H=\"${.CURDIR}/config_freebsd.h\" CFLAGS+= -I${.OBJDIR} +CFLAGS+= -I${SRCTOP}/sys/contrib/zstd/lib .if ${MK_OPENSSL} != "no" CFLAGS+= -DWITH_OPENSSL LIBADD+= crypto .else LIBADD+= md .endif .if ${MK_ICONV} != "no" # TODO: This can be changed back to CFLAGS once iconv works correctly # with statically linked binaries. SHARED_CFLAGS+= -DHAVE_ICONV=1 -DHAVE_ICONV_H=1 -DICONV_CONST= .endif .if ${MACHINE_ARCH:Marm*} != "" || ${MACHINE_ARCH:Mmips*} != "" || \ ${MACHINE_ARCH:Msparc64*} != "" || ${MACHINE_ARCH:Mpowerpc*} != "" NO_WCAST_ALIGN= yes .if ${MACHINE_ARCH:M*64*} == "" CFLAGS+= -DPPMD_32BIT .endif .endif NO_WCAST_ALIGN.clang= .PATH: ${_LIBARCHIVEDIR}/libarchive # Headers to be installed in /usr/include INCS= archive.h archive_entry.h # Sources to be compiled. 
SRCS= archive_acl.c \ archive_blake2sp_ref.c \ archive_blake2s_ref.c \ archive_check_magic.c \ archive_cmdline.c \ archive_cryptor.c \ archive_disk_acl_freebsd.c \ archive_digest.c \ archive_entry.c \ archive_entry_copy_stat.c \ archive_entry_link_resolver.c \ archive_entry_sparse.c \ archive_entry_stat.c \ archive_entry_strmode.c \ archive_entry_xattr.c \ archive_getdate.c \ archive_hmac.c \ archive_match.c \ archive_options.c \ archive_pack_dev.c \ archive_pathmatch.c \ archive_ppmd7.c \ archive_ppmd8.c \ archive_random.c \ archive_rb.c \ archive_read.c \ archive_read_add_passphrase.c \ archive_read_append_filter.c \ archive_read_data_into_fd.c \ archive_read_disk_entry_from_file.c \ archive_read_disk_posix.c \ archive_read_disk_set_standard_lookup.c \ archive_read_extract.c \ archive_read_extract2.c \ archive_read_open_fd.c \ archive_read_open_file.c \ archive_read_open_filename.c \ archive_read_open_memory.c \ archive_read_set_format.c \ archive_read_set_options.c \ archive_read_support_filter_all.c \ archive_read_support_filter_bzip2.c \ archive_read_support_filter_compress.c \ archive_read_support_filter_gzip.c \ archive_read_support_filter_grzip.c \ archive_read_support_filter_lrzip.c \ archive_read_support_filter_lz4.c \ archive_read_support_filter_lzop.c \ archive_read_support_filter_none.c \ archive_read_support_filter_program.c \ archive_read_support_filter_rpm.c \ archive_read_support_filter_uu.c \ archive_read_support_filter_xz.c \ archive_read_support_filter_zstd.c \ archive_read_support_format_7zip.c \ archive_read_support_format_all.c \ archive_read_support_format_ar.c \ archive_read_support_format_by_code.c \ archive_read_support_format_cab.c \ archive_read_support_format_cpio.c \ archive_read_support_format_empty.c \ archive_read_support_format_iso9660.c \ archive_read_support_format_lha.c \ archive_read_support_format_mtree.c \ archive_read_support_format_rar.c \ archive_read_support_format_rar5.c \ archive_read_support_format_raw.c \ archive_read_support_format_tar.c \ archive_read_support_format_warc.c \ archive_read_support_format_xar.c \ archive_read_support_format_zip.c \ archive_string.c \ archive_string_sprintf.c \ archive_util.c \ archive_version_details.c \ archive_virtual.c \ archive_write.c \ archive_write_add_filter.c \ archive_write_disk_set_standard_lookup.c \ archive_write_disk_posix.c \ archive_write_open_fd.c \ archive_write_open_file.c \ archive_write_open_filename.c \ archive_write_open_memory.c \ archive_write_add_filter_b64encode.c \ archive_write_add_filter_by_name.c \ archive_write_add_filter_bzip2.c \ archive_write_add_filter_compress.c \ archive_write_add_filter_grzip.c \ archive_write_add_filter_gzip.c \ archive_write_add_filter_lrzip.c \ archive_write_add_filter_lz4.c \ archive_write_add_filter_lzop.c \ archive_write_add_filter_none.c \ archive_write_add_filter_program.c \ archive_write_add_filter_uuencode.c \ archive_write_add_filter_xz.c \ archive_write_add_filter_zstd.c \ archive_write_set_format.c \ archive_write_set_format_7zip.c \ archive_write_set_format_ar.c \ archive_write_set_format_by_name.c \ archive_write_set_format_cpio.c \ archive_write_set_format_cpio_newc.c \ archive_write_set_format_filter_by_ext.c \ archive_write_set_format_gnutar.c \ archive_write_set_format_iso9660.c \ archive_write_set_format_mtree.c \ archive_write_set_format_pax.c \ archive_write_set_format_raw.c \ archive_write_set_format_shar.c \ archive_write_set_format_ustar.c \ archive_write_set_format_v7tar.c \ archive_write_set_format_warc.c \ 
archive_write_set_format_xar.c \ archive_write_set_format_zip.c \ archive_write_set_passphrase.c \ archive_write_set_options.c \ filter_fork_posix.c # Man pages to be installed. MAN= archive_entry.3 \ archive_entry_acl.3 \ archive_entry_linkify.3 \ archive_entry_misc.3 \ archive_entry_paths.3 \ archive_entry_perms.3 \ archive_entry_stat.3 \ archive_entry_time.3 \ archive_read.3 \ archive_read_data.3 \ archive_read_disk.3 \ archive_read_extract.3 \ archive_read_filter.3 \ archive_read_format.3 \ archive_read_free.3 \ archive_read_header.3 \ archive_read_new.3 \ archive_read_open.3 \ archive_read_set_options.3 \ archive_util.3 \ archive_write.3 \ archive_write_blocksize.3 \ archive_write_data.3 \ archive_write_disk.3 \ archive_write_filter.3 \ archive_write_finish_entry.3 \ archive_write_format.3 \ archive_write_free.3 \ archive_write_header.3 \ archive_write_new.3 \ archive_write_open.3 \ archive_write_set_options.3 \ cpio.5 \ libarchive.3 \ libarchive_changes.3 \ libarchive_internals.3 \ libarchive-formats.5 \ tar.5 # Symlink the man pages under each function name. MLINKS+= archive_entry.3 archive_entry_clear.3 MLINKS+= archive_entry.3 archive_entry_clone.3 MLINKS+= archive_entry.3 archive_entry_free.3 MLINKS+= archive_entry.3 archive_entry_new.3 MLINKS+= archive_entry_acl.3 archive_entry_acl_add_entry.3 MLINKS+= archive_entry_acl.3 archive_entry_acl_add_entry_w.3 MLINKS+= archive_entry_acl.3 archive_entry_acl_clear.3 MLINKS+= archive_entry_acl.3 archive_entry_acl_count.3 MLINKS+= archive_entry_acl.3 archive_entry_acl_next.3 MLINKS+= archive_entry_acl.3 archive_entry_acl_next_w.3 MLINKS+= archive_entry_acl.3 archive_entry_acl_reset.3 MLINKS+= archive_entry_acl.3 archive_entry_acl_text_w.3 MLINKS+= archive_entry_linkify.3 archive_entry_linkresolver.3 MLINKS+= archive_entry_linkify.3 archive_entry_linkresolver_new.3 MLINKS+= archive_entry_linkify.3 archive_entry_linkresolver_set_strategy.3 MLINKS+= archive_entry_linkify.3 archive_entry_linkresolver_free.3 MLINKS+= archive_entry_paths.3 archive_entry_copy_hardlink.3 MLINKS+= archive_entry_paths.3 archive_entry_copy_hardlink_w.3 MLINKS+= archive_entry_paths.3 archive_entry_copy_link.3 MLINKS+= archive_entry_paths.3 archive_entry_copy_link_w.3 MLINKS+= archive_entry_paths.3 archive_entry_copy_pathname.3 MLINKS+= archive_entry_paths.3 archive_entry_copy_pathname_w.3 MLINKS+= archive_entry_paths.3 archive_entry_copy_sourcepath.3 MLINKS+= archive_entry_paths.3 archive_entry_copy_symlink.3 MLINKS+= archive_entry_paths.3 archive_entry_copy_symlink_w.3 MLINKS+= archive_entry_paths.3 archive_entry_hardlink.3 MLINKS+= archive_entry_paths.3 archive_entry_hardlink_w.3 MLINKS+= archive_entry_paths.3 archive_entry_pathname.3 MLINKS+= archive_entry_paths.3 archive_entry_pathname_w.3 MLINKS+= archive_entry_paths.3 archive_entry_set_hardlink.3 MLINKS+= archive_entry_paths.3 archive_entry_set_link.3 MLINKS+= archive_entry_paths.3 archive_entry_set_pathname.3 MLINKS+= archive_entry_paths.3 archive_entry_set_symlink.3 MLINKS+= archive_entry_paths.3 archive_entry_symlink.3 MLINKS+= archive_entry_paths.3 archive_entry_symlink_w.3 MLINKS+= archive_entry_paths.3 archive_entry_update_symlink_utf8.3 MLINKS+= archive_entry_paths.3 archive_entry_update_hardlink_utf8.3 MLINKS+= archive_entry_perms.3 archive_entry_copy_fflags_text.3 MLINKS+= archive_entry_perms.3 archive_entry_copy_fflags_text_w.3 MLINKS+= archive_entry_perms.3 archive_entry_copy_gname.3 MLINKS+= archive_entry_perms.3 archive_entry_copy_gname_w.3 MLINKS+= archive_entry_perms.3 archive_entry_copy_uname.3 
MLINKS+= archive_entry_perms.3 archive_entry_copy_uname_w.3 MLINKS+= archive_entry_perms.3 archive_entry_fflags.3 MLINKS+= archive_entry_perms.3 archive_entry_fflags_text.3 MLINKS+= archive_entry_perms.3 archive_entry_gid.3 MLINKS+= archive_entry_perms.3 archive_entry_gname.3 MLINKS+= archive_entry_perms.3 archive_entry_gname_w.3 MLINKS+= archive_entry_perms.3 archive_entry_set_fflags.3 MLINKS+= archive_entry_perms.3 archive_entry_set_gid.3 MLINKS+= archive_entry_perms.3 archive_entry_set_gname.3 MLINKS+= archive_entry_perms.3 archive_entry_perm.3 MLINKS+= archive_entry_perms.3 archive_entry_set_perm.3 MLINKS+= archive_entry_perms.3 archive_entry_set_uid.3 MLINKS+= archive_entry_perms.3 archive_entry_set_uname.3 MLINKS+= archive_entry_perms.3 archive_entry_strmode.3 MLINKS+= archive_entry_perms.3 archive_entry_uid.3 MLINKS+= archive_entry_perms.3 archive_entry_uname.3 MLINKS+= archive_entry_perms.3 archive_entry_uname_w.3 MLINKS+= archive_entry_perms.3 archive_entry_update_gname_utf8.3 MLINKS+= archive_entry_perms.3 archive_entry_update_uname_utf8.3 MLINKS+= archive_entry_stat.3 archive_entry_copy_stat.3 MLINKS+= archive_entry_stat.3 archive_entry_dev.3 MLINKS+= archive_entry_stat.3 archive_entry_dev_is_set.3 MLINKS+= archive_entry_stat.3 archive_entry_devmajor.3 MLINKS+= archive_entry_stat.3 archive_entry_devminor.3 MLINKS+= archive_entry_stat.3 archive_entry_filetype.3 MLINKS+= archive_entry_stat.3 archive_entry_ino.3 MLINKS+= archive_entry_stat.3 archive_entry_ino64.3 MLINKS+= archive_entry_stat.3 archive_entry_ino_is_set.3 MLINKS+= archive_entry_stat.3 archive_entry_mode.3 MLINKS+= archive_entry_stat.3 archive_entry_nlink.3 MLINKS+= archive_entry_stat.3 archive_entry_rdev.3 MLINKS+= archive_entry_stat.3 archive_entry_rdevmajor.3 MLINKS+= archive_entry_stat.3 archive_entry_rdevminor.3 MLINKS+= archive_entry_stat.3 archive_entry_set_dev.3 MLINKS+= archive_entry_stat.3 archive_entry_set_devmajor.3 MLINKS+= archive_entry_stat.3 archive_entry_set_devminor.3 MLINKS+= archive_entry_stat.3 archive_entry_set_filetype.3 MLINKS+= archive_entry_stat.3 archive_entry_set_ino.3 MLINKS+= archive_entry_stat.3 archive_entry_set_ino64.3 MLINKS+= archive_entry_stat.3 archive_entry_set_mode.3 MLINKS+= archive_entry_stat.3 archive_entry_set_nlink.3 MLINKS+= archive_entry_stat.3 archive_entry_set_rdev.3 MLINKS+= archive_entry_stat.3 archive_entry_set_rdevmajor.3 MLINKS+= archive_entry_stat.3 archive_entry_set_rdevminor.3 MLINKS+= archive_entry_stat.3 archive_entry_set_size.3 MLINKS+= archive_entry_stat.3 archive_entry_size.3 MLINKS+= archive_entry_stat.3 archive_entry_size_is_set.3 MLINKS+= archive_entry_stat.3 archive_entry_unset_size.3 MLINKS+= archive_entry_time.3 archive_entry_atime.3 MLINKS+= archive_entry_time.3 archive_entry_atime_is_set.3 MLINKS+= archive_entry_time.3 archive_entry_atime_nsec.3 MLINKS+= archive_entry_time.3 archive_entry_birthtime.3 MLINKS+= archive_entry_time.3 archive_entry_birthtime_is_set.3 MLINKS+= archive_entry_time.3 archive_entry_birthtime_nsec.3 MLINKS+= archive_entry_time.3 archive_entry_ctime.3 MLINKS+= archive_entry_time.3 archive_entry_ctime_is_set.3 MLINKS+= archive_entry_time.3 archive_entry_ctime_nsec.3 MLINKS+= archive_entry_time.3 archive_entry_mtime.3 MLINKS+= archive_entry_time.3 archive_entry_mtime_is_set.3 MLINKS+= archive_entry_time.3 archive_entry_mtime_nsec.3 MLINKS+= archive_entry_time.3 archive_entry_set_atime.3 MLINKS+= archive_entry_time.3 archive_entry_set_birthtime.3 MLINKS+= archive_entry_time.3 archive_entry_set_ctime.3 MLINKS+= archive_entry_time.3 
archive_entry_set_mtime.3 MLINKS+= archive_entry_time.3 archive_entry_unset_atime.3 MLINKS+= archive_entry_time.3 archive_entry_unset_birthtime.3 MLINKS+= archive_entry_time.3 archive_entry_unset_ctime.3 MLINKS+= archive_entry_time.3 archive_entry_unset_mtime.3 MLINKS+= archive_read_data.3 archive_read_data_block.3 MLINKS+= archive_read_data.3 archive_read_data_into_fd.3 MLINKS+= archive_read_data.3 archive_read_data_skip.3 MLINKS+= archive_read_header.3 archive_read_next_header.3 MLINKS+= archive_read_header.3 archive_read_next_header2.3 MLINKS+= archive_read_extract.3 archive_read_extract2.3 MLINKS+= archive_read_extract.3 archive_read_extract_set_progress_callback.3 MLINKS+= archive_read_extract.3 archive_read_extract_set_skip_file.3 MLINKS+= archive_read_open.3 archive_read_open2.3 MLINKS+= archive_read_open.3 archive_read_open_FILE.3 MLINKS+= archive_read_open.3 archive_read_open_fd.3 MLINKS+= archive_read_open.3 archive_read_open_file.3 MLINKS+= archive_read_open.3 archive_read_open_filename.3 MLINKS+= archive_read_open.3 archive_read_open_memory.3 MLINKS+= archive_read_free.3 archive_read_close.3 MLINKS+= archive_read_free.3 archive_read_finish.3 MLINKS+= archive_read_filter.3 archive_read_support_filter_all.3 MLINKS+= archive_read_filter.3 archive_read_support_filter_bzip2.3 MLINKS+= archive_read_filter.3 archive_read_support_filter_compress.3 MLINKS+= archive_read_filter.3 archive_read_support_filter_gzip.3 MLINKS+= archive_read_filter.3 archive_read_support_filter_lzma.3 MLINKS+= archive_read_filter.3 archive_read_support_filter_none.3 MLINKS+= archive_read_filter.3 archive_read_support_filter_xz.3 MLINKS+= archive_read_filter.3 archive_read_support_filter_program.3 MLINKS+= archive_read_filter.3 archive_read_support_filter_program_signature.3 MLINKS+= archive_read_format.3 archive_read_support_format_7zip.3 MLINKS+= archive_read_format.3 archive_read_support_format_all.3 MLINKS+= archive_read_format.3 archive_read_support_format_ar.3 MLINKS+= archive_read_format.3 archive_read_support_format_by_code.3 MLINKS+= archive_read_format.3 archive_read_support_format_cab.3 MLINKS+= archive_read_format.3 archive_read_support_format_cpio.3 MLINKS+= archive_read_format.3 archive_read_support_format_empty.3 MLINKS+= archive_read_format.3 archive_read_support_format_iso9660.3 MLINKS+= archive_read_format.3 archive_read_support_format_lha.3 MLINKS+= archive_read_format.3 archive_read_support_format_mtree.3 MLINKS+= archive_read_format.3 archive_read_support_format_rar.3 MLINKS+= archive_read_format.3 archive_read_support_format_raw.3 MLINKS+= archive_read_format.3 archive_read_support_format_tar.3 MLINKS+= archive_read_format.3 archive_read_support_format_xar.3 MLINKS+= archive_read_format.3 archive_read_support_format_zip.3 MLINKS+= archive_read_disk.3 archive_read_disk_entry_from_file.3 MLINKS+= archive_read_disk.3 archive_read_disk_gname.3 MLINKS+= archive_read_disk.3 archive_read_disk_new.3 MLINKS+= archive_read_disk.3 archive_read_disk_set_gname_lookup.3 MLINKS+= archive_read_disk.3 archive_read_disk_set_standard_lookup.3 MLINKS+= archive_read_disk.3 archive_read_disk_set_symlink_hybrid.3 MLINKS+= archive_read_disk.3 archive_read_disk_set_symlink_logical.3 MLINKS+= archive_read_disk.3 archive_read_disk_set_symlink_physical.3 MLINKS+= archive_read_disk.3 archive_read_disk_set_uname_lookup.3 MLINKS+= archive_read_disk.3 archive_read_disk_uname.3 MLINKS+= archive_read_set_options.3 archive_read_set_filter_option.3 MLINKS+= archive_read_set_options.3 archive_read_set_format_option.3 MLINKS+= 
archive_read_set_options.3 archive_read_set_option.3 MLINKS+= archive_util.3 archive_clear_error.3 MLINKS+= archive_util.3 archive_compression.3 MLINKS+= archive_util.3 archive_compression_name.3 MLINKS+= archive_util.3 archive_copy_error.3 MLINKS+= archive_util.3 archive_errno.3 MLINKS+= archive_util.3 archive_error_string.3 MLINKS+= archive_util.3 archive_file_count.3 MLINKS+= archive_util.3 archive_filter_code.3 MLINKS+= archive_util.3 archive_filter_count.3 MLINKS+= archive_util.3 archive_filter_name.3 MLINKS+= archive_util.3 archive_format.3 MLINKS+= archive_util.3 archive_format_name.3 MLINKS+= archive_util.3 archive_position.3 MLINKS+= archive_util.3 archive_set_error.3 MLINKS+= archive_write_blocksize.3 archive_write_get_bytes_in_last_block.3 MLINKS+= archive_write_blocksize.3 archive_write_get_bytes_per_block.3 MLINKS+= archive_write_blocksize.3 archive_write_set_bytes_in_last_block.3 MLINKS+= archive_write_blocksize.3 archive_write_set_bytes_per_block.3 MLINKS+= archive_write_disk.3 archive_write_data_block.3 MLINKS+= archive_write_disk.3 archive_write_disk_new.3 MLINKS+= archive_write_disk.3 archive_write_disk_set_group_lookup.3 MLINKS+= archive_write_disk.3 archive_write_disk_set_options.3 MLINKS+= archive_write_disk.3 archive_write_disk_set_skip_file.3 MLINKS+= archive_write_disk.3 archive_write_disk_set_standard_lookup.3 MLINKS+= archive_write_disk.3 archive_write_disk_set_user_lookup.3 MLINKS+= archive_write_filter.3 archive_write_add_filter_bzip2.3 MLINKS+= archive_write_filter.3 archive_write_add_filter_compress.3 MLINKS+= archive_write_filter.3 archive_write_add_filter_gzip.3 MLINKS+= archive_write_filter.3 archive_write_add_filter_lzip.3 MLINKS+= archive_write_filter.3 archive_write_add_filter_lzma.3 MLINKS+= archive_write_filter.3 archive_write_add_filter_none.3 MLINKS+= archive_write_filter.3 archive_write_add_filter_program.3 MLINKS+= archive_write_filter.3 archive_write_add_filter_xz.3 MLINKS+= archive_write_format.3 archive_write_set_format_cpio.3 MLINKS+= archive_write_format.3 archive_write_set_format_pax.3 MLINKS+= archive_write_format.3 archive_write_set_format_pax_restricted.3 MLINKS+= archive_write_format.3 archive_write_set_format_shar.3 MLINKS+= archive_write_format.3 archive_write_set_format_shar_dump.3 MLINKS+= archive_write_format.3 archive_write_set_format_ustar.3 MLINKS+= archive_write_free.3 archive_write_close.3 MLINKS+= archive_write_free.3 archive_write_fail.3 MLINKS+= archive_write_free.3 archive_write_finish.3 MLINKS+= archive_write_open.3 archive_write_open_FILE.3 MLINKS+= archive_write_open.3 archive_write_open_fd.3 MLINKS+= archive_write_open.3 archive_write_open_file.3 MLINKS+= archive_write_open.3 archive_write_open_filename.3 MLINKS+= archive_write_open.3 archive_write_open_memory.3 MLINKS+= archive_write_set_options.3 archive_write_set_filter_option.3 MLINKS+= archive_write_set_options.3 archive_write_set_format_option.3 MLINKS+= archive_write_set_options.3 archive_write_set_option.3 MLINKS+= libarchive.3 archive.3 HAS_TESTS= SUBDIR.${MK_TESTS}+= tests .include Index: projects/clang900-import/lib/libarchive/tests/Makefile =================================================================== --- projects/clang900-import/lib/libarchive/tests/Makefile (revision 352536) +++ projects/clang900-import/lib/libarchive/tests/Makefile (revision 352537) @@ -1,628 +1,631 @@ # $FreeBSD$ PACKAGE= tests _LIBARCHIVEDIR= ${SRCTOP}/contrib/libarchive ATF_TESTS_SH+= functional_test TEST_METADATA.functional_test+= timeout="600" BINDIR= ${TESTSDIR} PROGS+= 
libarchive_test CFLAGS+= -I${.CURDIR} -I${.CURDIR:H} -I${.OBJDIR} CFLAGS+= -I${_LIBARCHIVEDIR}/libarchive -I${_LIBARCHIVEDIR}/libarchive/test CFLAGS+= -I${_LIBARCHIVEDIR}/test_utils CFLAGS+= -DHAVE_LIBLZMA=1 -DHAVE_LZMA_H=1 # Uncomment to link against dmalloc #LDADD+= -L/usr/local/lib -ldmalloc #CFLAGS+= -I/usr/local/include -DUSE_DMALLOC .PATH: ${_LIBARCHIVEDIR}/libarchive/test TESTS_SRCS= \ test_acl_nfs4.c \ test_acl_pax.c \ test_acl_platform_nfs4.c \ test_acl_platform_posix1e.c \ test_acl_posix1e.c \ test_acl_text.c \ test_archive_api_feature.c \ test_archive_clear_error.c \ test_archive_cmdline.c \ test_archive_digest.c \ test_archive_getdate.c \ test_archive_match_time.c \ test_archive_match_owner.c \ test_archive_match_path.c \ test_archive_pathmatch.c \ test_archive_read_add_passphrase.c \ test_archive_read_close_twice.c \ test_archive_read_close_twice_open_fd.c \ test_archive_read_close_twice_open_filename.c \ test_archive_read_multiple_data_objects.c \ test_archive_read_next_header_empty.c \ test_archive_read_next_header_raw.c \ test_archive_read_open2.c \ test_archive_read_set_filter_option.c \ test_archive_read_set_format_option.c \ test_archive_read_set_option.c \ test_archive_read_set_options.c \ test_archive_read_support.c \ test_archive_set_error.c \ test_archive_string.c \ test_archive_string_conversion.c \ test_archive_write_add_filter_by_name.c \ test_archive_write_set_filter_option.c \ test_archive_write_set_format_by_name.c \ test_archive_write_set_format_filter_by_ext.c \ test_archive_write_set_format_option.c \ test_archive_write_set_option.c \ test_archive_write_set_options.c \ test_archive_write_set_passphrase.c \ test_bad_fd.c \ test_compat_bzip2.c \ test_compat_cpio.c \ test_compat_gtar.c \ test_compat_gzip.c \ test_compat_lz4.c \ test_compat_lzip.c \ test_compat_lzma.c \ test_compat_lzop.c \ test_compat_mac.c \ test_compat_perl_archive_tar.c \ test_compat_plexus_archiver_tar.c \ test_compat_solaris_tar_acl.c \ test_compat_solaris_pax_sparse.c \ test_compat_star_acl.c \ test_compat_tar_hardlink.c \ test_compat_uudecode.c \ test_compat_uudecode_large.c \ test_compat_xz.c \ test_compat_zip.c \ test_compat_zstd.c \ test_empty_write.c \ test_entry.c \ test_entry_strmode.c \ test_extattr_freebsd.c \ test_filter_count.c \ test_fuzz.c \ test_gnutar_filename_encoding.c \ test_link_resolver.c \ test_open_fd.c \ test_open_failure.c \ test_open_file.c \ test_open_filename.c \ test_pax_filename_encoding.c \ test_read_data_large.c \ test_read_disk.c \ test_read_disk_directory_traversals.c \ test_read_disk_entry_from_file.c \ test_read_extract.c \ test_read_file_nonexistent.c \ test_read_filter_compress.c \ test_read_filter_grzip.c \ test_read_filter_lrzip.c \ test_read_filter_lzop.c \ test_read_filter_lzop_multiple_parts.c \ test_read_filter_program.c \ test_read_filter_program_signature.c \ test_read_filter_uudecode.c \ test_read_format_7zip.c \ test_read_format_7zip_encryption_data.c \ test_read_format_7zip_encryption_header.c \ test_read_format_7zip_encryption_partially.c \ test_read_format_7zip_malformed.c \ test_read_format_ar.c \ test_read_format_cab.c \ test_read_format_cab_filename.c \ test_read_format_cpio_afio.c \ test_read_format_cpio_bin.c \ test_read_format_cpio_bin_Z.c \ test_read_format_cpio_bin_be.c \ test_read_format_cpio_bin_bz2.c \ test_read_format_cpio_bin_gz.c \ test_read_format_cpio_bin_le.c \ test_read_format_cpio_bin_lzip.c \ test_read_format_cpio_bin_lzma.c \ test_read_format_cpio_bin_xz.c \ test_read_format_cpio_filename.c \ 
test_read_format_cpio_odc.c \ test_read_format_cpio_svr4_gzip.c \ test_read_format_cpio_svr4c_Z.c \ test_read_format_cpio_svr4_bzip2_rpm.c \ test_read_format_cpio_svr4_gzip_rpm.c \ test_read_format_empty.c \ test_read_format_gtar_filename.c \ test_read_format_gtar_gz.c \ test_read_format_gtar_lzma.c \ test_read_format_gtar_sparse.c \ test_read_format_gtar_sparse_skip_entry.c \ test_read_format_iso_Z.c \ test_read_format_iso_multi_extent.c \ test_read_format_iso_xorriso.c \ test_read_format_isorr_rr_moved.c \ test_read_format_isojoliet_bz2.c \ test_read_format_isojoliet_long.c \ test_read_format_isojoliet_rr.c \ test_read_format_isojoliet_versioned.c \ test_read_format_isorr_bz2.c \ test_read_format_isorr_ce.c \ test_read_format_isorr_new_bz2.c \ test_read_format_isozisofs_bz2.c \ test_read_format_lha.c \ test_read_format_lha_bugfix_0.c \ test_read_format_lha_filename.c \ test_read_format_mtree.c \ test_read_format_mtree_crash747.c \ test_read_format_pax_bz2.c \ test_read_format_rar.c \ test_read_format_rar5.c \ test_read_format_rar_encryption_data.c \ test_read_format_rar_encryption_header.c \ test_read_format_rar_encryption_partially.c \ test_read_format_rar_invalid1.c \ test_read_format_raw.c \ test_read_format_tar.c \ test_read_format_tar_concatenated.c \ test_read_format_tar_empty_filename.c \ test_read_format_tar_empty_pax.c \ test_read_format_tar_empty_with_gnulabel.c \ test_read_format_tar_filename.c \ test_read_format_tbz.c \ test_read_format_tgz.c \ test_read_format_tlz.c \ test_read_format_txz.c \ test_read_format_tz.c \ test_read_format_ustar_filename.c \ test_read_format_warc.c \ test_read_format_xar.c \ test_read_format_zip.c \ test_read_format_zip_7075_utf8_paths.c \ test_read_format_zip_comment_stored.c \ test_read_format_zip_encryption_data.c \ test_read_format_zip_encryption_header.c \ test_read_format_zip_encryption_partially.c \ test_read_format_zip_extra_padding.c \ test_read_format_zip_filename.c \ test_read_format_zip_high_compression.c \ test_read_format_zip_jar.c \ test_read_format_zip_mac_metadata.c \ test_read_format_zip_malformed.c \ test_read_format_zip_msdos.c \ test_read_format_zip_nested.c \ test_read_format_zip_nofiletype.c \ test_read_format_zip_padded.c \ test_read_format_zip_sfx.c \ test_read_format_zip_traditional_encryption_data.c \ test_read_format_zip_winzip_aes.c \ test_read_format_zip_winzip_aes_large.c \ test_read_format_zip_with_invalid_traditional_eocd.c \ test_read_format_zip_zip64.c \ test_read_large.c \ test_read_pax_schily_xattr.c \ test_read_pax_truncated.c \ test_read_position.c \ test_read_set_format.c \ test_read_too_many_filters.c \ test_read_truncated.c \ test_read_truncated_filter.c \ test_sparse_basic.c \ test_tar_filenames.c \ test_tar_large.c \ test_warn_missing_hardlink_target.c \ test_ustar_filenames.c \ test_ustar_filename_encoding.c \ test_write_disk.c \ test_write_disk_appledouble.c \ test_write_disk_failures.c \ test_write_disk_hardlink.c \ test_write_disk_hfs_compression.c \ test_write_disk_lookup.c \ test_write_disk_mac_metadata.c \ test_write_disk_no_hfs_compression.c \ test_write_disk_perms.c \ test_write_disk_secure.c \ test_write_disk_secure744.c \ test_write_disk_secure745.c \ test_write_disk_secure746.c \ test_write_disk_sparse.c \ test_write_disk_symlink.c \ test_write_disk_times.c \ test_write_filter_b64encode.c \ test_write_filter_bzip2.c \ test_write_filter_compress.c \ test_write_filter_gzip.c \ test_write_filter_gzip_timestamp.c \ test_write_filter_lrzip.c \ test_write_filter_lz4.c \ test_write_filter_lzip.c \ 
test_write_filter_lzma.c \ test_write_filter_lzop.c \ test_write_filter_program.c \ test_write_filter_uuencode.c \ test_write_filter_xz.c \ test_write_filter_zstd.c \ test_write_format_7zip.c \ test_write_format_7zip_empty.c \ test_write_format_7zip_large.c \ test_write_format_ar.c \ test_write_format_cpio.c \ test_write_format_cpio_empty.c \ test_write_format_cpio_newc.c \ test_write_format_cpio_odc.c \ test_write_format_gnutar.c \ test_write_format_gnutar_filenames.c \ test_write_format_iso9660.c \ test_write_format_iso9660_boot.c \ test_write_format_iso9660_empty.c \ test_write_format_iso9660_filename.c \ test_write_format_iso9660_zisofs.c \ test_write_format_mtree.c \ test_write_format_mtree_absolute_path.c \ test_write_format_mtree_classic.c \ test_write_format_mtree_classic_indent.c \ test_write_format_mtree_fflags.c \ test_write_format_mtree_no_separator.c \ test_write_format_mtree_quoted_filename.c \ test_write_format_pax.c \ test_write_format_raw.c \ test_write_format_raw_b64.c \ test_write_format_shar_empty.c \ test_write_format_tar.c \ test_write_format_tar_empty.c \ test_write_format_tar_sparse.c \ test_write_format_tar_ustar.c \ test_write_format_tar_v7tar.c \ test_write_format_warc.c \ test_write_format_warc_empty.c \ test_write_format_xar.c \ test_write_format_xar_empty.c \ test_write_format_zip.c \ test_write_format_zip_compression_store.c \ test_write_format_zip_empty.c \ test_write_format_zip_empty_zip64.c \ test_write_format_zip_file.c \ test_write_format_zip_file_zip64.c \ test_write_format_zip_large.c \ test_write_format_zip_zip64.c \ test_write_open_memory.c \ test_write_read_format_zip.c \ test_xattr_platform.c \ test_zip_filename_encoding.c # Deterministic failures: # Crashes with SIGBUS BROKEN_TESTS+= test_archive_rmd160 # Fails with `libarchive/test/test_archive_crypto.c:121: md != actualmd` BROKEN_TESTS+= test_archive_sha384 # Fails with `test_read_disk_directory_traversals.c:1094: File at has atime 886622, 1443306049 seconds ago` BROKEN_TESTS+= test_read_disk_directory_traversals # Non-deterministic failures: # (Times out?) [and] crashes BROKEN_TESTS+= test_fuzz_rar +# https://bugs.freebsd.org/240683 +BROKEN_TESTS+= test_write_filter_zstd + # Build the test program. 
SRCS.libarchive_test= \ ${TESTS_SRCS} \ read_open_memory.c \ list.h LIBADD.libarchive_test= archive .PATH: ${_LIBARCHIVEDIR}/test_utils SRCS.libarchive_test+= test_main.c \ test_utils.c # list.h is just a list of all tests, as indicated by DEFINE_TEST macro lines list.h: ${TESTS_SRCS} Makefile @(cd ${_LIBARCHIVEDIR}/libarchive/test && \ grep -E -h ^DEFINE_TEST ${.ALLSRC:N*Makefile} | \ egrep -v '${BROKEN_TESTS:tW:C/ /|/g}') > ${.TARGET}.tmp @mv ${.TARGET}.tmp ${.TARGET} CLEANTESTS+= list.h list.h.tmp ${PACKAGE}FILES+= README ${PACKAGE}FILES+= test_acl_pax_posix1e.tar.uu ${PACKAGE}FILES+= test_acl_pax_nfs4.tar.uu ${PACKAGE}FILES+= test_archive_string_conversion.txt.Z.uu ${PACKAGE}FILES+= test_compat_bzip2_1.tbz.uu ${PACKAGE}FILES+= test_compat_bzip2_2.tbz.uu ${PACKAGE}FILES+= test_compat_cpio_1.cpio.uu ${PACKAGE}FILES+= test_compat_gtar_1.tar.uu ${PACKAGE}FILES+= test_compat_gtar_2.tar.uu ${PACKAGE}FILES+= test_compat_gzip_1.tgz.uu ${PACKAGE}FILES+= test_compat_gzip_2.tgz.uu ${PACKAGE}FILES+= test_compat_lz4_1.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_2.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_3.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B4.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B4BD.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B4BDBX.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B5.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B5BD.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B6.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B6BD.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B7.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lz4_B7BD.tar.lz4.uu ${PACKAGE}FILES+= test_compat_lzip_1.tlz.uu ${PACKAGE}FILES+= test_compat_lzip_2.tlz.uu ${PACKAGE}FILES+= test_compat_lzma_1.tlz.uu ${PACKAGE}FILES+= test_compat_lzma_2.tlz.uu ${PACKAGE}FILES+= test_compat_lzma_3.tlz.uu ${PACKAGE}FILES+= test_compat_lzop_1.tar.lzo.uu ${PACKAGE}FILES+= test_compat_lzop_2.tar.lzo.uu ${PACKAGE}FILES+= test_compat_lzop_3.tar.lzo.uu ${PACKAGE}FILES+= test_compat_mac-1.tar.Z.uu ${PACKAGE}FILES+= test_compat_mac-2.tar.Z.uu ${PACKAGE}FILES+= test_compat_perl_archive_tar.tar.uu ${PACKAGE}FILES+= test_compat_plexus_archiver_tar.tar.uu ${PACKAGE}FILES+= test_compat_solaris_pax_sparse_1.pax.Z.uu ${PACKAGE}FILES+= test_compat_solaris_pax_sparse_2.pax.Z.uu ${PACKAGE}FILES+= test_compat_solaris_tar_acl.tar.uu ${PACKAGE}FILES+= test_compat_star_acl_nfs4.tar.uu ${PACKAGE}FILES+= test_compat_star_acl_posix1e.tar.uu ${PACKAGE}FILES+= test_compat_tar_hardlink_1.tar.uu ${PACKAGE}FILES+= test_compat_uudecode_large.tar.Z.uu ${PACKAGE}FILES+= test_compat_xz_1.txz.uu ${PACKAGE}FILES+= test_compat_zip_1.zip.uu ${PACKAGE}FILES+= test_compat_zip_2.zip.uu ${PACKAGE}FILES+= test_compat_zip_3.zip.uu ${PACKAGE}FILES+= test_compat_zip_4.zip.uu ${PACKAGE}FILES+= test_compat_zip_5.zip.uu ${PACKAGE}FILES+= test_compat_zip_6.zip.uu ${PACKAGE}FILES+= test_compat_zip_7.xps.uu ${PACKAGE}FILES+= test_compat_zip_8.zip.uu ${PACKAGE}FILES+= test_compat_zstd_1.tar.zst.uu ${PACKAGE}FILES+= test_fuzz.cab.uu ${PACKAGE}FILES+= test_fuzz.lzh.uu ${PACKAGE}FILES+= test_fuzz_1.iso.Z.uu ${PACKAGE}FILES+= test_pax_filename_encoding.tar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part1.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part2.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part3.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part4.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part5.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_multiple_files.part6.rar.uu ${PACKAGE}FILES+= 
test_rar_multivolume_single_file.part1.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_single_file.part2.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_single_file.part3.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part01.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part02.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part03.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part04.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part05.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part06.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part07.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part08.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part09.rar.uu ${PACKAGE}FILES+= test_rar_multivolume_uncompressed_files.part10.rar.uu ${PACKAGE}FILES+= test_read_filter_grzip.tar.grz.uu ${PACKAGE}FILES+= test_read_filter_lrzip.tar.lrz.uu ${PACKAGE}FILES+= test_read_filter_lzop.tar.lzo.uu ${PACKAGE}FILES+= test_read_filter_lzop_multiple_parts.tar.lzo.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_bzip2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_copy_1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_copy_2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_copy_lzma.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_deflate.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_lzma1_1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_lzma1_2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_lzma2_1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj2_lzma2_2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj_bzip2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj_copy.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj_deflate.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj_lzma1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bcj_lzma2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_bzip2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_copy.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_copy_2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_deflate.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_delta_lzma1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_delta_lzma2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_empty_archive.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_empty_file.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_encryption.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_encryption_header.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_encryption_partially.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_lzma1.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_lzma1_2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_lzma1_lzma2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_lzma2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_malformed.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_malformed2.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_ppmd.7z.uu ${PACKAGE}FILES+= test_read_format_7zip_symbolic_name.7z.uu ${PACKAGE}FILES+= test_read_format_ar.ar.uu ${PACKAGE}FILES+= test_read_format_cab_1.cab.uu ${PACKAGE}FILES+= test_read_format_cab_2.cab.uu ${PACKAGE}FILES+= test_read_format_cab_3.cab.uu ${PACKAGE}FILES+= test_read_format_cab_filename_cp932.cab.uu ${PACKAGE}FILES+= test_read_format_cpio_bin_be.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_bin_le.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_filename_cp866.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_filename_eucjp.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_filename_koi8r.cpio.uu 
${PACKAGE}FILES+= test_read_format_cpio_filename_utf8_jp.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_filename_utf8_ru.cpio.uu ${PACKAGE}FILES+= test_read_format_cpio_svr4_bzip2_rpm.rpm.uu ${PACKAGE}FILES+= test_read_format_cpio_svr4_gzip_rpm.rpm.uu ${PACKAGE}FILES+= test_read_format_gtar_filename_cp866.tar.Z.uu ${PACKAGE}FILES+= test_read_format_gtar_filename_eucjp.tar.Z.uu ${PACKAGE}FILES+= test_read_format_gtar_filename_koi8r.tar.Z.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_13.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_17.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_17_posix00.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_17_posix01.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_17_posix10.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_1_17_posix10_modified.tar.uu ${PACKAGE}FILES+= test_read_format_gtar_sparse_skip_entry.tar.Z.uu ${PACKAGE}FILES+= test_read_format_iso.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_2.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_joliet.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_joliet_by_nero.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_joliet_long.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_joliet_rockridge.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_multi_extent.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_rockridge.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_rockridge_ce.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_rockridge_new.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_rockridge_rr_moved.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_xorriso.iso.Z.uu ${PACKAGE}FILES+= test_read_format_iso_zisofs.iso.Z.uu ${PACKAGE}FILES+= test_read_format_lha_bugfix_0.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_filename_cp932.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_header0.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_header1.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_header2.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_header3.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_lh0.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_lh6.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_lh7.lzh.uu ${PACKAGE}FILES+= test_read_format_lha_withjunk.lzh.uu ${PACKAGE}FILES+= test_read_format_mtree.mtree.uu ${PACKAGE}FILES+= test_read_format_mtree_crash747.mtree.bz2.uu ${PACKAGE}FILES+= test_read_format_mtree_nomagic.mtree.uu ${PACKAGE}FILES+= test_read_format_mtree_nomagic2.mtree.uu ${PACKAGE}FILES+= test_read_format_mtree_nomagic3.mtree.uu ${PACKAGE}FILES+= test_read_format_mtree_noprint.mtree.uu ${PACKAGE}FILES+= test_read_format_rar.rar.uu ${PACKAGE}FILES+= test_read_format_rar_binary_data.rar.uu ${PACKAGE}FILES+= test_read_format_rar_compress_best.rar.uu ${PACKAGE}FILES+= test_read_format_rar_compress_normal.rar.uu ${PACKAGE}FILES+= test_read_format_rar_encryption_data.rar.uu ${PACKAGE}FILES+= test_read_format_rar_encryption_header.rar.uu ${PACKAGE}FILES+= test_read_format_rar_encryption_partially.rar.uu ${PACKAGE}FILES+= test_read_format_rar_invalid1.rar.uu ${PACKAGE}FILES+= test_read_format_rar_multi_lzss_blocks.rar.uu ${PACKAGE}FILES+= test_read_format_rar_multivolume.part0001.rar.uu ${PACKAGE}FILES+= test_read_format_rar_multivolume.part0002.rar.uu ${PACKAGE}FILES+= test_read_format_rar_multivolume.part0003.rar.uu ${PACKAGE}FILES+= test_read_format_rar_multivolume.part0004.rar.uu ${PACKAGE}FILES+= test_read_format_rar_noeof.rar.uu ${PACKAGE}FILES+= test_read_format_rar_ppmd_lzss_conversion.rar.uu ${PACKAGE}FILES+= test_read_format_rar_ppmd_use_after_free.rar.uu 
${PACKAGE}FILES+= test_read_format_rar_ppmd_use_after_free2.rar.uu ${PACKAGE}FILES+= test_read_format_rar_sfx.exe.uu ${PACKAGE}FILES+= test_read_format_rar_subblock.rar.uu ${PACKAGE}FILES+= test_read_format_rar_unicode.rar.uu ${PACKAGE}FILES+= test_read_format_rar_windows.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_arm.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_arm_filter_on_window_boundary.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_blake2.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_compressed.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_different_window_size.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_distance_overflow.rar.uu -${PACKAGE}FILES+= test_read_format_rar5_extra_field_version.rar.uu +${PACKAGE}FILES+= test_read_format_rar5_extra_field_version.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_fileattr.rar.uu -${PACKAGE}FILES+= test_read_format_rar5_hardlink.rar.uu -${PACKAGE}FILES+= test_read_format_rar5_invalid_dict_reference.rar.uu -${PACKAGE}FILES+= test_read_format_rar5_leftshift1.rar.uu -${PACKAGE}FILES+= test_read_format_rar5_leftshift2.rar.uu +${PACKAGE}FILES+= test_read_format_rar5_hardlink.rar.uu +${PACKAGE}FILES+= test_read_format_rar5_invalid_dict_reference.rar.uu +${PACKAGE}FILES+= test_read_format_rar5_leftshift1.rar.uu +${PACKAGE}FILES+= test_read_format_rar5_leftshift2.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part01.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part02.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part03.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part04.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part05.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part06.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part07.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive.part08.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive_solid.part01.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive_solid.part02.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive_solid.part03.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiarchive_solid.part04.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiple_files.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_multiple_files_solid.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_nonempty_dir_stream.rar.uu -${PACKAGE}FILES+= test_read_format_rar5_owner.rar.uu -${PACKAGE}FILES+= test_read_format_rar5_readtables_overflow.rar.uu +${PACKAGE}FILES+= test_read_format_rar5_owner.rar.uu +${PACKAGE}FILES+= test_read_format_rar5_readtables_overflow.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_solid.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_stored.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_stored_manyfiles.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_symlink.rar.uu -${PACKAGE}FILES+= test_read_format_rar5_truncated_huff.rar.uu +${PACKAGE}FILES+= test_read_format_rar5_truncated_huff.rar.uu ${PACKAGE}FILES+= test_read_format_rar5_win32.rar.uu ${PACKAGE}FILES+= test_read_format_raw.bufr.uu ${PACKAGE}FILES+= test_read_format_raw.data.Z.uu ${PACKAGE}FILES+= test_read_format_raw.data.gz.uu ${PACKAGE}FILES+= test_read_format_raw.data.uu ${PACKAGE}FILES+= test_read_format_tar_concatenated.tar.uu ${PACKAGE}FILES+= test_read_format_tar_empty_filename.tar.uu ${PACKAGE}FILES+= test_read_format_tar_empty_with_gnulabel.tar.uu ${PACKAGE}FILES+= test_read_format_tar_empty_pax.tar.Z.uu ${PACKAGE}FILES+= test_read_format_tar_filename_koi8r.tar.Z.uu ${PACKAGE}FILES+= 
test_read_format_ustar_filename_cp866.tar.Z.uu ${PACKAGE}FILES+= test_read_format_ustar_filename_eucjp.tar.Z.uu ${PACKAGE}FILES+= test_read_format_ustar_filename_koi8r.tar.Z.uu ${PACKAGE}FILES+= test_read_format_warc.warc.uu ${PACKAGE}FILES+= test_read_format_zip.zip.uu ${PACKAGE}FILES+= test_read_format_zip_7075_utf8_paths.zip.uu ${PACKAGE}FILES+= test_read_format_zip_bz2_hang.zip.uu ${PACKAGE}FILES+= test_read_format_zip_bzip2.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_bzip2_multi.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_comment_stored_1.zip.uu ${PACKAGE}FILES+= test_read_format_zip_comment_stored_2.zip.uu ${PACKAGE}FILES+= test_read_format_zip_encryption_data.zip.uu ${PACKAGE}FILES+= test_read_format_zip_encryption_header.zip.uu ${PACKAGE}FILES+= test_read_format_zip_encryption_partially.zip.uu ${PACKAGE}FILES+= test_read_format_zip_extra_padding.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_cp866.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_cp932.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_koi8r.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_utf8_jp.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_utf8_ru.zip.uu ${PACKAGE}FILES+= test_read_format_zip_filename_utf8_ru2.zip.uu ${PACKAGE}FILES+= test_read_format_zip_high_compression.zip.uu ${PACKAGE}FILES+= test_read_format_zip_jar.jar.uu ${PACKAGE}FILES+= test_read_format_zip_length_at_end.zip.uu ${PACKAGE}FILES+= test_read_format_zip_lzma_alone_leak.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_lzma.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_lzma_multi.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_mac_metadata.zip.uu ${PACKAGE}FILES+= test_read_format_zip_malformed1.zip.uu ${PACKAGE}FILES+= test_read_format_zip_msdos.zip.uu ${PACKAGE}FILES+= test_read_format_zip_nested.zip.uu ${PACKAGE}FILES+= test_read_format_zip_nofiletype.zip.uu ${PACKAGE}FILES+= test_read_format_zip_padded1.zip.uu ${PACKAGE}FILES+= test_read_format_zip_padded2.zip.uu ${PACKAGE}FILES+= test_read_format_zip_padded3.zip.uu ${PACKAGE}FILES+= test_read_format_zip_ppmd8.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_ppmd8_crash_1.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_ppmd8_crash_2.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_ppmd8_multi.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_sfx.uu ${PACKAGE}FILES+= test_read_format_zip_symlink.zip.uu ${PACKAGE}FILES+= test_read_format_zip_traditional_encryption_data.zip.uu ${PACKAGE}FILES+= test_read_format_zip_ux.zip.uu ${PACKAGE}FILES+= test_read_format_zip_with_invalid_traditional_eocd.zip.uu ${PACKAGE}FILES+= test_read_format_zip_winzip_aes128.zip.uu ${PACKAGE}FILES+= test_read_format_zip_winzip_aes256.zip.uu ${PACKAGE}FILES+= test_read_format_zip_winzip_aes256_large.zip.uu ${PACKAGE}FILES+= test_read_format_zip_winzip_aes256_stored.zip.uu ${PACKAGE}FILES+= test_read_format_zip_xz_multi.zipx.uu ${PACKAGE}FILES+= test_read_format_zip_zip64a.zip.uu ${PACKAGE}FILES+= test_read_format_zip_zip64b.zip.uu ${PACKAGE}FILES+= test_read_large_splitted_rar_aa.uu ${PACKAGE}FILES+= test_read_large_splitted_rar_ab.uu ${PACKAGE}FILES+= test_read_large_splitted_rar_ac.uu ${PACKAGE}FILES+= test_read_large_splitted_rar_ad.uu ${PACKAGE}FILES+= test_read_large_splitted_rar_ae.uu ${PACKAGE}FILES+= test_read_pax_schily_xattr.tar.uu ${PACKAGE}FILES+= test_read_splitted_rar_aa.uu ${PACKAGE}FILES+= test_read_splitted_rar_ab.uu ${PACKAGE}FILES+= test_read_splitted_rar_ac.uu ${PACKAGE}FILES+= test_read_splitted_rar_ad.uu ${PACKAGE}FILES+= test_read_too_many_filters.gz.uu 
${PACKAGE}FILES+= test_splitted_rar_seek_support_aa.uu ${PACKAGE}FILES+= test_splitted_rar_seek_support_ab.uu ${PACKAGE}FILES+= test_splitted_rar_seek_support_ac.uu ${PACKAGE}FILES+= test_write_disk_appledouble.cpio.gz.uu ${PACKAGE}FILES+= test_write_disk_hfs_compression.tgz.uu ${PACKAGE}FILES+= test_write_disk_mac_metadata.tar.gz.uu ${PACKAGE}FILES+= test_write_disk_no_hfs_compression.tgz.uu .include Index: projects/clang900-import/lib/libbe/be.c =================================================================== --- projects/clang900-import/lib/libbe/be.c (revision 352536) +++ projects/clang900-import/lib/libbe/be.c (revision 352537) @@ -1,1097 +1,1098 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017 Kyle J. Kneitinger * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "be.h" #include "be_impl.h" struct be_destroy_data { libbe_handle_t *lbh; char *snapname; }; #if SOON static int be_create_child_noent(libbe_handle_t *lbh, const char *active, const char *child_path); static int be_create_child_cloned(libbe_handle_t *lbh, const char *active); #endif /* Arbitrary... should tune */ #define BE_SNAP_SERIAL_MAX 1024 /* * Iterator function for locating the rootfs amongst the children of the * zfs_be_root set by loader(8). data is expected to be a libbe_handle_t *. */ static int be_locate_rootfs(libbe_handle_t *lbh) { struct statfs sfs; struct extmnttab entry; zfs_handle_t *zfs; /* * Check first if root is ZFS; if not, we'll bail on rootfs capture. * Unfortunately needed because zfs_path_to_zhandle will emit to * stderr if / isn't actually a ZFS filesystem, which we'd like * to avoid. */ if (statfs("/", &sfs) == 0) { statfs2mnttab(&sfs, &entry); if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) return (1); } else return (1); zfs = zfs_path_to_zhandle(lbh->lzh, "/", ZFS_TYPE_FILESYSTEM); if (zfs == NULL) return (1); strlcpy(lbh->rootfs, zfs_get_name(zfs), sizeof(lbh->rootfs)); zfs_close(zfs); return (0); } /* * Initializes the libbe context to operate in the root boot environment * dataset, for example, zroot/ROOT. 
*/ libbe_handle_t * libbe_init(const char *root) { char altroot[MAXPATHLEN]; libbe_handle_t *lbh; char *poolname, *pos; int pnamelen; lbh = NULL; poolname = pos = NULL; if ((lbh = calloc(1, sizeof(libbe_handle_t))) == NULL) goto err; if ((lbh->lzh = libzfs_init()) == NULL) goto err; /* * Grab rootfs, we'll work backwards from there if an optional BE root * has not been passed in. */ if (be_locate_rootfs(lbh) != 0) { if (root == NULL) goto err; *lbh->rootfs = '\0'; } if (root == NULL) { /* Strip off the final slash from rootfs to get the be root */ strlcpy(lbh->root, lbh->rootfs, sizeof(lbh->root)); pos = strrchr(lbh->root, '/'); if (pos == NULL) goto err; *pos = '\0'; } else strlcpy(lbh->root, root, sizeof(lbh->root)); if ((pos = strchr(lbh->root, '/')) == NULL) goto err; pnamelen = pos - lbh->root; poolname = malloc(pnamelen + 1); if (poolname == NULL) goto err; strlcpy(poolname, lbh->root, pnamelen + 1); if ((lbh->active_phandle = zpool_open(lbh->lzh, poolname)) == NULL) goto err; free(poolname); poolname = NULL; if (zpool_get_prop(lbh->active_phandle, ZPOOL_PROP_BOOTFS, lbh->bootfs, sizeof(lbh->bootfs), NULL, true) != 0) goto err; if (zpool_get_prop(lbh->active_phandle, ZPOOL_PROP_ALTROOT, altroot, sizeof(altroot), NULL, true) == 0 && strcmp(altroot, "-") != 0) lbh->altroot_len = strlen(altroot); return (lbh); err: if (lbh != NULL) { if (lbh->active_phandle != NULL) zpool_close(lbh->active_phandle); if (lbh->lzh != NULL) libzfs_fini(lbh->lzh); free(lbh); } free(poolname); return (NULL); } /* * Free memory allocated by libbe_init() */ void libbe_close(libbe_handle_t *lbh) { if (lbh->active_phandle != NULL) zpool_close(lbh->active_phandle); libzfs_fini(lbh->lzh); free(lbh); } /* * Proxy through to libzfs for the moment. */ void be_nicenum(uint64_t num, char *buf, size_t buflen) { zfs_nicenum(num, buf, buflen); } static int be_destroy_cb(zfs_handle_t *zfs_hdl, void *data) { char path[BE_MAXPATHLEN]; struct be_destroy_data *bdd; zfs_handle_t *snap; int err; bdd = (struct be_destroy_data *)data; if (bdd->snapname == NULL) { err = zfs_iter_children(zfs_hdl, be_destroy_cb, data); if (err != 0) return (err); return (zfs_destroy(zfs_hdl, false)); } /* If we're dealing with snapshots instead, delete that one alone */ err = zfs_iter_filesystems(zfs_hdl, be_destroy_cb, data); if (err != 0) return (err); /* * This part is intentionally glossing over any potential errors, * because there's a lot less potential for errors when we're cleaning * up snapshots rather than a full deep BE. The primary error case * here being if the snapshot doesn't exist in the first place, which * the caller will likely deem insignificant as long as it doesn't * exist after the call. Thus, such a missing snapshot shouldn't jam * up the destruction. */ snprintf(path, sizeof(path), "%s@%s", zfs_get_name(zfs_hdl), bdd->snapname); if (!zfs_dataset_exists(bdd->lbh->lzh, path, ZFS_TYPE_SNAPSHOT)) return (0); snap = zfs_open(bdd->lbh->lzh, path, ZFS_TYPE_SNAPSHOT); if (snap != NULL) zfs_destroy(snap, false); return (0); } /* * Destroy the boot environment or snapshot specified by the name * parameter. 
Options are or'd together with the possible values: * BE_DESTROY_FORCE : forces operation on mounted datasets * BE_DESTROY_ORIGIN: destroy the origin snapshot as well */ int be_destroy(libbe_handle_t *lbh, const char *name, int options) { struct be_destroy_data bdd; char origin[BE_MAXPATHLEN], path[BE_MAXPATHLEN]; zfs_handle_t *fs; char *snapdelim; int err, force, mounted; size_t rootlen; bdd.lbh = lbh; bdd.snapname = NULL; force = options & BE_DESTROY_FORCE; *origin = '\0'; be_root_concat(lbh, name, path); if ((snapdelim = strchr(path, '@')) == NULL) { if (!zfs_dataset_exists(lbh->lzh, path, ZFS_TYPE_FILESYSTEM)) return (set_error(lbh, BE_ERR_NOENT)); if (strcmp(path, lbh->rootfs) == 0 || strcmp(path, lbh->bootfs) == 0) return (set_error(lbh, BE_ERR_DESTROYACT)); fs = zfs_open(lbh->lzh, path, ZFS_TYPE_FILESYSTEM); if (fs == NULL) return (set_error(lbh, BE_ERR_ZFSOPEN)); if ((options & BE_DESTROY_ORIGIN) != 0 && zfs_prop_get(fs, ZFS_PROP_ORIGIN, origin, sizeof(origin), NULL, NULL, 0, 1) != 0) return (set_error(lbh, BE_ERR_NOORIGIN)); /* Don't destroy a mounted dataset unless force is specified */ if ((mounted = zfs_is_mounted(fs, NULL)) != 0) { if (force) { zfs_unmount(fs, NULL, 0); } else { free(bdd.snapname); return (set_error(lbh, BE_ERR_DESTROYMNT)); } } } else { if (!zfs_dataset_exists(lbh->lzh, path, ZFS_TYPE_SNAPSHOT)) return (set_error(lbh, BE_ERR_NOENT)); bdd.snapname = strdup(snapdelim + 1); if (bdd.snapname == NULL) return (set_error(lbh, BE_ERR_NOMEM)); *snapdelim = '\0'; fs = zfs_open(lbh->lzh, path, ZFS_TYPE_DATASET); if (fs == NULL) { free(bdd.snapname); return (set_error(lbh, BE_ERR_ZFSOPEN)); } } err = be_destroy_cb(fs, &bdd); zfs_close(fs); free(bdd.snapname); if (err != 0) { /* Children are still present or the mount is referenced */ if (err == EBUSY) return (set_error(lbh, BE_ERR_DESTROYMNT)); return (set_error(lbh, BE_ERR_UNKNOWN)); } if ((options & BE_DESTROY_ORIGIN) == 0) return (0); /* The origin can't possibly be shorter than the BE root */ rootlen = strlen(lbh->root); if (*origin == '\0' || strlen(origin) <= rootlen + 1) return (set_error(lbh, BE_ERR_INVORIGIN)); /* * We'll be chopping off the BE root and running this back through * be_destroy, so that we properly handle the origin snapshot whether * it be that of a deep BE or not. */ if (strncmp(origin, lbh->root, rootlen) != 0 || origin[rootlen] != '/') return (0); return (be_destroy(lbh, origin + rootlen + 1, options & ~BE_DESTROY_ORIGIN)); } static void be_setup_snapshot_name(libbe_handle_t *lbh, char *buf, size_t buflen) { time_t rawtime; int len, serial; time(&rawtime); len = strlen(buf); len += strftime(buf + len, buflen - len, "@%F-%T", localtime(&rawtime)); /* No room for serial... 
caller will do its best */ if (buflen - len < 2) return; for (serial = 0; serial < BE_SNAP_SERIAL_MAX; ++serial) { snprintf(buf + len, buflen - len, "-%d", serial); if (!zfs_dataset_exists(lbh->lzh, buf, ZFS_TYPE_SNAPSHOT)) return; } } int be_snapshot(libbe_handle_t *lbh, const char *source, const char *snap_name, bool recursive, char *result) { char buf[BE_MAXPATHLEN]; int err; be_root_concat(lbh, source, buf); if ((err = be_exists(lbh, buf)) != 0) return (set_error(lbh, err)); if (snap_name != NULL) { if (strlcat(buf, "@", sizeof(buf)) >= sizeof(buf)) return (set_error(lbh, BE_ERR_INVALIDNAME)); if (strlcat(buf, snap_name, sizeof(buf)) >= sizeof(buf)) return (set_error(lbh, BE_ERR_INVALIDNAME)); if (result != NULL) snprintf(result, BE_MAXPATHLEN, "%s@%s", source, snap_name); } else { be_setup_snapshot_name(lbh, buf, sizeof(buf)); if (result != NULL && strlcpy(result, strrchr(buf, '/') + 1, sizeof(buf)) >= sizeof(buf)) return (set_error(lbh, BE_ERR_INVALIDNAME)); } if ((err = zfs_snapshot(lbh->lzh, buf, recursive, NULL)) != 0) { switch (err) { case EZFS_INVALIDNAME: return (set_error(lbh, BE_ERR_INVALIDNAME)); default: /* * The other errors that zfs_ioc_snapshot might return * shouldn't happen if we've set things up properly, so * we'll gloss over them and call it UNKNOWN as it will * require further triage. */ if (errno == ENOTSUP) return (set_error(lbh, BE_ERR_NOPOOL)); return (set_error(lbh, BE_ERR_UNKNOWN)); } } return (BE_ERR_SUCCESS); } /* * Create the boot environment specified by the name parameter */ int be_create(libbe_handle_t *lbh, const char *name) { int err; err = be_create_from_existing(lbh, name, be_active_path(lbh)); return (set_error(lbh, err)); } static int be_deep_clone_prop(int prop, void *cb) { int err; struct libbe_dccb *dccb; zprop_source_t src; char pval[BE_MAXPATHLEN]; char source[BE_MAXPATHLEN]; char *val; dccb = cb; /* Skip some properties we don't want to touch */ if (prop == ZFS_PROP_CANMOUNT) return (ZPROP_CONT); /* Don't copy readonly properties */ if (zfs_prop_readonly(prop)) return (ZPROP_CONT); if ((err = zfs_prop_get(dccb->zhp, prop, (char *)&pval, sizeof(pval), &src, (char *)&source, sizeof(source), false))) /* Just continue if we fail to read a property */ return (ZPROP_CONT); /* * Only copy locally defined or received properties. This continues * to avoid temporary/default/local properties intentionally without * breaking received datasets. */ if (src != ZPROP_SRC_LOCAL && src != ZPROP_SRC_RECEIVED) return (ZPROP_CONT); /* Augment mountpoint with altroot, if needed */ val = pval; if (prop == ZFS_PROP_MOUNTPOINT) val = be_mountpoint_augmented(dccb->lbh, val); nvlist_add_string(dccb->props, zfs_prop_to_name(prop), val); return (ZPROP_CONT); } /* * Return the corresponding boot environment path for a given * dataset path, the constructed path is placed in 'result'. * * example: say our new boot environment name is 'bootenv' and * the dataset path is 'zroot/ROOT/default/data/set'. * * result should produce: 'zroot/ROOT/bootenv/data/set' */ static int be_get_path(struct libbe_deep_clone *ldc, const char *dspath, char *result, int result_size) { char *pos; char *child_dataset; /* match the root path for the boot environments */ pos = strstr(dspath, ldc->lbh->root); /* no match, different pools? 
*/ if (pos == NULL) return (BE_ERR_BADPATH); /* root path of the new boot environment */ snprintf(result, result_size, "%s/%s", ldc->lbh->root, ldc->bename); /* gets us to the parent dataset, the +1 consumes a trailing slash */ pos += strlen(ldc->lbh->root) + 1; /* skip the parent dataset */ if ((child_dataset = strchr(pos, '/')) != NULL) strlcat(result, child_dataset, result_size); return (BE_ERR_SUCCESS); } static int be_clone_cb(zfs_handle_t *ds, void *data) { int err; char be_path[BE_MAXPATHLEN]; char snap_path[BE_MAXPATHLEN]; const char *dspath; zfs_handle_t *snap_hdl; nvlist_t *props; struct libbe_deep_clone *ldc; struct libbe_dccb dccb; ldc = (struct libbe_deep_clone *)data; dspath = zfs_get_name(ds); snprintf(snap_path, sizeof(snap_path), "%s@%s", dspath, ldc->snapname); /* construct the boot environment path from the dataset we're cloning */ if (be_get_path(ldc, dspath, be_path, sizeof(be_path)) != BE_ERR_SUCCESS) return (set_error(ldc->lbh, BE_ERR_UNKNOWN)); /* the dataset to be created (i.e. the boot environment) already exists */ if (zfs_dataset_exists(ldc->lbh->lzh, be_path, ZFS_TYPE_DATASET)) return (set_error(ldc->lbh, BE_ERR_EXISTS)); /* no snapshot found for this dataset, silently skip it */ if (!zfs_dataset_exists(ldc->lbh->lzh, snap_path, ZFS_TYPE_SNAPSHOT)) return (0); if ((snap_hdl = zfs_open(ldc->lbh->lzh, snap_path, ZFS_TYPE_SNAPSHOT)) == NULL) return (set_error(ldc->lbh, BE_ERR_ZFSOPEN)); nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP); nvlist_add_string(props, "canmount", "noauto"); dccb.lbh = ldc->lbh; dccb.zhp = ds; dccb.props = props; if (zprop_iter(be_deep_clone_prop, &dccb, B_FALSE, B_FALSE, ZFS_TYPE_FILESYSTEM) == ZPROP_INVAL) return (-1); if ((err = zfs_clone(snap_hdl, be_path, props)) != 0) return (set_error(ldc->lbh, BE_ERR_ZFSCLONE)); nvlist_free(props); zfs_close(snap_hdl); if (ldc->depth_limit == -1 || ldc->depth < ldc->depth_limit) { ldc->depth++; err = zfs_iter_filesystems(ds, be_clone_cb, ldc); ldc->depth--; } return (set_error(ldc->lbh, err)); } /* * Create a boot environment with a given name from a given snapshot. * Snapshots can be in the format 'zroot/ROOT/default@snapshot' or * 'default@snapshot'. In the latter case, 'default@snapshot' will be prepended * with the root path that libbe was initailized with. */ static int be_clone(libbe_handle_t *lbh, const char *bename, const char *snapshot, int depth) { int err; char snap_path[BE_MAXPATHLEN]; char *parentname, *snapname; zfs_handle_t *parent_hdl; struct libbe_deep_clone ldc; /* ensure the boot environment name is valid */ if ((err = be_validate_name(lbh, bename)) != 0) return (set_error(lbh, err)); /* * prepend the boot environment root path if we're * given a partial snapshot name. 
*/ if ((err = be_root_concat(lbh, snapshot, snap_path)) != 0) return (set_error(lbh, err)); /* ensure the snapshot exists */ if ((err = be_validate_snap(lbh, snap_path)) != 0) return (set_error(lbh, err)); /* get a copy of the snapshot path so we can disect it */ if ((parentname = strdup(snap_path)) == NULL) return (set_error(lbh, BE_ERR_UNKNOWN)); /* split dataset name from snapshot name */ snapname = strchr(parentname, '@'); if (snapname == NULL) { free(parentname); return (set_error(lbh, BE_ERR_UNKNOWN)); } *snapname = '\0'; snapname++; /* set-up the boot environment */ ldc.lbh = lbh; ldc.bename = bename; ldc.snapname = snapname; ldc.depth = 0; ldc.depth_limit = depth; /* the boot environment will be cloned from this dataset */ parent_hdl = zfs_open(lbh->lzh, parentname, ZFS_TYPE_DATASET); /* create the boot environment */ err = be_clone_cb(parent_hdl, &ldc); free(parentname); return (set_error(lbh, err)); } /* * Create a boot environment from pre-existing snapshot, specifying a depth. */ int be_create_depth(libbe_handle_t *lbh, const char *bename, const char *snap, int depth) { return (be_clone(lbh, bename, snap, depth)); } /* * Create the boot environment from pre-existing snapshot */ int be_create_from_existing_snap(libbe_handle_t *lbh, const char *bename, const char *snap) { return (be_clone(lbh, bename, snap, -1)); } /* * Create a boot environment from an existing boot environment */ int be_create_from_existing(libbe_handle_t *lbh, const char *bename, const char *old) { int err; char snap[BE_MAXPATHLEN]; if ((err = be_snapshot(lbh, old, NULL, true, snap)) != 0) return (set_error(lbh, err)); err = be_clone(lbh, bename, snap, -1); return (set_error(lbh, err)); } /* * Verifies that a snapshot has a valid name, exists, and has a mountpoint of * '/'. Returns BE_ERR_SUCCESS (0), upon success, or the relevant BE_ERR_* upon * failure. Does not set the internal library error state. */ int be_validate_snap(libbe_handle_t *lbh, const char *snap_name) { if (strlen(snap_name) >= BE_MAXPATHLEN) return (BE_ERR_PATHLEN); if (!zfs_name_valid(snap_name, ZFS_TYPE_SNAPSHOT)) return (BE_ERR_INVALIDNAME); if (!zfs_dataset_exists(lbh->lzh, snap_name, ZFS_TYPE_SNAPSHOT)) return (BE_ERR_NOENT); return (BE_ERR_SUCCESS); } /* * Idempotently appends the name argument to the root boot environment path * and copies the resulting string into the result buffer (which is assumed * to be at least BE_MAXPATHLEN characters long. Returns BE_ERR_SUCCESS upon * success, BE_ERR_PATHLEN if the resulting path is longer than BE_MAXPATHLEN, * or BE_ERR_INVALIDNAME if the name is a path that does not begin with * zfs_be_root. Does not set internal library error state. */ int be_root_concat(libbe_handle_t *lbh, const char *name, char *result) { size_t name_len, root_len; name_len = strlen(name); root_len = strlen(lbh->root); /* Act idempotently; return be name if it is already a full path */ if (strrchr(name, '/') != NULL) { if (strstr(name, lbh->root) != name) return (BE_ERR_INVALIDNAME); if (name_len >= BE_MAXPATHLEN) return (BE_ERR_PATHLEN); strlcpy(result, name, BE_MAXPATHLEN); return (BE_ERR_SUCCESS); } else if (name_len + root_len + 1 < BE_MAXPATHLEN) { snprintf(result, BE_MAXPATHLEN, "%s/%s", lbh->root, name); return (BE_ERR_SUCCESS); } return (BE_ERR_PATHLEN); } /* * Verifies the validity of a boot environment name (A-Za-z0-9-_.). Returns * BE_ERR_SUCCESS (0) if name is valid, otherwise returns BE_ERR_INVALIDNAME * or BE_ERR_PATHLEN. * Does not set internal library error state. 
*/ int be_validate_name(libbe_handle_t *lbh, const char *name) { /* * Impose the additional restriction that the entire dataset name must * not exceed the maximum length of a dataset, i.e. MAXNAMELEN. */ if (strlen(lbh->root) + 1 + strlen(name) > MAXNAMELEN) return (BE_ERR_PATHLEN); if (!zfs_name_valid(name, ZFS_TYPE_DATASET)) return (BE_ERR_INVALIDNAME); return (BE_ERR_SUCCESS); } /* * usage */ int be_rename(libbe_handle_t *lbh, const char *old, const char *new) { char full_old[BE_MAXPATHLEN]; char full_new[BE_MAXPATHLEN]; zfs_handle_t *zfs_hdl; int err; /* * be_validate_name is documented not to set error state, so we should * do so here. */ if ((err = be_validate_name(lbh, new)) != 0) return (set_error(lbh, err)); if ((err = be_root_concat(lbh, old, full_old)) != 0) return (set_error(lbh, err)); if ((err = be_root_concat(lbh, new, full_new)) != 0) return (set_error(lbh, err)); if (!zfs_dataset_exists(lbh->lzh, full_old, ZFS_TYPE_DATASET)) return (set_error(lbh, BE_ERR_NOENT)); if (zfs_dataset_exists(lbh->lzh, full_new, ZFS_TYPE_DATASET)) return (set_error(lbh, BE_ERR_EXISTS)); if ((zfs_hdl = zfs_open(lbh->lzh, full_old, ZFS_TYPE_FILESYSTEM)) == NULL) return (set_error(lbh, BE_ERR_ZFSOPEN)); /* recurse, nounmount, forceunmount */ struct renameflags flags = { .nounmount = 1, }; err = zfs_rename(zfs_hdl, NULL, full_new, flags); zfs_close(zfs_hdl); if (err != 0) return (set_error(lbh, BE_ERR_UNKNOWN)); return (0); } int be_export(libbe_handle_t *lbh, const char *bootenv, int fd) { char snap_name[BE_MAXPATHLEN]; char buf[BE_MAXPATHLEN]; zfs_handle_t *zfs; + sendflags_t flags = { 0 }; int err; if ((err = be_snapshot(lbh, bootenv, NULL, true, snap_name)) != 0) /* Use the error set by be_snapshot */ return (err); be_root_concat(lbh, snap_name, buf); if ((zfs = zfs_open(lbh->lzh, buf, ZFS_TYPE_DATASET)) == NULL) return (set_error(lbh, BE_ERR_ZFSOPEN)); - err = zfs_send_one(zfs, NULL, fd, 0); + err = zfs_send_one(zfs, NULL, fd, flags); zfs_close(zfs); return (err); } int be_import(libbe_handle_t *lbh, const char *bootenv, int fd) { char buf[BE_MAXPATHLEN]; nvlist_t *props; zfs_handle_t *zfs; recvflags_t flags = { .nomount = 1 }; int err; be_root_concat(lbh, bootenv, buf); if ((err = zfs_receive(lbh->lzh, buf, NULL, &flags, fd, NULL)) != 0) { switch (err) { case EINVAL: return (set_error(lbh, BE_ERR_NOORIGIN)); case ENOENT: return (set_error(lbh, BE_ERR_NOENT)); case EIO: return (set_error(lbh, BE_ERR_IO)); default: return (set_error(lbh, BE_ERR_UNKNOWN)); } } if ((zfs = zfs_open(lbh->lzh, buf, ZFS_TYPE_FILESYSTEM)) == NULL) return (set_error(lbh, BE_ERR_ZFSOPEN)); nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP); nvlist_add_string(props, "canmount", "noauto"); nvlist_add_string(props, "mountpoint", "none"); err = zfs_prop_set_list(zfs, props); nvlist_free(props); zfs_close(zfs); if (err != 0) return (set_error(lbh, BE_ERR_UNKNOWN)); return (0); } #if SOON static int be_create_child_noent(libbe_handle_t *lbh, const char *active, const char *child_path) { nvlist_t *props; zfs_handle_t *zfs; int err; nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP); nvlist_add_string(props, "canmount", "noauto"); nvlist_add_string(props, "mountpoint", child_path); /* Create */ if ((err = zfs_create(lbh->lzh, active, ZFS_TYPE_DATASET, props)) != 0) { switch (err) { case EZFS_EXISTS: return (set_error(lbh, BE_ERR_EXISTS)); case EZFS_NOENT: return (set_error(lbh, BE_ERR_NOENT)); case EZFS_BADTYPE: case EZFS_BADVERSION: return (set_error(lbh, BE_ERR_NOPOOL)); case EZFS_BADPROP: default: /* We set something up wrong, 
probably... */ return (set_error(lbh, BE_ERR_UNKNOWN)); } } nvlist_free(props); if ((zfs = zfs_open(lbh->lzh, active, ZFS_TYPE_DATASET)) == NULL) return (set_error(lbh, BE_ERR_ZFSOPEN)); /* Set props */ if ((err = zfs_prop_set(zfs, "canmount", "noauto")) != 0) { zfs_close(zfs); /* * Similar to other cases, this shouldn't fail unless we've * done something wrong. This is a new dataset that shouldn't * have been mounted anywhere between creation and now. */ if (err == EZFS_NOMEM) return (set_error(lbh, BE_ERR_NOMEM)); return (set_error(lbh, BE_ERR_UNKNOWN)); } zfs_close(zfs); return (BE_ERR_SUCCESS); } static int be_create_child_cloned(libbe_handle_t *lbh, const char *active) { char buf[BE_MAXPATHLEN], tmp[BE_MAXPATHLEN];; zfs_handle_t *zfs; int err; /* XXX TODO ? */ /* * Establish if the existing path is a zfs dataset or just * the subdirectory of one */ strlcpy(tmp, "tmp/be_snap.XXXXX", sizeof(tmp)); if (mktemp(tmp) == NULL) return (set_error(lbh, BE_ERR_UNKNOWN)); be_root_concat(lbh, tmp, buf); printf("Here %s?\n", buf); if ((err = zfs_snapshot(lbh->lzh, buf, false, NULL)) != 0) { switch (err) { case EZFS_INVALIDNAME: return (set_error(lbh, BE_ERR_INVALIDNAME)); default: /* * The other errors that zfs_ioc_snapshot might return * shouldn't happen if we've set things up properly, so * we'll gloss over them and call it UNKNOWN as it will * require further triage. */ if (errno == ENOTSUP) return (set_error(lbh, BE_ERR_NOPOOL)); return (set_error(lbh, BE_ERR_UNKNOWN)); } } /* Clone */ if ((zfs = zfs_open(lbh->lzh, buf, ZFS_TYPE_SNAPSHOT)) == NULL) return (BE_ERR_ZFSOPEN); if ((err = zfs_clone(zfs, active, NULL)) != 0) /* XXX TODO correct error */ return (set_error(lbh, BE_ERR_UNKNOWN)); /* set props */ zfs_close(zfs); return (BE_ERR_SUCCESS); } int be_add_child(libbe_handle_t *lbh, const char *child_path, bool cp_if_exists) { struct stat sb; char active[BE_MAXPATHLEN], buf[BE_MAXPATHLEN]; nvlist_t *props; const char *s; /* Require absolute paths */ if (*child_path != '/') return (set_error(lbh, BE_ERR_BADPATH)); strlcpy(active, be_active_path(lbh), BE_MAXPATHLEN); strcpy(buf, active); /* Create non-mountable parent dataset(s) */ s = child_path; for (char *p; (p = strchr(s+1, '/')) != NULL; s = p) { size_t len = p - s; strncat(buf, s, len); nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP); nvlist_add_string(props, "canmount", "off"); nvlist_add_string(props, "mountpoint", "none"); zfs_create(lbh->lzh, buf, ZFS_TYPE_DATASET, props); nvlist_free(props); } /* Path does not exist as a descendent of / yet */ if (strlcat(active, child_path, BE_MAXPATHLEN) >= BE_MAXPATHLEN) return (set_error(lbh, BE_ERR_PATHLEN)); if (stat(child_path, &sb) != 0) { /* Verify that error is ENOENT */ if (errno != ENOENT) return (set_error(lbh, BE_ERR_UNKNOWN)); return (be_create_child_noent(lbh, active, child_path)); } else if (cp_if_exists) /* Path is already a descendent of / and should be copied */ return (be_create_child_cloned(lbh, active)); return (set_error(lbh, BE_ERR_EXISTS)); } #endif /* SOON */ static int be_set_nextboot(libbe_handle_t *lbh, nvlist_t *config, uint64_t pool_guid, const char *zfsdev) { nvlist_t **child; uint64_t vdev_guid; int c, children; if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; ++c) if (be_set_nextboot(lbh, child[c], pool_guid, zfsdev) != 0) return (1); return (0); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) != 0) { return (1); } if (zpool_nextboot(lbh->lzh, pool_guid, vdev_guid, zfsdev) != 0) { 
perror("ZFS_IOC_NEXTBOOT failed"); return (1); } return (0); } /* * Deactivate old BE dataset; currently just sets canmount=noauto */ static int be_deactivate(libbe_handle_t *lbh, const char *ds) { zfs_handle_t *zfs; if ((zfs = zfs_open(lbh->lzh, ds, ZFS_TYPE_DATASET)) == NULL) return (1); if (zfs_prop_set(zfs, "canmount", "noauto") != 0) return (1); zfs_close(zfs); return (0); } int be_activate(libbe_handle_t *lbh, const char *bootenv, bool temporary) { char be_path[BE_MAXPATHLEN]; char buf[BE_MAXPATHLEN]; nvlist_t *config, *dsprops, *vdevs; char *origin; uint64_t pool_guid; zfs_handle_t *zhp; int err; be_root_concat(lbh, bootenv, be_path); /* Note: be_exists fails if mountpoint is not / */ if ((err = be_exists(lbh, be_path)) != 0) return (set_error(lbh, err)); if (temporary) { config = zpool_get_config(lbh->active_phandle, NULL); if (config == NULL) /* config should be fetchable... */ return (set_error(lbh, BE_ERR_UNKNOWN)); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) /* Similarly, it shouldn't be possible */ return (set_error(lbh, BE_ERR_UNKNOWN)); /* Expected format according to zfsbootcfg(8) man */ snprintf(buf, sizeof(buf), "zfs:%s:", be_path); /* We have no config tree */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdevs) != 0) return (set_error(lbh, BE_ERR_NOPOOL)); return (be_set_nextboot(lbh, vdevs, pool_guid, buf)); } else { if (be_deactivate(lbh, lbh->bootfs) != 0) return (-1); /* Obtain bootenv zpool */ err = zpool_set_prop(lbh->active_phandle, "bootfs", be_path); if (err) return (-1); zhp = zfs_open(lbh->lzh, be_path, ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (-1); if (be_prop_list_alloc(&dsprops) != 0) return (-1); if (be_get_dataset_props(lbh, be_path, dsprops) != 0) { nvlist_free(dsprops); return (-1); } if (nvlist_lookup_string(dsprops, "origin", &origin) == 0) err = zfs_promote(zhp); nvlist_free(dsprops); zfs_close(zhp); if (err) return (-1); } return (BE_ERR_SUCCESS); } Index: projects/clang900-import/lib/libc/gen/sysctlnametomib.c =================================================================== --- projects/clang900-import/lib/libc/gen/sysctlnametomib.c (revision 352536) +++ projects/clang900-import/lib/libc/gen/sysctlnametomib.c (revision 352537) @@ -1,57 +1,57 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright 2001 The FreeBSD Project. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE FREEBSD PROJECT ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE FREEBSD PROJECT BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include /* * This function uses a presently undocumented interface to the kernel * to walk the tree and get the type so it can print the value. * This interface is under work and consideration, and should probably * be killed with a big axe by the first person who can find the time. * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. */ int sysctlnametomib(const char *name, int *mibp, size_t *sizep) { int oid[2]; int error; - oid[0] = 0; - oid[1] = 3; + oid[0] = CTL_SYSCTL; + oid[1] = CTL_SYSCTL_NAME2OID; *sizep *= sizeof(int); error = sysctl(oid, 2, mibp, sizep, name, strlen(name)); *sizep /= sizeof(int); return (error); } Index: projects/clang900-import/lib/libc/sys/open.2 =================================================================== --- projects/clang900-import/lib/libc/sys/open.2 (revision 352536) +++ projects/clang900-import/lib/libc/sys/open.2 (revision 352537) @@ -1,612 +1,617 @@ .\" Copyright (c) 1980, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)open.2 8.2 (Berkeley) 11/16/93 .\" $FreeBSD$ .\" -.Dd June 14, 2019 +.Dd September 17, 2019 .Dt OPEN 2 .Os .Sh NAME .Nm open , openat .Nd open or create a file for reading, writing or executing .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In fcntl.h .Ft int .Fn open "const char *path" "int flags" "..." .Ft int .Fn openat "int fd" "const char *path" "int flags" "..." 
.Sh DESCRIPTION The file name specified by .Fa path is opened for either execution or reading and/or writing as specified by the argument .Fa flags and the file descriptor returned to the calling process. The .Fa flags argument may indicate the file is to be created if it does not exist (by specifying the .Dv O_CREAT flag). In this case .Fn open and .Fn openat require an additional argument .Fa "mode_t mode" , and the file is created with mode .Fa mode as described in .Xr chmod 2 and modified by the process' umask value (see .Xr umask 2 ) . .Pp The .Fn openat function is equivalent to the .Fn open function except in the case where the .Fa path specifies a relative path, or the .Dv O_BENEATH flag is provided. For .Fn openat and relative .Fa path , the file to be opened is determined relative to the directory associated with the file descriptor .Fa fd instead of the current working directory. The .Fa flag parameter and the optional fourth parameter correspond exactly to the parameters of .Fn open . If .Fn openat is passed the special value .Dv AT_FDCWD in the .Fa fd parameter, the current working directory is used and the behavior is identical to a call to .Fn open . .Pp When .Fn openat is called with an absolute .Fa path without the .Dv O_BENEATH flag, it ignores the .Fa fd argument. When .Dv O_BENEATH is specified with an absolute .Fa path , a directory passed by the .Fa fd argument is used as the topping point for the resolution. See the definition of the .Dv O_BENEATH flag below. .Pp In .Xr capsicum 4 capability mode, .Fn open is not permitted. The .Fa path argument to .Fn openat must be strictly relative to a file descriptor .Fa fd , as defined in .Pa sys/kern/vfs_lookup.c . .Fa path must not be an absolute path and must not contain ".." components which cause the path resolution to escape the directory hierarchy starting at .Fa fd . Additionally, no symbolic link in .Fa path may target absolute path or contain escaping ".." components. .Fa fd must not be .Dv AT_FDCWD . .Pp If the .Dv vfs.lookup_cap_dotdot .Xr sysctl 3 MIB is set to zero, ".." components in the paths, used in capability mode, or with the .Dv O_BENEATH flag, are completely disabled. If the .Dv vfs.lookup_cap_dotdot_nonlocal MIB is set to zero, ".." is not allowed if found on non-local filesystem. .Pp The flags specified are formed by .Em or Ns 'ing the following values .Pp .Bd -literal -offset indent -compact O_RDONLY open for reading only O_WRONLY open for writing only O_RDWR open for reading and writing O_EXEC open for execute only O_NONBLOCK do not block on open O_APPEND append on each write O_CREAT create file if it does not exist O_TRUNC truncate size to 0 O_EXCL error if create and file exists O_SHLOCK atomically obtain a shared lock O_EXLOCK atomically obtain an exclusive lock O_DIRECT eliminate or reduce cache effects O_FSYNC synchronous writes O_SYNC synchronous writes O_NOFOLLOW do not follow symlinks O_NOCTTY ignored O_TTY_INIT ignored O_DIRECTORY error if file is not a directory O_CLOEXEC set FD_CLOEXEC upon open O_VERIFY verify the contents of the file O_BENEATH require path to be strictly relative to topping directory .Ed .Pp Opening a file with .Dv O_APPEND set causes each write on the file to be appended to the end. If .Dv O_TRUNC is specified and the file exists, the file is truncated to zero length. If .Dv O_EXCL is set with .Dv O_CREAT and the file already exists, .Fn open returns an error. This may be used to implement a simple exclusive access locking mechanism. 
If .Dv O_EXCL is set and the last component of the pathname is a symbolic link, .Fn open will fail even if the symbolic link points to a non-existent name. If the .Dv O_NONBLOCK flag is specified and the .Fn open system call would result in the process being blocked for some reason (e.g., waiting for carrier on a dialup line), .Fn open returns immediately. The descriptor remains in non-blocking mode for subsequent operations. .Pp If .Dv O_FSYNC is used in the mask, all writes will immediately and synchronously be written to disk. .Pp .Dv O_SYNC is a synonym for .Dv O_FSYNC required by .Tn POSIX . .Pp If .Dv O_NOFOLLOW is used in the mask and the target file passed to .Fn open is a symbolic link then the .Fn open will fail. .Pp When opening a file, a lock with .Xr flock 2 semantics can be obtained by setting .Dv O_SHLOCK for a shared lock, or .Dv O_EXLOCK for an exclusive lock. If creating a file with .Dv O_CREAT , the request for the lock will never fail (provided that the underlying file system supports locking). .Pp .Dv O_DIRECT may be used to minimize or eliminate the cache effects of reading and writing. The system will attempt to avoid caching the data you read or write. If it cannot avoid caching the data, it will minimize the impact the data has on the cache. Use of this flag can drastically reduce performance if not used with care. .Pp .Dv O_NOCTTY may be used to ensure the OS does not assign this file as the controlling terminal when it opens a tty device. This is the default on .Fx , but is present for .Tn POSIX compatibility. The .Fn open system call will not assign controlling terminals on .Fx . .Pp .Dv O_TTY_INIT may be used to ensure the OS restores the terminal attributes when initially opening a TTY. This is the default on .Fx , but is present for .Tn POSIX compatibility. The initial call to .Fn open on a TTY will always restore default terminal attributes on .Fx . .Pp .Dv O_DIRECTORY may be used to ensure the resulting file descriptor refers to a directory. This flag can be used to prevent applications with elevated privileges from opening files which are even unsafe to open with .Dv O_RDONLY , such as device nodes. .Pp .Dv O_CLOEXEC may be used to set .Dv FD_CLOEXEC flag for the newly returned file descriptor. .Pp .Dv O_VERIFY may be used to indicate to the kernel that the contents of the file should be verified before allowing the open to proceed. The details of what .Dq verified means is implementation specific. The run-time linker (rtld) uses this flag to ensure shared objects have been verified before operating on them. .Pp .Dv O_BENEATH returns .Er ENOTCAPABLE if the specified relative path, after resolving all symlinks and ".." references, does not reside in the directory hierarchy of children beneath the topping directory. Topping directory is the process current directory if relative .Fa path is used for .Fn open , and the directory referenced by the .Fa fd argument when using .Fn openat . If the specified path is absolute, .Dv O_BENEATH allows arbitrary prefix that ends up at the topping directory, after which all further resolved components must be under it. .Pp If successful, .Fn open returns a non-negative integer, termed a file descriptor. It returns \-1 on failure. The file pointer used to mark the current position within the file is set to the beginning of the file. .Pp If a sleeping open of a device node from .Xr devfs 5 is interrupted by a signal, the call always fails with .Er EINTR , even if the .Dv SA_RESTART flag is set for the signal. 
A sleeping open of a fifo (see .Xr mkfifo 2 ) is restarted as normal. .Pp When a new file is created it is given the group of the directory which contains it. .Pp Unless .Dv O_CLOEXEC flag was specified, the new descriptor is set to remain open across .Xr execve 2 system calls; see .Xr close 2 , .Xr fcntl 2 and .Dv O_CLOEXEC description. .Pp The system imposes a limit on the number of file descriptors open simultaneously by one process. The .Xr getdtablesize 2 system call returns the current system limit. .Sh RETURN VALUES If successful, .Fn open and .Fn openat return a non-negative integer, termed a file descriptor. They return \-1 on failure, and set .Va errno to indicate the error. .Sh ERRORS The named file is opened unless: .Bl -tag -width Er .It Bq Er ENOTDIR A component of the path prefix is not a directory. .It Bq Er ENAMETOOLONG A component of a pathname exceeded 255 characters, or an entire path name exceeded 1023 characters. .It Bq Er ENOENT .Dv O_CREAT is not set and the named file does not exist. .It Bq Er ENOENT A component of the path name that must exist does not exist. .It Bq Er EACCES Search permission is denied for a component of the path prefix. .It Bq Er EACCES The required permissions (for reading and/or writing) are denied for the given flags. .It Bq Er EACCES .Dv O_TRUNC is specified and write permission is denied. .It Bq Er EACCES .Dv O_CREAT is specified, the file does not exist, and the directory in which it is to be created does not permit writing. .It Bq Er EPERM .Dv O_CREAT is specified, the file does not exist, and the directory in which it is to be created has its immutable flag set, see the .Xr chflags 2 manual page for more information. .It Bq Er EPERM The named file has its immutable flag set and the file is to be modified. .It Bq Er EPERM The named file has its append-only flag set, the file is to be modified, and .Dv O_TRUNC is specified or .Dv O_APPEND is not specified. .It Bq Er ELOOP Too many symbolic links were encountered in translating the pathname. .It Bq Er EISDIR The named file is a directory, and the arguments specify it is to be modified. +.It Bq Er EISDIR +The named file is a directory, and the flags specified +.Dv O_CREAT +without +.Dv O_DIRECTORY . .It Bq Er EROFS The named file resides on a read-only file system, and the file is to be modified. .It Bq Er EROFS .Dv O_CREAT is specified and the named file would reside on a read-only file system. .It Bq Er EMFILE The process has already reached its limit for open file descriptors. .It Bq Er ENFILE The system file table is full. .It Bq Er EMLINK .Dv O_NOFOLLOW was specified and the target is a symbolic link. .It Bq Er ENXIO The named file is a character special or block special file, and the device associated with this special file does not exist. .It Bq Er ENXIO .Dv O_NONBLOCK is set, the named file is a fifo, .Dv O_WRONLY is set, and no process has the file open for reading. .It Bq Er EINTR The .Fn open operation was interrupted by a signal. .It Bq Er EOPNOTSUPP .Dv O_SHLOCK or .Dv O_EXLOCK is specified but the underlying file system does not support locking. .It Bq Er EOPNOTSUPP The named file is a special file mounted through a file system that does not support access to it (e.g.\& NFS). .It Bq Er EWOULDBLOCK .Dv O_NONBLOCK and one of .Dv O_SHLOCK or .Dv O_EXLOCK is specified and the file is locked. 
.It Bq Er ENOSPC .Dv O_CREAT is specified, the file does not exist, and the directory in which the entry for the new file is being placed cannot be extended because there is no space left on the file system containing the directory. .It Bq Er ENOSPC .Dv O_CREAT is specified, the file does not exist, and there are no free inodes on the file system on which the file is being created. .It Bq Er EDQUOT .Dv O_CREAT is specified, the file does not exist, and the directory in which the entry for the new file is being placed cannot be extended because the user's quota of disk blocks on the file system containing the directory has been exhausted. .It Bq Er EDQUOT .Dv O_CREAT is specified, the file does not exist, and the user's quota of inodes on the file system on which the file is being created has been exhausted. .It Bq Er EIO An I/O error occurred while making the directory entry or allocating the inode for .Dv O_CREAT . .It Bq Er ETXTBSY The file is a pure procedure (shared text) file that is being executed and the .Fn open system call requests write access. .It Bq Er EFAULT The .Fa path argument points outside the process's allocated address space. .It Bq Er EEXIST .Dv O_CREAT and .Dv O_EXCL were specified and the file exists. .It Bq Er EOPNOTSUPP An attempt was made to open a socket (not currently implemented). .It Bq Er EINVAL An attempt was made to open a descriptor with an illegal combination of .Dv O_RDONLY , .Dv O_WRONLY , .Dv O_RDWR and .Dv O_EXEC . .It Bq Er EBADF The .Fa path argument does not specify an absolute path and the .Fa fd argument is neither .Dv AT_FDCWD nor a valid file descriptor open for searching. .It Bq Er ENOTDIR The .Fa path argument is not an absolute path and .Fa fd is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. .It Bq Er ENOTDIR .Dv O_DIRECTORY is specified and the file is not a directory. .It Bq Er ECAPMODE .Dv AT_FDCWD is specified and the process is in capability mode. .It Bq Er ECAPMODE .Fn open was called and the process is in capability mode. .It Bq Er ENOTCAPABLE .Fa path is an absolute path, or contained a ".." component leading to a directory outside of the directory hierarchy specified by .Fa fd , and the process is in capability mode. .It Bq Er ENOTCAPABLE The .Dv O_BENEATH flag was provided, and the absolute .Fa path does not have its tail fully contained under the topping directory, or the relative .Fa path escapes it. .El .Sh SEE ALSO .Xr chmod 2 , .Xr close 2 , .Xr dup 2 , .Xr fexecve 2 , .Xr fhopen 2 , .Xr getdtablesize 2 , .Xr getfh 2 , .Xr lgetfh 2 , .Xr lseek 2 , .Xr read 2 , .Xr umask 2 , .Xr write 2 , .Xr fopen 3 , .Xr capsicum 4 .Sh STANDARDS These functions are specified by .St -p1003.1-2008 . .Fx sets .Va errno to .Er EMLINK instead of .Er ELOOP as specified by .Tn POSIX when .Dv O_NOFOLLOW is set in flags and the final component of pathname is a symbolic link to distinguish it from the case of too many symbolic link traversals in one of its non-final components. .Sh HISTORY The .Fn open function appeared in .At v1 . The .Fn openat function was introduced in .Fx 8.0 . .Sh BUGS The Open Group Extended API Set 2 specification requires that the test for whether .Fa fd is searchable is based on whether .Fa fd is open for searching, not whether the underlying directory currently permits searches. The present implementation of the .Fa openat checks the current permissions of directory instead. 
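The capability-mode restrictions summarized in the DESCRIPTION and ERRORS sections amount to the pattern sketched below: directory descriptors are obtained before cap_enter(), and all later lookups are strictly relative to them. The paths are chosen only for illustration.

#include <sys/capsicum.h>

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int dfd, fd;

	/* Obtain directory descriptors before entering capability mode. */
	dfd = open("/etc", O_DIRECTORY | O_RDONLY);
	if (dfd == -1)
		err(1, "open /etc");

	if (cap_enter() == -1)
		err(1, "cap_enter");

	/* open(2) itself is rejected in capability mode. */
	if (open("/etc/hosts", O_RDONLY) == -1 && errno == ECAPMODE)
		warnx("open(2) is not permitted in capability mode");

	/* A strictly relative lookup below dfd is permitted. */
	fd = openat(dfd, "hosts", O_RDONLY);
	if (fd == -1)
		err(1, "openat");

	close(fd);
	close(dfd);
	return (0);
}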
Index: projects/clang900-import/lib/libpmc/libpmc.c =================================================================== --- projects/clang900-import/lib/libpmc/libpmc.c (revision 352536) +++ projects/clang900-import/lib/libpmc/libpmc.c (revision 352537) @@ -1,1875 +1,1894 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2008 Joseph Koshy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libpmcinternal.h" /* Function prototypes */ #if defined(__amd64__) || defined(__i386__) static int k8_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif #if defined(__amd64__) || defined(__i386__) static int tsc_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif #if defined(__arm__) #if defined(__XSCALE__) static int xscale_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif static int armv7_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif #if defined(__aarch64__) static int arm64_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif #if defined(__mips__) static int mips_allocate_pmc(enum pmc_event _pe, char* ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif /* __mips__ */ static int soft_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); #if defined(__powerpc__) static int powerpc_allocate_pmc(enum pmc_event _pe, char* ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif /* __powerpc__ */ #define PMC_CALL(cmd, params) \ syscall(pmc_syscall, PMC_OP_##cmd, (params)) /* * Event aliases provide a way for the user to ask for generic events * like "cache-misses", or "instructions-retired". These aliases are * mapped to the appropriate canonical event descriptions using a * lookup table. 
*/ struct pmc_event_alias { const char *pm_alias; const char *pm_spec; }; static const struct pmc_event_alias *pmc_mdep_event_aliases; /* * The pmc_event_descr structure maps symbolic names known to the user * to integer codes used by the PMC KLD. */ struct pmc_event_descr { const char *pm_ev_name; enum pmc_event pm_ev_code; }; /* * The pmc_class_descr structure maps class name prefixes for * event names to event tables and other PMC class data. */ struct pmc_class_descr { const char *pm_evc_name; size_t pm_evc_name_size; enum pmc_class pm_evc_class; const struct pmc_event_descr *pm_evc_event_table; size_t pm_evc_event_table_size; int (*pm_evc_allocate_pmc)(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pa); }; #define PMC_TABLE_SIZE(N) (sizeof(N)/sizeof(N[0])) #define PMC_EVENT_TABLE_SIZE(N) PMC_TABLE_SIZE(N##_event_table) #undef __PMC_EV #define __PMC_EV(C,N) { #N, PMC_EV_ ## C ## _ ## N }, /* * PMC_CLASSDEP_TABLE(NAME, CLASS) * * Define a table mapping event names and aliases to HWPMC event IDs. */ #define PMC_CLASSDEP_TABLE(N, C) \ static const struct pmc_event_descr N##_event_table[] = \ { \ __PMC_EV_##C() \ } PMC_CLASSDEP_TABLE(iaf, IAF); PMC_CLASSDEP_TABLE(k8, K8); PMC_CLASSDEP_TABLE(xscale, XSCALE); PMC_CLASSDEP_TABLE(armv7, ARMV7); PMC_CLASSDEP_TABLE(armv8, ARMV8); +PMC_CLASSDEP_TABLE(beri, BERI); PMC_CLASSDEP_TABLE(mips24k, MIPS24K); PMC_CLASSDEP_TABLE(mips74k, MIPS74K); PMC_CLASSDEP_TABLE(octeon, OCTEON); PMC_CLASSDEP_TABLE(ppc7450, PPC7450); PMC_CLASSDEP_TABLE(ppc970, PPC970); PMC_CLASSDEP_TABLE(e500, E500); static struct pmc_event_descr soft_event_table[PMC_EV_DYN_COUNT]; #undef __PMC_EV_ALIAS #define __PMC_EV_ALIAS(N,CODE) { N, PMC_EV_##CODE }, static const struct pmc_event_descr cortex_a8_event_table[] = { __PMC_EV_ALIAS_ARMV7_CORTEX_A8() }; static const struct pmc_event_descr cortex_a9_event_table[] = { __PMC_EV_ALIAS_ARMV7_CORTEX_A9() }; static const struct pmc_event_descr cortex_a53_event_table[] = { __PMC_EV_ALIAS_ARMV8_CORTEX_A53() }; static const struct pmc_event_descr cortex_a57_event_table[] = { __PMC_EV_ALIAS_ARMV8_CORTEX_A57() }; /* * PMC_MDEP_TABLE(NAME, PRIMARYCLASS, ADDITIONAL_CLASSES...) * * Map a CPU to the PMC classes it supports. */ #define PMC_MDEP_TABLE(N,C,...) 
\ static const enum pmc_class N##_pmc_classes[] = { \ PMC_CLASS_##C, __VA_ARGS__ \ } PMC_MDEP_TABLE(k8, K8, PMC_CLASS_SOFT, PMC_CLASS_TSC); PMC_MDEP_TABLE(xscale, XSCALE, PMC_CLASS_SOFT, PMC_CLASS_XSCALE); +PMC_MDEP_TABLE(beri, BERI, PMC_CLASS_SOFT, PMC_CLASS_BERI); PMC_MDEP_TABLE(cortex_a8, ARMV7, PMC_CLASS_SOFT, PMC_CLASS_ARMV7); PMC_MDEP_TABLE(cortex_a9, ARMV7, PMC_CLASS_SOFT, PMC_CLASS_ARMV7); PMC_MDEP_TABLE(cortex_a53, ARMV8, PMC_CLASS_SOFT, PMC_CLASS_ARMV8); PMC_MDEP_TABLE(cortex_a57, ARMV8, PMC_CLASS_SOFT, PMC_CLASS_ARMV8); PMC_MDEP_TABLE(mips24k, MIPS24K, PMC_CLASS_SOFT, PMC_CLASS_MIPS24K); PMC_MDEP_TABLE(mips74k, MIPS74K, PMC_CLASS_SOFT, PMC_CLASS_MIPS74K); PMC_MDEP_TABLE(octeon, OCTEON, PMC_CLASS_SOFT, PMC_CLASS_OCTEON); PMC_MDEP_TABLE(ppc7450, PPC7450, PMC_CLASS_SOFT, PMC_CLASS_PPC7450, PMC_CLASS_TSC); PMC_MDEP_TABLE(ppc970, PPC970, PMC_CLASS_SOFT, PMC_CLASS_PPC970, PMC_CLASS_TSC); PMC_MDEP_TABLE(e500, E500, PMC_CLASS_SOFT, PMC_CLASS_E500, PMC_CLASS_TSC); PMC_MDEP_TABLE(generic, SOFT, PMC_CLASS_SOFT); static const struct pmc_event_descr tsc_event_table[] = { __PMC_EV_TSC() }; #undef PMC_CLASS_TABLE_DESC #define PMC_CLASS_TABLE_DESC(NAME, CLASS, EVENTS, ALLOCATOR) \ static const struct pmc_class_descr NAME##_class_table_descr = \ { \ .pm_evc_name = #CLASS "-", \ .pm_evc_name_size = sizeof(#CLASS "-") - 1, \ .pm_evc_class = PMC_CLASS_##CLASS , \ .pm_evc_event_table = EVENTS##_event_table , \ .pm_evc_event_table_size = \ PMC_EVENT_TABLE_SIZE(EVENTS), \ .pm_evc_allocate_pmc = ALLOCATOR##_allocate_pmc \ } #if defined(__i386__) || defined(__amd64__) PMC_CLASS_TABLE_DESC(k8, K8, k8, k8); #endif #if defined(__i386__) || defined(__amd64__) PMC_CLASS_TABLE_DESC(tsc, TSC, tsc, tsc); #endif #if defined(__arm__) #if defined(__XSCALE__) PMC_CLASS_TABLE_DESC(xscale, XSCALE, xscale, xscale); #endif PMC_CLASS_TABLE_DESC(cortex_a8, ARMV7, cortex_a8, armv7); PMC_CLASS_TABLE_DESC(cortex_a9, ARMV7, cortex_a9, armv7); #endif #if defined(__aarch64__) PMC_CLASS_TABLE_DESC(cortex_a53, ARMV8, cortex_a53, arm64); PMC_CLASS_TABLE_DESC(cortex_a57, ARMV8, cortex_a57, arm64); #endif #if defined(__mips__) +PMC_CLASS_TABLE_DESC(beri, BERI, beri, mips); PMC_CLASS_TABLE_DESC(mips24k, MIPS24K, mips24k, mips); PMC_CLASS_TABLE_DESC(mips74k, MIPS74K, mips74k, mips); PMC_CLASS_TABLE_DESC(octeon, OCTEON, octeon, mips); #endif /* __mips__ */ #if defined(__powerpc__) PMC_CLASS_TABLE_DESC(ppc7450, PPC7450, ppc7450, powerpc); PMC_CLASS_TABLE_DESC(ppc970, PPC970, ppc970, powerpc); PMC_CLASS_TABLE_DESC(e500, E500, e500, powerpc); #endif static struct pmc_class_descr soft_class_table_descr = { .pm_evc_name = "SOFT-", .pm_evc_name_size = sizeof("SOFT-") - 1, .pm_evc_class = PMC_CLASS_SOFT, .pm_evc_event_table = NULL, .pm_evc_event_table_size = 0, .pm_evc_allocate_pmc = soft_allocate_pmc }; #undef PMC_CLASS_TABLE_DESC static const struct pmc_class_descr **pmc_class_table; #define PMC_CLASS_TABLE_SIZE cpu_info.pm_nclass static const enum pmc_class *pmc_mdep_class_list; static size_t pmc_mdep_class_list_size; /* * Mapping tables, mapping enumeration values to human readable * strings. 
*/ static const char * pmc_capability_names[] = { #undef __PMC_CAP #define __PMC_CAP(N,V,D) #N , __PMC_CAPS() }; struct pmc_class_map { enum pmc_class pm_class; const char *pm_name; }; static const struct pmc_class_map pmc_class_names[] = { #undef __PMC_CLASS #define __PMC_CLASS(S,V,D) { .pm_class = PMC_CLASS_##S, .pm_name = #S } , __PMC_CLASSES() }; struct pmc_cputype_map { enum pmc_cputype pm_cputype; const char *pm_name; }; static const struct pmc_cputype_map pmc_cputype_names[] = { #undef __PMC_CPU #define __PMC_CPU(S, V, D) { .pm_cputype = PMC_CPU_##S, .pm_name = #S } , __PMC_CPUS() }; static const char * pmc_disposition_names[] = { #undef __PMC_DISP #define __PMC_DISP(D) #D , __PMC_DISPOSITIONS() }; static const char * pmc_mode_names[] = { #undef __PMC_MODE #define __PMC_MODE(M,N) #M , __PMC_MODES() }; static const char * pmc_state_names[] = { #undef __PMC_STATE #define __PMC_STATE(S) #S , __PMC_STATES() }; /* * Filled in by pmc_init(). */ static int pmc_syscall = -1; static struct pmc_cpuinfo cpu_info; static struct pmc_op_getdyneventinfo soft_event_info; /* Event masks for events */ struct pmc_masks { const char *pm_name; const uint64_t pm_value; }; #define PMCMASK(N,V) { .pm_name = #N, .pm_value = (V) } #define NULLMASK { .pm_name = NULL } #if defined(__amd64__) || defined(__i386__) static int pmc_parse_mask(const struct pmc_masks *pmask, char *p, uint64_t *evmask) { const struct pmc_masks *pm; char *q, *r; int c; if (pmask == NULL) /* no mask keywords */ return (-1); q = strchr(p, '='); /* skip '=' */ if (*++q == '\0') /* no more data */ return (-1); c = 0; /* count of mask keywords seen */ while ((r = strsep(&q, "+")) != NULL) { for (pm = pmask; pm->pm_name && strcasecmp(r, pm->pm_name); pm++) ; if (pm->pm_name == NULL) /* not found */ return (-1); *evmask |= pm->pm_value; c++; } return (c); } #endif #define KWMATCH(p,kw) (strcasecmp((p), (kw)) == 0) #define KWPREFIXMATCH(p,kw) (strncasecmp((p), (kw), sizeof((kw)) - 1) == 0) #define EV_ALIAS(N,S) { .pm_alias = N, .pm_spec = S } #if defined(__amd64__) || defined(__i386__) /* * AMD K8 PMCs. 
* */ static struct pmc_event_alias k8_aliases[] = { EV_ALIAS("branches", "k8-fr-retired-taken-branches"), EV_ALIAS("branch-mispredicts", "k8-fr-retired-taken-branches-mispredicted"), EV_ALIAS("cycles", "tsc"), EV_ALIAS("dc-misses", "k8-dc-miss"), EV_ALIAS("ic-misses", "k8-ic-miss"), EV_ALIAS("instructions", "k8-fr-retired-x86-instructions"), EV_ALIAS("interrupts", "k8-fr-taken-hardware-interrupts"), EV_ALIAS("unhalted-cycles", "k8-bu-cpu-clk-unhalted"), EV_ALIAS(NULL, NULL) }; #define __K8MASK(N,V) PMCMASK(N,(1 << (V))) /* * Parsing tables */ /* fp dispatched fpu ops */ static const struct pmc_masks k8_mask_fdfo[] = { __K8MASK(add-pipe-excluding-junk-ops, 0), __K8MASK(multiply-pipe-excluding-junk-ops, 1), __K8MASK(store-pipe-excluding-junk-ops, 2), __K8MASK(add-pipe-junk-ops, 3), __K8MASK(multiply-pipe-junk-ops, 4), __K8MASK(store-pipe-junk-ops, 5), NULLMASK }; /* ls segment register loads */ static const struct pmc_masks k8_mask_lsrl[] = { __K8MASK(es, 0), __K8MASK(cs, 1), __K8MASK(ss, 2), __K8MASK(ds, 3), __K8MASK(fs, 4), __K8MASK(gs, 5), __K8MASK(hs, 6), NULLMASK }; /* ls locked operation */ static const struct pmc_masks k8_mask_llo[] = { __K8MASK(locked-instructions, 0), __K8MASK(cycles-in-request, 1), __K8MASK(cycles-to-complete, 2), NULLMASK }; /* dc refill from {l2,system} and dc copyback */ static const struct pmc_masks k8_mask_dc[] = { __K8MASK(invalid, 0), __K8MASK(shared, 1), __K8MASK(exclusive, 2), __K8MASK(owner, 3), __K8MASK(modified, 4), NULLMASK }; /* dc one bit ecc error */ static const struct pmc_masks k8_mask_dobee[] = { __K8MASK(scrubber, 0), __K8MASK(piggyback, 1), NULLMASK }; /* dc dispatched prefetch instructions */ static const struct pmc_masks k8_mask_ddpi[] = { __K8MASK(load, 0), __K8MASK(store, 1), __K8MASK(nta, 2), NULLMASK }; /* dc dcache accesses by locks */ static const struct pmc_masks k8_mask_dabl[] = { __K8MASK(accesses, 0), __K8MASK(misses, 1), NULLMASK }; /* bu internal l2 request */ static const struct pmc_masks k8_mask_bilr[] = { __K8MASK(ic-fill, 0), __K8MASK(dc-fill, 1), __K8MASK(tlb-reload, 2), __K8MASK(tag-snoop, 3), __K8MASK(cancelled, 4), NULLMASK }; /* bu fill request l2 miss */ static const struct pmc_masks k8_mask_bfrlm[] = { __K8MASK(ic-fill, 0), __K8MASK(dc-fill, 1), __K8MASK(tlb-reload, 2), NULLMASK }; /* bu fill into l2 */ static const struct pmc_masks k8_mask_bfil[] = { __K8MASK(dirty-l2-victim, 0), __K8MASK(victim-from-l2, 1), NULLMASK }; /* fr retired fpu instructions */ static const struct pmc_masks k8_mask_frfi[] = { __K8MASK(x87, 0), __K8MASK(mmx-3dnow, 1), __K8MASK(packed-sse-sse2, 2), __K8MASK(scalar-sse-sse2, 3), NULLMASK }; /* fr retired fastpath double op instructions */ static const struct pmc_masks k8_mask_frfdoi[] = { __K8MASK(low-op-pos-0, 0), __K8MASK(low-op-pos-1, 1), __K8MASK(low-op-pos-2, 2), NULLMASK }; /* fr fpu exceptions */ static const struct pmc_masks k8_mask_ffe[] = { __K8MASK(x87-reclass-microfaults, 0), __K8MASK(sse-retype-microfaults, 1), __K8MASK(sse-reclass-microfaults, 2), __K8MASK(sse-and-x87-microtraps, 3), NULLMASK }; /* nb memory controller page access event */ static const struct pmc_masks k8_mask_nmcpae[] = { __K8MASK(page-hit, 0), __K8MASK(page-miss, 1), __K8MASK(page-conflict, 2), NULLMASK }; /* nb memory controller turnaround */ static const struct pmc_masks k8_mask_nmct[] = { __K8MASK(dimm-turnaround, 0), __K8MASK(read-to-write-turnaround, 1), __K8MASK(write-to-read-turnaround, 2), NULLMASK }; /* nb memory controller bypass saturation */ static const struct pmc_masks k8_mask_nmcbs[] = { 
__K8MASK(memory-controller-hi-pri-bypass, 0), __K8MASK(memory-controller-lo-pri-bypass, 1), __K8MASK(dram-controller-interface-bypass, 2), __K8MASK(dram-controller-queue-bypass, 3), NULLMASK }; /* nb sized commands */ static const struct pmc_masks k8_mask_nsc[] = { __K8MASK(nonpostwrszbyte, 0), __K8MASK(nonpostwrszdword, 1), __K8MASK(postwrszbyte, 2), __K8MASK(postwrszdword, 3), __K8MASK(rdszbyte, 4), __K8MASK(rdszdword, 5), __K8MASK(rdmodwr, 6), NULLMASK }; /* nb probe result */ static const struct pmc_masks k8_mask_npr[] = { __K8MASK(probe-miss, 0), __K8MASK(probe-hit, 1), __K8MASK(probe-hit-dirty-no-memory-cancel, 2), __K8MASK(probe-hit-dirty-with-memory-cancel, 3), NULLMASK }; /* nb hypertransport bus bandwidth */ static const struct pmc_masks k8_mask_nhbb[] = { /* HT bus bandwidth */ __K8MASK(command, 0), __K8MASK(data, 1), __K8MASK(buffer-release, 2), __K8MASK(nop, 3), NULLMASK }; #undef __K8MASK #define K8_KW_COUNT "count" #define K8_KW_EDGE "edge" #define K8_KW_INV "inv" #define K8_KW_MASK "mask" #define K8_KW_OS "os" #define K8_KW_USR "usr" static int k8_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { char *e, *p, *q; int n; uint32_t count; uint64_t evmask; const struct pmc_masks *pm, *pmask; pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); pmc_config->pm_md.pm_amd.pm_amd_config = 0; pmask = NULL; evmask = 0; #define __K8SETMASK(M) pmask = k8_mask_##M /* setup parsing tables */ switch (pe) { case PMC_EV_K8_FP_DISPATCHED_FPU_OPS: __K8SETMASK(fdfo); break; case PMC_EV_K8_LS_SEGMENT_REGISTER_LOAD: __K8SETMASK(lsrl); break; case PMC_EV_K8_LS_LOCKED_OPERATION: __K8SETMASK(llo); break; case PMC_EV_K8_DC_REFILL_FROM_L2: case PMC_EV_K8_DC_REFILL_FROM_SYSTEM: case PMC_EV_K8_DC_COPYBACK: __K8SETMASK(dc); break; case PMC_EV_K8_DC_ONE_BIT_ECC_ERROR: __K8SETMASK(dobee); break; case PMC_EV_K8_DC_DISPATCHED_PREFETCH_INSTRUCTIONS: __K8SETMASK(ddpi); break; case PMC_EV_K8_DC_DCACHE_ACCESSES_BY_LOCKS: __K8SETMASK(dabl); break; case PMC_EV_K8_BU_INTERNAL_L2_REQUEST: __K8SETMASK(bilr); break; case PMC_EV_K8_BU_FILL_REQUEST_L2_MISS: __K8SETMASK(bfrlm); break; case PMC_EV_K8_BU_FILL_INTO_L2: __K8SETMASK(bfil); break; case PMC_EV_K8_FR_RETIRED_FPU_INSTRUCTIONS: __K8SETMASK(frfi); break; case PMC_EV_K8_FR_RETIRED_FASTPATH_DOUBLE_OP_INSTRUCTIONS: __K8SETMASK(frfdoi); break; case PMC_EV_K8_FR_FPU_EXCEPTIONS: __K8SETMASK(ffe); break; case PMC_EV_K8_NB_MEMORY_CONTROLLER_PAGE_ACCESS_EVENT: __K8SETMASK(nmcpae); break; case PMC_EV_K8_NB_MEMORY_CONTROLLER_TURNAROUND: __K8SETMASK(nmct); break; case PMC_EV_K8_NB_MEMORY_CONTROLLER_BYPASS_SATURATION: __K8SETMASK(nmcbs); break; case PMC_EV_K8_NB_SIZED_COMMANDS: __K8SETMASK(nsc); break; case PMC_EV_K8_NB_PROBE_RESULT: __K8SETMASK(npr); break; case PMC_EV_K8_NB_HT_BUS0_BANDWIDTH: case PMC_EV_K8_NB_HT_BUS1_BANDWIDTH: case PMC_EV_K8_NB_HT_BUS2_BANDWIDTH: __K8SETMASK(nhbb); break; default: break; /* no options defined */ } while ((p = strsep(&ctrspec, ",")) != NULL) { if (KWPREFIXMATCH(p, K8_KW_COUNT "=")) { q = strchr(p, '='); if (*++q == '\0') /* skip '=' */ return (-1); count = strtol(q, &e, 0); if (e == q || *e != '\0') return (-1); pmc_config->pm_caps |= PMC_CAP_THRESHOLD; pmc_config->pm_md.pm_amd.pm_amd_config |= AMD_PMC_TO_COUNTER(count); } else if (KWMATCH(p, K8_KW_EDGE)) { pmc_config->pm_caps |= PMC_CAP_EDGE; } else if (KWMATCH(p, K8_KW_INV)) { pmc_config->pm_caps |= PMC_CAP_INVERT; } else if (KWPREFIXMATCH(p, K8_KW_MASK "=")) { if ((n = pmc_parse_mask(pmask, p, &evmask)) < 0) return (-1); pmc_config->pm_caps |= 
PMC_CAP_QUALIFIER; } else if (KWMATCH(p, K8_KW_OS)) { pmc_config->pm_caps |= PMC_CAP_SYSTEM; } else if (KWMATCH(p, K8_KW_USR)) { pmc_config->pm_caps |= PMC_CAP_USER; } else return (-1); } /* other post processing */ switch (pe) { case PMC_EV_K8_FP_DISPATCHED_FPU_OPS: case PMC_EV_K8_FP_CYCLES_WITH_NO_FPU_OPS_RETIRED: case PMC_EV_K8_FP_DISPATCHED_FPU_FAST_FLAG_OPS: case PMC_EV_K8_FR_RETIRED_FASTPATH_DOUBLE_OP_INSTRUCTIONS: case PMC_EV_K8_FR_RETIRED_FPU_INSTRUCTIONS: case PMC_EV_K8_FR_FPU_EXCEPTIONS: /* XXX only available in rev B and later */ break; case PMC_EV_K8_DC_DCACHE_ACCESSES_BY_LOCKS: /* XXX only available in rev C and later */ break; case PMC_EV_K8_LS_LOCKED_OPERATION: /* XXX CPU Rev A,B evmask is to be zero */ if (evmask & (evmask - 1)) /* > 1 bit set */ return (-1); if (evmask == 0) { evmask = 0x01; /* Rev C and later: #instrs */ pmc_config->pm_caps |= PMC_CAP_QUALIFIER; } break; default: if (evmask == 0 && pmask != NULL) { for (pm = pmask; pm->pm_name; pm++) evmask |= pm->pm_value; pmc_config->pm_caps |= PMC_CAP_QUALIFIER; } } if (pmc_config->pm_caps & PMC_CAP_QUALIFIER) pmc_config->pm_md.pm_amd.pm_amd_config = AMD_PMC_TO_UNITMASK(evmask); return (0); } #endif #if defined(__i386__) || defined(__amd64__) static int tsc_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { if (pe != PMC_EV_TSC_TSC) return (-1); /* TSC events must be unqualified. */ if (ctrspec && *ctrspec != '\0') return (-1); pmc_config->pm_md.pm_amd.pm_amd_config = 0; pmc_config->pm_caps |= PMC_CAP_READ; return (0); } #endif static struct pmc_event_alias generic_aliases[] = { EV_ALIAS("instructions", "SOFT-CLOCK.HARD"), EV_ALIAS(NULL, NULL) }; static int soft_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { (void)ctrspec; (void)pmc_config; if ((int)pe < PMC_EV_SOFT_FIRST || (int)pe > PMC_EV_SOFT_LAST) return (-1); pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); return (0); } #if defined(__arm__) #if defined(__XSCALE__) static struct pmc_event_alias xscale_aliases[] = { EV_ALIAS("branches", "BRANCH_RETIRED"), EV_ALIAS("branch-mispredicts", "BRANCH_MISPRED"), EV_ALIAS("dc-misses", "DC_MISS"), EV_ALIAS("ic-misses", "IC_MISS"), EV_ALIAS("instructions", "INSTR_RETIRED"), EV_ALIAS(NULL, NULL) }; static int xscale_allocate_pmc(enum pmc_event pe, char *ctrspec __unused, struct pmc_op_pmcallocate *pmc_config __unused) { switch (pe) { default: break; } return (0); } #endif static struct pmc_event_alias cortex_a8_aliases[] = { EV_ALIAS("dc-misses", "L1_DCACHE_REFILL"), EV_ALIAS("ic-misses", "L1_ICACHE_REFILL"), EV_ALIAS("instructions", "INSTR_EXECUTED"), EV_ALIAS(NULL, NULL) }; static struct pmc_event_alias cortex_a9_aliases[] = { EV_ALIAS("dc-misses", "L1_DCACHE_REFILL"), EV_ALIAS("ic-misses", "L1_ICACHE_REFILL"), EV_ALIAS("instructions", "INSTR_EXECUTED"), EV_ALIAS(NULL, NULL) }; static int armv7_allocate_pmc(enum pmc_event pe, char *ctrspec __unused, struct pmc_op_pmcallocate *pmc_config __unused) { switch (pe) { default: break; } return (0); } #endif #if defined(__aarch64__) static struct pmc_event_alias cortex_a53_aliases[] = { EV_ALIAS(NULL, NULL) }; static struct pmc_event_alias cortex_a57_aliases[] = { EV_ALIAS(NULL, NULL) }; static int arm64_allocate_pmc(enum pmc_event pe, char *ctrspec __unused, struct pmc_op_pmcallocate *pmc_config __unused) { switch (pe) { default: break; } return (0); } #endif #if defined(__mips__) +static struct pmc_event_alias beri_aliases[] = { + EV_ALIAS("instructions", "INST"), + EV_ALIAS(NULL, NULL) +}; + 
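/*
 * A minimal usage sketch (illustrative, not part of this file): a libpmc
 * consumer asks for the generic "instructions" alias, and pmc_allocate()
 * resolves it through the per-CPU alias table, e.g. to the BERI "INST"
 * event via beri_aliases above.  PMC_CPU_ANY is used on the assumption of
 * a process-scope counting PMC, and pid 0 denotes the current process.
 *
 *	pmc_id_t pmcid;
 *	pmc_value_t v;
 *
 *	if (pmc_init() < 0)
 *		err(1, "pmc_init");
 *	if (pmc_allocate("instructions", PMC_MODE_TC, 0, PMC_CPU_ANY,
 *	    &pmcid, 0) < 0)
 *		err(1, "pmc_allocate");
 *	if (pmc_attach(pmcid, 0) < 0)
 *		err(1, "pmc_attach");
 *	if (pmc_start(pmcid) < 0)
 *		err(1, "pmc_start");
 *	... run the workload of interest, then read the counter ...
 *	if (pmc_read(pmcid, &v) < 0)
 *		err(1, "pmc_read");
 */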
static struct pmc_event_alias mips24k_aliases[] = { EV_ALIAS("instructions", "INSTR_EXECUTED"), EV_ALIAS("branches", "BRANCH_COMPLETED"), EV_ALIAS("branch-mispredicts", "BRANCH_MISPRED"), EV_ALIAS(NULL, NULL) }; static struct pmc_event_alias mips74k_aliases[] = { EV_ALIAS("instructions", "INSTR_EXECUTED"), EV_ALIAS("branches", "BRANCH_INSNS"), EV_ALIAS("branch-mispredicts", "MISPREDICTED_BRANCH_INSNS"), EV_ALIAS(NULL, NULL) }; static struct pmc_event_alias octeon_aliases[] = { EV_ALIAS("instructions", "RET"), EV_ALIAS("branches", "BR"), EV_ALIAS("branch-mispredicts", "BRMIS"), EV_ALIAS(NULL, NULL) }; #define MIPS_KW_OS "os" #define MIPS_KW_USR "usr" #define MIPS_KW_ANYTHREAD "anythread" static int mips_allocate_pmc(enum pmc_event pe, char *ctrspec __unused, struct pmc_op_pmcallocate *pmc_config __unused) { char *p; (void) pe; pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); while ((p = strsep(&ctrspec, ",")) != NULL) { if (KWMATCH(p, MIPS_KW_OS)) pmc_config->pm_caps |= PMC_CAP_SYSTEM; else if (KWMATCH(p, MIPS_KW_USR)) pmc_config->pm_caps |= PMC_CAP_USER; else if (KWMATCH(p, MIPS_KW_ANYTHREAD)) pmc_config->pm_caps |= (PMC_CAP_USER | PMC_CAP_SYSTEM); else return (-1); } return (0); } #endif /* __mips__ */ #if defined(__powerpc__) static struct pmc_event_alias ppc7450_aliases[] = { EV_ALIAS("instructions", "INSTR_COMPLETED"), EV_ALIAS("branches", "BRANCHES_COMPLETED"), EV_ALIAS("branch-mispredicts", "MISPREDICTED_BRANCHES"), EV_ALIAS(NULL, NULL) }; static struct pmc_event_alias ppc970_aliases[] = { EV_ALIAS("instructions", "INSTR_COMPLETED"), EV_ALIAS("cycles", "CYCLES"), EV_ALIAS(NULL, NULL) }; static struct pmc_event_alias e500_aliases[] = { EV_ALIAS("instructions", "INSTR_COMPLETED"), EV_ALIAS("cycles", "CYCLES"), EV_ALIAS(NULL, NULL) }; #define POWERPC_KW_OS "os" #define POWERPC_KW_USR "usr" #define POWERPC_KW_ANYTHREAD "anythread" static int powerpc_allocate_pmc(enum pmc_event pe, char *ctrspec __unused, struct pmc_op_pmcallocate *pmc_config __unused) { char *p; (void) pe; pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); while ((p = strsep(&ctrspec, ",")) != NULL) { if (KWMATCH(p, POWERPC_KW_OS)) pmc_config->pm_caps |= PMC_CAP_SYSTEM; else if (KWMATCH(p, POWERPC_KW_USR)) pmc_config->pm_caps |= PMC_CAP_USER; else if (KWMATCH(p, POWERPC_KW_ANYTHREAD)) pmc_config->pm_caps |= (PMC_CAP_USER | PMC_CAP_SYSTEM); else return (-1); } return (0); } #endif /* __powerpc__ */ /* * Match an event name `name' with its canonical form. * * Matches are case insensitive and spaces, periods, underscores and * hyphen characters are considered to match each other. * * Returns 1 for a match, 0 otherwise. */ static int pmc_match_event_name(const char *name, const char *canonicalname) { int cc, nc; const unsigned char *c, *n; c = (const unsigned char *) canonicalname; n = (const unsigned char *) name; for (; (nc = *n) && (cc = *c); n++, c++) { if ((nc == ' ' || nc == '_' || nc == '-' || nc == '.') && (cc == ' ' || cc == '_' || cc == '-' || cc == '.')) continue; if (toupper(nc) == toupper(cc)) continue; return (0); } if (*n == '\0' && *c == '\0') return (1); return (0); } /* * Match an event name against all the event named supported by a * PMC class. * * Returns an event descriptor pointer on match or NULL otherwise. 
*/ static const struct pmc_event_descr * pmc_match_event_class(const char *name, const struct pmc_class_descr *pcd) { size_t n; const struct pmc_event_descr *ev; ev = pcd->pm_evc_event_table; for (n = 0; n < pcd->pm_evc_event_table_size; n++, ev++) if (pmc_match_event_name(name, ev->pm_ev_name)) return (ev); return (NULL); } static int pmc_mdep_is_compatible_class(enum pmc_class pc) { size_t n; for (n = 0; n < pmc_mdep_class_list_size; n++) if (pmc_mdep_class_list[n] == pc) return (1); return (0); } /* * API entry points */ int pmc_allocate(const char *ctrspec, enum pmc_mode mode, uint32_t flags, int cpu, pmc_id_t *pmcid, uint64_t count) { size_t n; int retval; char *r, *spec_copy; const char *ctrname; const struct pmc_event_descr *ev; const struct pmc_event_alias *alias; struct pmc_op_pmcallocate pmc_config; const struct pmc_class_descr *pcd; spec_copy = NULL; retval = -1; if (mode != PMC_MODE_SS && mode != PMC_MODE_TS && mode != PMC_MODE_SC && mode != PMC_MODE_TC) { errno = EINVAL; goto out; } bzero(&pmc_config, sizeof(pmc_config)); pmc_config.pm_cpu = cpu; pmc_config.pm_mode = mode; pmc_config.pm_flags = flags; pmc_config.pm_count = count; if (PMC_IS_SAMPLING_MODE(mode)) pmc_config.pm_caps |= PMC_CAP_INTERRUPT; /* * Can we pull this straight from the pmu table? */ r = spec_copy = strdup(ctrspec); ctrname = strsep(&r, ","); if (pmc_pmu_enabled()) { if (pmc_pmu_pmcallocate(ctrname, &pmc_config) == 0) { if (PMC_CALL(PMCALLOCATE, &pmc_config) < 0) { goto out; } retval = 0; *pmcid = pmc_config.pm_pmcid; goto out; } errx(EX_USAGE, "ERROR: pmc_pmu_allocate failed, check for ctrname %s\n", ctrname); } else { free(spec_copy); spec_copy = NULL; } /* replace an event alias with the canonical event specifier */ if (pmc_mdep_event_aliases) for (alias = pmc_mdep_event_aliases; alias->pm_alias; alias++) if (!strcasecmp(ctrspec, alias->pm_alias)) { spec_copy = strdup(alias->pm_spec); break; } if (spec_copy == NULL) spec_copy = strdup(ctrspec); r = spec_copy; ctrname = strsep(&r, ","); /* * If a explicit class prefix was given by the user, restrict the * search for the event to the specified PMC class. */ ev = NULL; for (n = 0; n < PMC_CLASS_TABLE_SIZE; n++) { pcd = pmc_class_table[n]; if (pcd && pmc_mdep_is_compatible_class(pcd->pm_evc_class) && strncasecmp(ctrname, pcd->pm_evc_name, pcd->pm_evc_name_size) == 0) { if ((ev = pmc_match_event_class(ctrname + pcd->pm_evc_name_size, pcd)) == NULL) { errno = EINVAL; goto out; } break; } } /* * Otherwise, search for this event in all compatible PMC * classes. 
*/ for (n = 0; ev == NULL && n < PMC_CLASS_TABLE_SIZE; n++) { pcd = pmc_class_table[n]; if (pcd && pmc_mdep_is_compatible_class(pcd->pm_evc_class)) ev = pmc_match_event_class(ctrname, pcd); } if (ev == NULL) { errno = EINVAL; goto out; } pmc_config.pm_ev = ev->pm_ev_code; pmc_config.pm_class = pcd->pm_evc_class; if (pcd->pm_evc_allocate_pmc(ev->pm_ev_code, r, &pmc_config) < 0) { errno = EINVAL; goto out; } if (PMC_CALL(PMCALLOCATE, &pmc_config) < 0) goto out; *pmcid = pmc_config.pm_pmcid; retval = 0; out: if (spec_copy) free(spec_copy); return (retval); } int pmc_attach(pmc_id_t pmc, pid_t pid) { struct pmc_op_pmcattach pmc_attach_args; pmc_attach_args.pm_pmc = pmc; pmc_attach_args.pm_pid = pid; return (PMC_CALL(PMCATTACH, &pmc_attach_args)); } int pmc_capabilities(pmc_id_t pmcid, uint32_t *caps) { unsigned int i; enum pmc_class cl; cl = PMC_ID_TO_CLASS(pmcid); for (i = 0; i < cpu_info.pm_nclass; i++) if (cpu_info.pm_classes[i].pm_class == cl) { *caps = cpu_info.pm_classes[i].pm_caps; return (0); } errno = EINVAL; return (-1); } int pmc_configure_logfile(int fd) { struct pmc_op_configurelog cla; cla.pm_logfd = fd; if (PMC_CALL(CONFIGURELOG, &cla) < 0) return (-1); return (0); } int pmc_cpuinfo(const struct pmc_cpuinfo **pci) { if (pmc_syscall == -1) { errno = ENXIO; return (-1); } *pci = &cpu_info; return (0); } int pmc_detach(pmc_id_t pmc, pid_t pid) { struct pmc_op_pmcattach pmc_detach_args; pmc_detach_args.pm_pmc = pmc; pmc_detach_args.pm_pid = pid; return (PMC_CALL(PMCDETACH, &pmc_detach_args)); } int pmc_disable(int cpu, int pmc) { struct pmc_op_pmcadmin ssa; ssa.pm_cpu = cpu; ssa.pm_pmc = pmc; ssa.pm_state = PMC_STATE_DISABLED; return (PMC_CALL(PMCADMIN, &ssa)); } int pmc_enable(int cpu, int pmc) { struct pmc_op_pmcadmin ssa; ssa.pm_cpu = cpu; ssa.pm_pmc = pmc; ssa.pm_state = PMC_STATE_FREE; return (PMC_CALL(PMCADMIN, &ssa)); } /* * Return a list of events known to a given PMC class. 'cl' is the * PMC class identifier, 'eventnames' is the returned list of 'const * char *' pointers pointing to the names of the events. 'nevents' is * the number of event name pointers returned. * * The space for 'eventnames' is allocated using malloc(3). The caller * is responsible for freeing this space when done. 
*/ int pmc_event_names_of_class(enum pmc_class cl, const char ***eventnames, int *nevents) { int count; const char **names; const struct pmc_event_descr *ev; switch (cl) { case PMC_CLASS_IAF: ev = iaf_event_table; count = PMC_EVENT_TABLE_SIZE(iaf); break; case PMC_CLASS_TSC: ev = tsc_event_table; count = PMC_EVENT_TABLE_SIZE(tsc); break; case PMC_CLASS_K8: ev = k8_event_table; count = PMC_EVENT_TABLE_SIZE(k8); break; case PMC_CLASS_XSCALE: ev = xscale_event_table; count = PMC_EVENT_TABLE_SIZE(xscale); break; case PMC_CLASS_ARMV7: switch (cpu_info.pm_cputype) { default: case PMC_CPU_ARMV7_CORTEX_A8: ev = cortex_a8_event_table; count = PMC_EVENT_TABLE_SIZE(cortex_a8); break; case PMC_CPU_ARMV7_CORTEX_A9: ev = cortex_a9_event_table; count = PMC_EVENT_TABLE_SIZE(cortex_a9); break; } break; case PMC_CLASS_ARMV8: switch (cpu_info.pm_cputype) { default: case PMC_CPU_ARMV8_CORTEX_A53: ev = cortex_a53_event_table; count = PMC_EVENT_TABLE_SIZE(cortex_a53); break; case PMC_CPU_ARMV8_CORTEX_A57: ev = cortex_a57_event_table; count = PMC_EVENT_TABLE_SIZE(cortex_a57); break; } break; + case PMC_CLASS_BERI: + ev = beri_event_table; + count = PMC_EVENT_TABLE_SIZE(beri); + break; case PMC_CLASS_MIPS24K: ev = mips24k_event_table; count = PMC_EVENT_TABLE_SIZE(mips24k); break; case PMC_CLASS_MIPS74K: ev = mips74k_event_table; count = PMC_EVENT_TABLE_SIZE(mips74k); break; case PMC_CLASS_OCTEON: ev = octeon_event_table; count = PMC_EVENT_TABLE_SIZE(octeon); break; case PMC_CLASS_PPC7450: ev = ppc7450_event_table; count = PMC_EVENT_TABLE_SIZE(ppc7450); break; case PMC_CLASS_PPC970: ev = ppc970_event_table; count = PMC_EVENT_TABLE_SIZE(ppc970); break; case PMC_CLASS_E500: ev = e500_event_table; count = PMC_EVENT_TABLE_SIZE(e500); break; case PMC_CLASS_SOFT: ev = soft_event_table; count = soft_event_info.pm_nevent; break; default: errno = EINVAL; return (-1); } if ((names = malloc(count * sizeof(const char *))) == NULL) return (-1); *eventnames = names; *nevents = count; for (;count--; ev++, names++) *names = ev->pm_ev_name; return (0); } int pmc_flush_logfile(void) { return (PMC_CALL(FLUSHLOG,0)); } int pmc_close_logfile(void) { return (PMC_CALL(CLOSELOG,0)); } int pmc_get_driver_stats(struct pmc_driverstats *ds) { struct pmc_op_getdriverstats gms; if (PMC_CALL(GETDRIVERSTATS, &gms) < 0) return (-1); /* copy out fields in the current userland<->library interface */ ds->pm_intr_ignored = gms.pm_intr_ignored; ds->pm_intr_processed = gms.pm_intr_processed; ds->pm_intr_bufferfull = gms.pm_intr_bufferfull; ds->pm_syscalls = gms.pm_syscalls; ds->pm_syscall_errors = gms.pm_syscall_errors; ds->pm_buffer_requests = gms.pm_buffer_requests; ds->pm_buffer_requests_failed = gms.pm_buffer_requests_failed; ds->pm_log_sweeps = gms.pm_log_sweeps; return (0); } int pmc_get_msr(pmc_id_t pmc, uint32_t *msr) { struct pmc_op_getmsr gm; gm.pm_pmcid = pmc; if (PMC_CALL(PMCGETMSR, &gm) < 0) return (-1); *msr = gm.pm_msr; return (0); } int pmc_init(void) { int error, pmc_mod_id; unsigned int n; uint32_t abi_version; struct module_stat pmc_modstat; struct pmc_op_getcpuinfo op_cpu_info; #if defined(__amd64__) || defined(__i386__) int cpu_has_iaf_counters; unsigned int t; #endif if (pmc_syscall != -1) /* already inited */ return (0); /* retrieve the system call number from the KLD */ if ((pmc_mod_id = modfind(PMC_MODULE_NAME)) < 0) return (-1); pmc_modstat.version = sizeof(struct module_stat); if ((error = modstat(pmc_mod_id, &pmc_modstat)) < 0) return (-1); pmc_syscall = pmc_modstat.data.intval; /* check the kernel module's ABI against our 
compiled-in version */ abi_version = PMC_VERSION; if (PMC_CALL(GETMODULEVERSION, &abi_version) < 0) return (pmc_syscall = -1); /* ignore patch & minor numbers for the comparison */ if ((abi_version & 0xFF000000) != (PMC_VERSION & 0xFF000000)) { errno = EPROGMISMATCH; return (pmc_syscall = -1); } bzero(&op_cpu_info, sizeof(op_cpu_info)); if (PMC_CALL(GETCPUINFO, &op_cpu_info) < 0) return (pmc_syscall = -1); cpu_info.pm_cputype = op_cpu_info.pm_cputype; cpu_info.pm_ncpu = op_cpu_info.pm_ncpu; cpu_info.pm_npmc = op_cpu_info.pm_npmc; cpu_info.pm_nclass = op_cpu_info.pm_nclass; for (n = 0; n < op_cpu_info.pm_nclass; n++) memcpy(&cpu_info.pm_classes[n], &op_cpu_info.pm_classes[n], sizeof(cpu_info.pm_classes[n])); pmc_class_table = malloc(PMC_CLASS_TABLE_SIZE * sizeof(struct pmc_class_descr *)); if (pmc_class_table == NULL) return (-1); for (n = 0; n < PMC_CLASS_TABLE_SIZE; n++) pmc_class_table[n] = NULL; /* * Get soft events list. */ soft_event_info.pm_class = PMC_CLASS_SOFT; if (PMC_CALL(GETDYNEVENTINFO, &soft_event_info) < 0) return (pmc_syscall = -1); /* Map soft events to static list. */ for (n = 0; n < soft_event_info.pm_nevent; n++) { soft_event_table[n].pm_ev_name = soft_event_info.pm_events[n].pm_ev_name; soft_event_table[n].pm_ev_code = soft_event_info.pm_events[n].pm_ev_code; } soft_class_table_descr.pm_evc_event_table_size = \ soft_event_info.pm_nevent; soft_class_table_descr.pm_evc_event_table = \ soft_event_table; /* * Fill in the class table. */ n = 0; /* Fill soft events information. */ pmc_class_table[n++] = &soft_class_table_descr; #if defined(__amd64__) || defined(__i386__) if (cpu_info.pm_cputype != PMC_CPU_GENERIC) pmc_class_table[n++] = &tsc_class_table_descr; /* * Check if this CPU has fixed function counters. */ cpu_has_iaf_counters = 0; for (t = 0; t < cpu_info.pm_nclass; t++) if (cpu_info.pm_classes[t].pm_class == PMC_CLASS_IAF && cpu_info.pm_classes[t].pm_num > 0) cpu_has_iaf_counters = 1; #endif #define PMC_MDEP_INIT(C) do { \ pmc_mdep_event_aliases = C##_aliases; \ pmc_mdep_class_list = C##_pmc_classes; \ pmc_mdep_class_list_size = \ PMC_TABLE_SIZE(C##_pmc_classes); \ } while (0) #define PMC_MDEP_INIT_INTEL_V2(C) do { \ PMC_MDEP_INIT(C); \ pmc_class_table[n++] = &iaf_class_table_descr; \ if (!cpu_has_iaf_counters) \ pmc_mdep_event_aliases = \ C##_aliases_without_iaf; \ pmc_class_table[n] = &C##_class_table_descr; \ } while (0) /* Configure the event name parser. 
*/ switch (cpu_info.pm_cputype) { #if defined(__amd64__) || defined(__i386__) case PMC_CPU_AMD_K8: PMC_MDEP_INIT(k8); pmc_class_table[n] = &k8_class_table_descr; break; #endif case PMC_CPU_GENERIC: PMC_MDEP_INIT(generic); break; #if defined(__arm__) #if defined(__XSCALE__) case PMC_CPU_INTEL_XSCALE: PMC_MDEP_INIT(xscale); pmc_class_table[n] = &xscale_class_table_descr; break; #endif case PMC_CPU_ARMV7_CORTEX_A8: PMC_MDEP_INIT(cortex_a8); pmc_class_table[n] = &cortex_a8_class_table_descr; break; case PMC_CPU_ARMV7_CORTEX_A9: PMC_MDEP_INIT(cortex_a9); pmc_class_table[n] = &cortex_a9_class_table_descr; break; #endif #if defined(__aarch64__) case PMC_CPU_ARMV8_CORTEX_A53: PMC_MDEP_INIT(cortex_a53); pmc_class_table[n] = &cortex_a53_class_table_descr; break; case PMC_CPU_ARMV8_CORTEX_A57: PMC_MDEP_INIT(cortex_a57); pmc_class_table[n] = &cortex_a57_class_table_descr; break; #endif #if defined(__mips__) + case PMC_CPU_MIPS_BERI: + PMC_MDEP_INIT(beri); + pmc_class_table[n] = &beri_class_table_descr; + break; case PMC_CPU_MIPS_24K: PMC_MDEP_INIT(mips24k); pmc_class_table[n] = &mips24k_class_table_descr; break; case PMC_CPU_MIPS_74K: PMC_MDEP_INIT(mips74k); pmc_class_table[n] = &mips74k_class_table_descr; break; case PMC_CPU_MIPS_OCTEON: PMC_MDEP_INIT(octeon); pmc_class_table[n] = &octeon_class_table_descr; break; #endif /* __mips__ */ #if defined(__powerpc__) case PMC_CPU_PPC_7450: PMC_MDEP_INIT(ppc7450); pmc_class_table[n] = &ppc7450_class_table_descr; break; case PMC_CPU_PPC_970: PMC_MDEP_INIT(ppc970); pmc_class_table[n] = &ppc970_class_table_descr; break; case PMC_CPU_PPC_E500: PMC_MDEP_INIT(e500); pmc_class_table[n] = &e500_class_table_descr; break; #endif default: /* * Some kind of CPU this version of the library knows nothing * about. This shouldn't happen since the abi version check * should have caught this. */ #if defined(__amd64__) || defined(__i386__) break; #endif errno = ENXIO; return (pmc_syscall = -1); } return (0); } const char * pmc_name_of_capability(enum pmc_caps cap) { int i; /* * 'cap' should have a single bit set and should be in * range. 
*/ if ((cap & (cap - 1)) || cap < PMC_CAP_FIRST || cap > PMC_CAP_LAST) { errno = EINVAL; return (NULL); } i = ffs(cap); return (pmc_capability_names[i - 1]); } const char * pmc_name_of_class(enum pmc_class pc) { size_t n; for (n = 0; n < PMC_TABLE_SIZE(pmc_class_names); n++) if (pc == pmc_class_names[n].pm_class) return (pmc_class_names[n].pm_name); errno = EINVAL; return (NULL); } const char * pmc_name_of_cputype(enum pmc_cputype cp) { size_t n; for (n = 0; n < PMC_TABLE_SIZE(pmc_cputype_names); n++) if (cp == pmc_cputype_names[n].pm_cputype) return (pmc_cputype_names[n].pm_name); errno = EINVAL; return (NULL); } const char * pmc_name_of_disposition(enum pmc_disp pd) { if ((int) pd >= PMC_DISP_FIRST && pd <= PMC_DISP_LAST) return (pmc_disposition_names[pd]); errno = EINVAL; return (NULL); } const char * _pmc_name_of_event(enum pmc_event pe, enum pmc_cputype cpu) { const struct pmc_event_descr *ev, *evfence; ev = evfence = NULL; if (pe >= PMC_EV_K8_FIRST && pe <= PMC_EV_K8_LAST) { ev = k8_event_table; evfence = k8_event_table + PMC_EVENT_TABLE_SIZE(k8); } else if (pe >= PMC_EV_XSCALE_FIRST && pe <= PMC_EV_XSCALE_LAST) { ev = xscale_event_table; evfence = xscale_event_table + PMC_EVENT_TABLE_SIZE(xscale); } else if (pe >= PMC_EV_ARMV7_FIRST && pe <= PMC_EV_ARMV7_LAST) { switch (cpu) { case PMC_CPU_ARMV7_CORTEX_A8: ev = cortex_a8_event_table; evfence = cortex_a8_event_table + PMC_EVENT_TABLE_SIZE(cortex_a8); break; case PMC_CPU_ARMV7_CORTEX_A9: ev = cortex_a9_event_table; evfence = cortex_a9_event_table + PMC_EVENT_TABLE_SIZE(cortex_a9); break; default: /* Unknown CPU type. */ break; } } else if (pe >= PMC_EV_ARMV8_FIRST && pe <= PMC_EV_ARMV8_LAST) { switch (cpu) { case PMC_CPU_ARMV8_CORTEX_A53: ev = cortex_a53_event_table; evfence = cortex_a53_event_table + PMC_EVENT_TABLE_SIZE(cortex_a53); break; case PMC_CPU_ARMV8_CORTEX_A57: ev = cortex_a57_event_table; evfence = cortex_a57_event_table + PMC_EVENT_TABLE_SIZE(cortex_a57); break; default: /* Unknown CPU type. 
*/ break; } + } else if (pe >= PMC_EV_BERI_FIRST && pe <= PMC_EV_BERI_LAST) { + ev = beri_event_table; + evfence = beri_event_table + PMC_EVENT_TABLE_SIZE(beri); } else if (pe >= PMC_EV_MIPS24K_FIRST && pe <= PMC_EV_MIPS24K_LAST) { ev = mips24k_event_table; evfence = mips24k_event_table + PMC_EVENT_TABLE_SIZE(mips24k); } else if (pe >= PMC_EV_MIPS74K_FIRST && pe <= PMC_EV_MIPS74K_LAST) { ev = mips74k_event_table; evfence = mips74k_event_table + PMC_EVENT_TABLE_SIZE(mips74k); } else if (pe >= PMC_EV_OCTEON_FIRST && pe <= PMC_EV_OCTEON_LAST) { ev = octeon_event_table; evfence = octeon_event_table + PMC_EVENT_TABLE_SIZE(octeon); } else if (pe >= PMC_EV_PPC7450_FIRST && pe <= PMC_EV_PPC7450_LAST) { ev = ppc7450_event_table; evfence = ppc7450_event_table + PMC_EVENT_TABLE_SIZE(ppc7450); } else if (pe >= PMC_EV_PPC970_FIRST && pe <= PMC_EV_PPC970_LAST) { ev = ppc970_event_table; evfence = ppc970_event_table + PMC_EVENT_TABLE_SIZE(ppc970); } else if (pe >= PMC_EV_E500_FIRST && pe <= PMC_EV_E500_LAST) { ev = e500_event_table; evfence = e500_event_table + PMC_EVENT_TABLE_SIZE(e500); } else if (pe == PMC_EV_TSC_TSC) { ev = tsc_event_table; evfence = tsc_event_table + PMC_EVENT_TABLE_SIZE(tsc); } else if ((int)pe >= PMC_EV_SOFT_FIRST && (int)pe <= PMC_EV_SOFT_LAST) { ev = soft_event_table; evfence = soft_event_table + soft_event_info.pm_nevent; } for (; ev != evfence; ev++) if (pe == ev->pm_ev_code) return (ev->pm_ev_name); return (NULL); } const char * pmc_name_of_event(enum pmc_event pe) { const char *n; if ((n = _pmc_name_of_event(pe, cpu_info.pm_cputype)) != NULL) return (n); errno = EINVAL; return (NULL); } const char * pmc_name_of_mode(enum pmc_mode pm) { if ((int) pm >= PMC_MODE_FIRST && pm <= PMC_MODE_LAST) return (pmc_mode_names[pm]); errno = EINVAL; return (NULL); } const char * pmc_name_of_state(enum pmc_state ps) { if ((int) ps >= PMC_STATE_FIRST && ps <= PMC_STATE_LAST) return (pmc_state_names[ps]); errno = EINVAL; return (NULL); } int pmc_ncpu(void) { if (pmc_syscall == -1) { errno = ENXIO; return (-1); } return (cpu_info.pm_ncpu); } int pmc_npmc(int cpu) { if (pmc_syscall == -1) { errno = ENXIO; return (-1); } if (cpu < 0 || cpu >= (int) cpu_info.pm_ncpu) { errno = EINVAL; return (-1); } return (cpu_info.pm_npmc); } int pmc_pmcinfo(int cpu, struct pmc_pmcinfo **ppmci) { int nbytes, npmc; struct pmc_op_getpmcinfo *pmci; if ((npmc = pmc_npmc(cpu)) < 0) return (-1); nbytes = sizeof(struct pmc_op_getpmcinfo) + npmc * sizeof(struct pmc_info); if ((pmci = calloc(1, nbytes)) == NULL) return (-1); pmci->pm_cpu = cpu; if (PMC_CALL(GETPMCINFO, pmci) < 0) { free(pmci); return (-1); } /* kernel<->library, library<->userland interfaces are identical */ *ppmci = (struct pmc_pmcinfo *) pmci; return (0); } int pmc_read(pmc_id_t pmc, pmc_value_t *value) { struct pmc_op_pmcrw pmc_read_op; pmc_read_op.pm_pmcid = pmc; pmc_read_op.pm_flags = PMC_F_OLDVALUE; pmc_read_op.pm_value = -1; if (PMC_CALL(PMCRW, &pmc_read_op) < 0) return (-1); *value = pmc_read_op.pm_value; return (0); } int pmc_release(pmc_id_t pmc) { struct pmc_op_simple pmc_release_args; pmc_release_args.pm_pmcid = pmc; return (PMC_CALL(PMCRELEASE, &pmc_release_args)); } int pmc_rw(pmc_id_t pmc, pmc_value_t newvalue, pmc_value_t *oldvaluep) { struct pmc_op_pmcrw pmc_rw_op; pmc_rw_op.pm_pmcid = pmc; pmc_rw_op.pm_flags = PMC_F_NEWVALUE | PMC_F_OLDVALUE; pmc_rw_op.pm_value = newvalue; if (PMC_CALL(PMCRW, &pmc_rw_op) < 0) return (-1); *oldvaluep = pmc_rw_op.pm_value; return (0); } int pmc_set(pmc_id_t pmc, pmc_value_t value) { struct 
pmc_op_pmcsetcount sc; sc.pm_pmcid = pmc; sc.pm_count = value; if (PMC_CALL(PMCSETCOUNT, &sc) < 0) return (-1); return (0); } int pmc_start(pmc_id_t pmc) { struct pmc_op_simple pmc_start_args; pmc_start_args.pm_pmcid = pmc; return (PMC_CALL(PMCSTART, &pmc_start_args)); } int pmc_stop(pmc_id_t pmc) { struct pmc_op_simple pmc_stop_args; pmc_stop_args.pm_pmcid = pmc; return (PMC_CALL(PMCSTOP, &pmc_stop_args)); } int pmc_width(pmc_id_t pmcid, uint32_t *width) { unsigned int i; enum pmc_class cl; cl = PMC_ID_TO_CLASS(pmcid); for (i = 0; i < cpu_info.pm_nclass; i++) if (cpu_info.pm_classes[i].pm_class == cl) { *width = cpu_info.pm_classes[i].pm_width; return (0); } errno = EINVAL; return (-1); } int pmc_write(pmc_id_t pmc, pmc_value_t value) { struct pmc_op_pmcrw pmc_write_op; pmc_write_op.pm_pmcid = pmc; pmc_write_op.pm_flags = PMC_F_NEWVALUE; pmc_write_op.pm_value = value; return (PMC_CALL(PMCRW, &pmc_write_op)); } int pmc_writelog(uint32_t userdata) { struct pmc_op_writelog wl; wl.pm_userdata = userdata; return (PMC_CALL(WRITELOG, &wl)); } Index: projects/clang900-import/sbin/ifconfig/ifmedia.c =================================================================== --- projects/clang900-import/sbin/ifconfig/ifmedia.c (revision 352536) +++ projects/clang900-import/sbin/ifconfig/ifmedia.c (revision 352537) @@ -1,793 +1,815 @@ /* $NetBSD: ifconfig.c,v 1.34 1997/04/21 01:17:58 lukem Exp $ */ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1997 Jason R. Thorpe. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project * by Jason R. Thorpe. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1983, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include "ifconfig.h" static void domediaopt(const char *, int, int); static int get_media_subtype(int, const char *); static int get_media_mode(int, const char *); static int get_media_options(int, const char *); static int lookup_media_word(struct ifmedia_description *, const char *); static void print_media_word(int, int); static void print_media_word_ifconfig(int); static struct ifmedia_description *get_toptype_desc(int); static struct ifmedia_type_to_subtype *get_toptype_ttos(int); static struct ifmedia_description *get_subtype_desc(int, struct ifmedia_type_to_subtype *ttos); #define IFM_OPMODE(x) \ ((x) & (IFM_IEEE80211_ADHOC | IFM_IEEE80211_HOSTAP | \ IFM_IEEE80211_IBSS | IFM_IEEE80211_WDS | IFM_IEEE80211_MONITOR | \ IFM_IEEE80211_MBSS)) #define IFM_IEEE80211_STA 0 static void media_status(int s) { struct ifmediareq ifmr; + struct ifdownreason ifdr; int *media_list, i; - int xmedia = 1; + bool no_carrier, xmedia; (void) memset(&ifmr, 0, sizeof(ifmr)); (void) strlcpy(ifmr.ifm_name, name, sizeof(ifmr.ifm_name)); + xmedia = true; /* * Check if interface supports extended media types. */ if (ioctl(s, SIOCGIFXMEDIA, (caddr_t)&ifmr) < 0) - xmedia = 0; - if (xmedia == 0 && ioctl(s, SIOCGIFMEDIA, (caddr_t)&ifmr) < 0) { + xmedia = false; + if (!xmedia && ioctl(s, SIOCGIFMEDIA, (caddr_t)&ifmr) < 0) { /* * Interface doesn't support SIOC{G,S}IFMEDIA. 
*/ return; } if (ifmr.ifm_count == 0) { warnx("%s: no media types?", name); return; } media_list = (int *)malloc(ifmr.ifm_count * sizeof(int)); if (media_list == NULL) err(1, "malloc"); ifmr.ifm_ulist = media_list; if (xmedia) { if (ioctl(s, SIOCGIFXMEDIA, (caddr_t)&ifmr) < 0) err(1, "SIOCGIFXMEDIA"); } else { if (ioctl(s, SIOCGIFMEDIA, (caddr_t)&ifmr) < 0) err(1, "SIOCGIFMEDIA"); } printf("\tmedia: "); print_media_word(ifmr.ifm_current, 1); if (ifmr.ifm_active != ifmr.ifm_current) { putchar(' '); putchar('('); print_media_word(ifmr.ifm_active, 0); putchar(')'); } putchar('\n'); if (ifmr.ifm_status & IFM_AVALID) { + no_carrier = false; printf("\tstatus: "); switch (IFM_TYPE(ifmr.ifm_active)) { case IFM_ETHER: case IFM_ATM: if (ifmr.ifm_status & IFM_ACTIVE) printf("active"); else - printf("no carrier"); + no_carrier = true; break; case IFM_IEEE80211: if (ifmr.ifm_status & IFM_ACTIVE) { /* NB: only sta mode associates */ if (IFM_OPMODE(ifmr.ifm_active) == IFM_IEEE80211_STA) printf("associated"); else printf("running"); } else - printf("no carrier"); + no_carrier = true; break; + } + if (no_carrier) { + printf("no carrier"); + memset(&ifdr, 0, sizeof(ifdr)); + strlcpy(ifdr.ifdr_name, name, sizeof(ifdr.ifdr_name)); + if (ioctl(s, SIOCGIFDOWNREASON, (caddr_t)&ifdr) == 0) { + switch (ifdr.ifdr_reason) { + case IFDR_REASON_MSG: + printf(" (%s)", ifdr.ifdr_msg); + break; + case IFDR_REASON_VENDOR: + printf(" (vendor code %d)", + ifdr.ifdr_vendor); + break; + default: + break; + } + } } putchar('\n'); } if (ifmr.ifm_count > 0 && supmedia) { printf("\tsupported media:\n"); for (i = 0; i < ifmr.ifm_count; i++) { printf("\t\t"); print_media_word_ifconfig(media_list[i]); putchar('\n'); } } free(media_list); } struct ifmediareq * ifmedia_getstate(int s) { static struct ifmediareq *ifmr = NULL; int *mwords; int xmedia = 1; if (ifmr == NULL) { ifmr = (struct ifmediareq *)malloc(sizeof(struct ifmediareq)); if (ifmr == NULL) err(1, "malloc"); (void) memset(ifmr, 0, sizeof(struct ifmediareq)); (void) strlcpy(ifmr->ifm_name, name, sizeof(ifmr->ifm_name)); ifmr->ifm_count = 0; ifmr->ifm_ulist = NULL; /* * We must go through the motions of reading all * supported media because we need to know both * the current media type and the top-level type. */ if (ioctl(s, SIOCGIFXMEDIA, (caddr_t)ifmr) < 0) { xmedia = 0; } if (xmedia == 0 && ioctl(s, SIOCGIFMEDIA, (caddr_t)ifmr) < 0) { err(1, "SIOCGIFMEDIA"); } if (ifmr->ifm_count == 0) errx(1, "%s: no media types?", name); mwords = (int *)malloc(ifmr->ifm_count * sizeof(int)); if (mwords == NULL) err(1, "malloc"); ifmr->ifm_ulist = mwords; if (xmedia) { if (ioctl(s, SIOCGIFXMEDIA, (caddr_t)ifmr) < 0) err(1, "SIOCGIFXMEDIA"); } else { if (ioctl(s, SIOCGIFMEDIA, (caddr_t)ifmr) < 0) err(1, "SIOCGIFMEDIA"); } } return ifmr; } static void setifmediacallback(int s, void *arg) { struct ifmediareq *ifmr = (struct ifmediareq *)arg; static int did_it = 0; if (!did_it) { ifr.ifr_media = ifmr->ifm_current; if (ioctl(s, SIOCSIFMEDIA, (caddr_t)&ifr) < 0) err(1, "SIOCSIFMEDIA (media)"); free(ifmr->ifm_ulist); free(ifmr); did_it = 1; } } static void setmedia(const char *val, int d, int s, const struct afswtch *afp) { struct ifmediareq *ifmr; int subtype; ifmr = ifmedia_getstate(s); /* * We are primarily concerned with the top-level type. * However, "current" may be only IFM_NONE, so we just look * for the top-level type in the first "supported type" * entry. * * (I'm assuming that all supported media types for a given * interface will be the same top-level type..) 
*/ subtype = get_media_subtype(IFM_TYPE(ifmr->ifm_ulist[0]), val); strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_media = (ifmr->ifm_current & IFM_IMASK) | IFM_TYPE(ifmr->ifm_ulist[0]) | subtype; ifmr->ifm_current = ifr.ifr_media; callback_register(setifmediacallback, (void *)ifmr); } static void setmediaopt(const char *val, int d, int s, const struct afswtch *afp) { domediaopt(val, 0, s); } static void unsetmediaopt(const char *val, int d, int s, const struct afswtch *afp) { domediaopt(val, 1, s); } static void domediaopt(const char *val, int clear, int s) { struct ifmediareq *ifmr; int options; ifmr = ifmedia_getstate(s); options = get_media_options(IFM_TYPE(ifmr->ifm_ulist[0]), val); strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_media = ifmr->ifm_current; if (clear) ifr.ifr_media &= ~options; else { if (options & IFM_HDX) { ifr.ifr_media &= ~IFM_FDX; options &= ~IFM_HDX; } ifr.ifr_media |= options; } ifmr->ifm_current = ifr.ifr_media; callback_register(setifmediacallback, (void *)ifmr); } static void setmediainst(const char *val, int d, int s, const struct afswtch *afp) { struct ifmediareq *ifmr; int inst; ifmr = ifmedia_getstate(s); inst = atoi(val); if (inst < 0 || inst > (int)IFM_INST_MAX) errx(1, "invalid media instance: %s", val); strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_media = (ifmr->ifm_current & ~IFM_IMASK) | inst << IFM_ISHIFT; ifmr->ifm_current = ifr.ifr_media; callback_register(setifmediacallback, (void *)ifmr); } static void setmediamode(const char *val, int d, int s, const struct afswtch *afp) { struct ifmediareq *ifmr; int mode; ifmr = ifmedia_getstate(s); mode = get_media_mode(IFM_TYPE(ifmr->ifm_ulist[0]), val); strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_media = (ifmr->ifm_current & ~IFM_MMASK) | mode; ifmr->ifm_current = ifr.ifr_media; callback_register(setifmediacallback, (void *)ifmr); } /********************************************************************** * A good chunk of this is duplicated from sys/net/if_media.c **********************************************************************/ static struct ifmedia_description ifm_type_descriptions[] = IFM_TYPE_DESCRIPTIONS; static struct ifmedia_description ifm_subtype_ethernet_descriptions[] = IFM_SUBTYPE_ETHERNET_DESCRIPTIONS; static struct ifmedia_description ifm_subtype_ethernet_aliases[] = IFM_SUBTYPE_ETHERNET_ALIASES; static struct ifmedia_description ifm_subtype_ethernet_option_descriptions[] = IFM_SUBTYPE_ETHERNET_OPTION_DESCRIPTIONS; static struct ifmedia_description ifm_subtype_ieee80211_descriptions[] = IFM_SUBTYPE_IEEE80211_DESCRIPTIONS; static struct ifmedia_description ifm_subtype_ieee80211_aliases[] = IFM_SUBTYPE_IEEE80211_ALIASES; static struct ifmedia_description ifm_subtype_ieee80211_option_descriptions[] = IFM_SUBTYPE_IEEE80211_OPTION_DESCRIPTIONS; struct ifmedia_description ifm_subtype_ieee80211_mode_descriptions[] = IFM_SUBTYPE_IEEE80211_MODE_DESCRIPTIONS; struct ifmedia_description ifm_subtype_ieee80211_mode_aliases[] = IFM_SUBTYPE_IEEE80211_MODE_ALIASES; static struct ifmedia_description ifm_subtype_atm_descriptions[] = IFM_SUBTYPE_ATM_DESCRIPTIONS; static struct ifmedia_description ifm_subtype_atm_aliases[] = IFM_SUBTYPE_ATM_ALIASES; static struct ifmedia_description ifm_subtype_atm_option_descriptions[] = IFM_SUBTYPE_ATM_OPTION_DESCRIPTIONS; static struct ifmedia_description ifm_subtype_shared_descriptions[] = IFM_SUBTYPE_SHARED_DESCRIPTIONS; static struct ifmedia_description ifm_subtype_shared_aliases[] = IFM_SUBTYPE_SHARED_ALIASES; 
static struct ifmedia_description ifm_shared_option_descriptions[] = IFM_SHARED_OPTION_DESCRIPTIONS; static struct ifmedia_description ifm_shared_option_aliases[] = IFM_SHARED_OPTION_ALIASES; struct ifmedia_type_to_subtype { struct { struct ifmedia_description *desc; int alias; } subtypes[5]; struct { struct ifmedia_description *desc; int alias; } options[4]; struct { struct ifmedia_description *desc; int alias; } modes[3]; }; /* must be in the same order as IFM_TYPE_DESCRIPTIONS */ static struct ifmedia_type_to_subtype ifmedia_types_to_subtypes[] = { { { { &ifm_subtype_shared_descriptions[0], 0 }, { &ifm_subtype_shared_aliases[0], 1 }, { &ifm_subtype_ethernet_descriptions[0], 0 }, { &ifm_subtype_ethernet_aliases[0], 1 }, { NULL, 0 }, }, { { &ifm_shared_option_descriptions[0], 0 }, { &ifm_shared_option_aliases[0], 1 }, { &ifm_subtype_ethernet_option_descriptions[0], 0 }, { NULL, 0 }, }, { { NULL, 0 }, }, }, { { { &ifm_subtype_shared_descriptions[0], 0 }, { &ifm_subtype_shared_aliases[0], 1 }, { &ifm_subtype_ieee80211_descriptions[0], 0 }, { &ifm_subtype_ieee80211_aliases[0], 1 }, { NULL, 0 }, }, { { &ifm_shared_option_descriptions[0], 0 }, { &ifm_shared_option_aliases[0], 1 }, { &ifm_subtype_ieee80211_option_descriptions[0], 0 }, { NULL, 0 }, }, { { &ifm_subtype_ieee80211_mode_descriptions[0], 0 }, { &ifm_subtype_ieee80211_mode_aliases[0], 0 }, { NULL, 0 }, }, }, { { { &ifm_subtype_shared_descriptions[0], 0 }, { &ifm_subtype_shared_aliases[0], 1 }, { &ifm_subtype_atm_descriptions[0], 0 }, { &ifm_subtype_atm_aliases[0], 1 }, { NULL, 0 }, }, { { &ifm_shared_option_descriptions[0], 0 }, { &ifm_shared_option_aliases[0], 1 }, { &ifm_subtype_atm_option_descriptions[0], 0 }, { NULL, 0 }, }, { { NULL, 0 }, }, }, }; static int get_media_subtype(int type, const char *val) { struct ifmedia_description *desc; struct ifmedia_type_to_subtype *ttos; int rval, i; /* Find the top-level interface type. */ for (desc = ifm_type_descriptions, ttos = ifmedia_types_to_subtypes; desc->ifmt_string != NULL; desc++, ttos++) if (type == desc->ifmt_word) break; if (desc->ifmt_string == NULL) errx(1, "unknown media type 0x%x", type); for (i = 0; ttos->subtypes[i].desc != NULL; i++) { rval = lookup_media_word(ttos->subtypes[i].desc, val); if (rval != -1) return (rval); } errx(1, "unknown media subtype: %s", val); /*NOTREACHED*/ } static int get_media_mode(int type, const char *val) { struct ifmedia_description *desc; struct ifmedia_type_to_subtype *ttos; int rval, i; /* Find the top-level interface type. */ for (desc = ifm_type_descriptions, ttos = ifmedia_types_to_subtypes; desc->ifmt_string != NULL; desc++, ttos++) if (type == desc->ifmt_word) break; if (desc->ifmt_string == NULL) errx(1, "unknown media mode 0x%x", type); for (i = 0; ttos->modes[i].desc != NULL; i++) { rval = lookup_media_word(ttos->modes[i].desc, val); if (rval != -1) return (rval); } return -1; } static int get_media_options(int type, const char *val) { struct ifmedia_description *desc; struct ifmedia_type_to_subtype *ttos; char *optlist, *optptr; int option = 0, i, rval = 0; /* We muck with the string, so copy it. */ optlist = strdup(val); if (optlist == NULL) err(1, "strdup"); /* Find the top-level interface type. */ for (desc = ifm_type_descriptions, ttos = ifmedia_types_to_subtypes; desc->ifmt_string != NULL; desc++, ttos++) if (type == desc->ifmt_word) break; if (desc->ifmt_string == NULL) errx(1, "unknown media type 0x%x", type); /* * Look up the options in the user-provided comma-separated * list. 
*/ optptr = optlist; for (; (optptr = strtok(optptr, ",")) != NULL; optptr = NULL) { for (i = 0; ttos->options[i].desc != NULL; i++) { option = lookup_media_word(ttos->options[i].desc, optptr); if (option != -1) break; } if (option == 0) errx(1, "unknown option: %s", optptr); rval |= option; } free(optlist); return (rval); } static int lookup_media_word(struct ifmedia_description *desc, const char *val) { for (; desc->ifmt_string != NULL; desc++) if (strcasecmp(desc->ifmt_string, val) == 0) return (desc->ifmt_word); return (-1); } static struct ifmedia_description *get_toptype_desc(int ifmw) { struct ifmedia_description *desc; for (desc = ifm_type_descriptions; desc->ifmt_string != NULL; desc++) if (IFM_TYPE(ifmw) == desc->ifmt_word) break; return desc; } static struct ifmedia_type_to_subtype *get_toptype_ttos(int ifmw) { struct ifmedia_description *desc; struct ifmedia_type_to_subtype *ttos; for (desc = ifm_type_descriptions, ttos = ifmedia_types_to_subtypes; desc->ifmt_string != NULL; desc++, ttos++) if (IFM_TYPE(ifmw) == desc->ifmt_word) break; return ttos; } static struct ifmedia_description *get_subtype_desc(int ifmw, struct ifmedia_type_to_subtype *ttos) { int i; struct ifmedia_description *desc; for (i = 0; ttos->subtypes[i].desc != NULL; i++) { if (ttos->subtypes[i].alias) continue; for (desc = ttos->subtypes[i].desc; desc->ifmt_string != NULL; desc++) { if (IFM_SUBTYPE(ifmw) == desc->ifmt_word) return desc; } } return NULL; } static struct ifmedia_description *get_mode_desc(int ifmw, struct ifmedia_type_to_subtype *ttos) { int i; struct ifmedia_description *desc; for (i = 0; ttos->modes[i].desc != NULL; i++) { if (ttos->modes[i].alias) continue; for (desc = ttos->modes[i].desc; desc->ifmt_string != NULL; desc++) { if (IFM_MODE(ifmw) == desc->ifmt_word) return desc; } } return NULL; } static void print_media_word(int ifmw, int print_toptype) { struct ifmedia_description *desc; struct ifmedia_type_to_subtype *ttos; int seen_option = 0, i; /* Find the top-level interface type. */ desc = get_toptype_desc(ifmw); ttos = get_toptype_ttos(ifmw); if (desc->ifmt_string == NULL) { printf(""); return; } else if (print_toptype) { printf("%s", desc->ifmt_string); } /* * Don't print the top-level type; it's not like we can * change it, or anything. */ /* Find subtype. */ desc = get_subtype_desc(ifmw, ttos); if (desc == NULL) { printf(""); return; } if (print_toptype) putchar(' '); printf("%s", desc->ifmt_string); if (print_toptype) { desc = get_mode_desc(ifmw, ttos); if (desc != NULL && strcasecmp("autoselect", desc->ifmt_string)) printf(" mode %s", desc->ifmt_string); } /* Find options. */ for (i = 0; ttos->options[i].desc != NULL; i++) { if (ttos->options[i].alias) continue; for (desc = ttos->options[i].desc; desc->ifmt_string != NULL; desc++) { if (ifmw & desc->ifmt_word) { if (seen_option == 0) printf(" <"); printf("%s%s", seen_option++ ? "," : "", desc->ifmt_string); } } } printf("%s", seen_option ? ">" : ""); if (print_toptype && IFM_INST(ifmw) != 0) printf(" instance %d", IFM_INST(ifmw)); } static void print_media_word_ifconfig(int ifmw) { struct ifmedia_description *desc; struct ifmedia_type_to_subtype *ttos; int seen_option = 0, i; /* Find the top-level interface type. */ desc = get_toptype_desc(ifmw); ttos = get_toptype_ttos(ifmw); if (desc->ifmt_string == NULL) { printf(""); return; } /* * Don't print the top-level type; it's not like we can * change it, or anything. */ /* Find subtype. 
*/ desc = get_subtype_desc(ifmw, ttos); if (desc == NULL) { printf(""); return; } printf("media %s", desc->ifmt_string); desc = get_mode_desc(ifmw, ttos); if (desc != NULL) printf(" mode %s", desc->ifmt_string); /* Find options. */ for (i = 0; ttos->options[i].desc != NULL; i++) { if (ttos->options[i].alias) continue; for (desc = ttos->options[i].desc; desc->ifmt_string != NULL; desc++) { if (ifmw & desc->ifmt_word) { if (seen_option == 0) printf(" mediaopt "); printf("%s%s", seen_option++ ? "," : "", desc->ifmt_string); } } } if (IFM_INST(ifmw) != 0) printf(" instance %d", IFM_INST(ifmw)); } /********************************************************************** * ...until here. **********************************************************************/ static struct cmd media_cmds[] = { DEF_CMD_ARG("media", setmedia), DEF_CMD_ARG("mode", setmediamode), DEF_CMD_ARG("mediaopt", setmediaopt), DEF_CMD_ARG("-mediaopt",unsetmediaopt), DEF_CMD_ARG("inst", setmediainst), DEF_CMD_ARG("instance", setmediainst), }; static struct afswtch af_media = { .af_name = "af_media", .af_af = AF_UNSPEC, .af_other_status = media_status, }; static __constructor void ifmedia_ctor(void) { size_t i; for (i = 0; i < nitems(media_cmds); i++) cmd_register(&media_cmds[i]); af_register(&af_media); } Index: projects/clang900-import/share/man/man5/src.conf.5 =================================================================== --- projects/clang900-import/share/man/man5/src.conf.5 (revision 352536) +++ projects/clang900-import/share/man/man5/src.conf.5 (revision 352537) @@ -1,2009 +1,2020 @@ .\" DO NOT EDIT-- this file is @generated by tools/build/options/makeman. .\" $FreeBSD$ -.Dd August 16, 2019 +.Dd September 17, 2019 .Dt SRC.CONF 5 .Os .Sh NAME .Nm src.conf .Nd "source build options" .Sh DESCRIPTION The .Nm file contains settings that will apply to every build involving the .Fx source tree; see .Xr build 7 . .Pp The .Nm file uses the standard makefile syntax. However, .Nm should not specify any dependencies to .Xr make 1 . Instead, .Nm is to set .Xr make 1 variables that control the aspects of how the system builds. .Pp The default location of .Nm is .Pa /etc/src.conf , though an alternative location can be specified in the .Xr make 1 variable .Va SRCCONF . Overriding the location of .Nm may be necessary if the system-wide settings are not suitable for a particular build. For instance, setting .Va SRCCONF to .Pa /dev/null effectively resets all build controls to their defaults. .Pp The only purpose of .Nm is to control the compilation of the .Fx source code, which is usually located in .Pa /usr/src . As a rule, the system administrator creates .Nm when the values of certain control variables need to be changed from their defaults. .Pp In addition, control variables can be specified for a particular build via the .Fl D option of .Xr make 1 or in its environment; see .Xr environ 7 . .Pp The environment of .Xr make 1 for the build can be controlled via the .Va SRC_ENV_CONF variable, which defaults to .Pa /etc/src-env.conf . Some examples that may only be set in this file are .Va WITH_DIRDEPS_BUILD , and .Va WITH_META_MODE , and .Va MAKEOBJDIRPREFIX as they are environment-only variables. .Pp The values of variables are ignored regardless of their setting; even if they would be set to .Dq Li FALSE or .Dq Li NO . The presence of an option causes it to be honored by .Xr make 1 . .Pp This list provides a name and short description for variables that can be used for source builds. 
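.Pp
As a brief sketch only (the particular options chosen here are arbitrary examples, not recommendations), a
.Nm
that disables building of games and of the
.Xr lpr 1
suite would contain:
.Bd -literal -offset indent
WITHOUT_GAMES=yes
WITHOUT_LPR=yes
.Ed
.Pp
Since only the presence of each variable matters, the same effect can be had
for a single build by passing
.Dq Li "-DWITHOUT_GAMES -DWITHOUT_LPR"
on the
.Xr make 1
command line.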
.Bl -tag -width indent .It Va WITHOUT_ACCT Set to not build process accounting tools such as .Xr accton 8 and .Xr sa 8 . .It Va WITHOUT_ACPI Set to not build .Xr acpiconf 8 , .Xr acpidump 8 and related programs. .It Va WITHOUT_AMD Set to not build .Xr amd 8 , and related programs. .It Va WITHOUT_APM Set to not build .Xr apm 8 , .Xr apmd 8 and related programs. .It Va WITHOUT_ASSERT_DEBUG Set to compile programs and libraries without the .Xr assert 3 checks. .It Va WITHOUT_AT Set to not build .Xr at 1 and related utilities. .It Va WITHOUT_ATM Set to not build programs and libraries related to ATM networking. .It Va WITHOUT_AUDIT Set to not build audit support into system programs. .It Va WITHOUT_AUTHPF Set to not build .Xr authpf 8 . .It Va WITHOUT_AUTOFS Set to not build .Xr autofs 5 related programs, libraries, and kernel modules. .It Va WITHOUT_AUTO_OBJ Disable automatic creation of objdirs. This is enabled by default if the wanted OBJDIR is writable by the current user. .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITH_BEARSSL Build the BearSSL library. .Pp BearSSL is a tiny SSL library suitable for embedded environments. For details see .Lk http://www.BearSSL.org/ .Pp This library is currently only used to perform signature verification and related operations for Verified Exec and .Xr loader 8 . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_LOADER_EFI_SECUREBOOT (unless .Va WITHOUT_LOADER_EFI_SECUREBOOT is set explicitly) .It Va WITH_LOADER_VERIEXEC (unless .Va WITHOUT_LOADER_VERIEXEC is set explicitly) .It Va WITH_VERIEXEC (unless .Va WITHOUT_VERIEXEC is set explicitly) .El .It Va WITHOUT_BHYVE Set to not build or install .Xr bhyve 8 , associated utilities, and examples. .Pp This option only affects amd64/amd64. .It Va WITH_BIND_NOW Build all binaries with the .Dv DF_BIND_NOW flag set to indicate that the run-time loader should perform all relocation processing at process startup rather than on demand. .It Va WITHOUT_BINUTILS Set to not build or install GNU .Xr as 1 , .Xr objdump 1 , and for some CPU architectures .Xr ld.bfd 1 as part of the normal system build. The resulting system cannot build programs from source. .Pp This is a default setting on arm64/aarch64 and riscv/riscv64. .It Va WITH_BINUTILS Set to build and install GNU .Xr as 1 , .Xr objdump 1 , and for some CPU architectures .Xr ld.bfd 1 as part of the normal system build. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and sparc64/sparc64. .It Va WITHOUT_BINUTILS_BOOTSTRAP Set to not build binutils (as, ld, and objdump) as part of the bootstrap process. .Bf -symbolic The option does not work for build targets unless some alternative toolchain is provided. .Ef .Pp This is a default setting on arm64/aarch64 and riscv/riscv64. .It Va WITH_BINUTILS_BOOTSTRAP Set build binutils (as, ld, and objdump) as part of the bootstrap process. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and sparc64/sparc64. 
.It Va WITHOUT_BLACKLIST Set this if you do not want to build .Xr blacklistd 8 and .Xr blacklistctl 8 . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_BLACKLIST_SUPPORT (unless .Va WITH_BLACKLIST_SUPPORT is set explicitly) .El .It Va WITHOUT_BLACKLIST_SUPPORT Set to build some programs without .Xr libblacklist 3 support, like .Xr fingerd 8 , .Xr ftpd 8 , and .Xr sshd 8 . .It Va WITHOUT_BLUETOOTH Set to not build Bluetooth related kernel modules, programs and libraries. .It Va WITHOUT_BOOT Set to not build the boot blocks and loader. .It Va WITHOUT_BOOTPARAMD Set to not build or install .Xr bootparamd 8 . .It Va WITHOUT_BOOTPD Set to not build or install .Xr bootpd 8 . .It Va WITHOUT_BSDINSTALL Set to not build .Xr bsdinstall 8 , .Xr sade 8 , and related programs. .It Va WITHOUT_BSD_CPIO Set to not build the BSD licensed version of cpio based on .Xr libarchive 3 . .It Va WITHOUT_BSD_CRTBEGIN Disable the BSD licensed .Pa crtbegin.o and .Pa crtend.o . .Pp This is a default setting on sparc64/sparc64. .It Va WITH_BSD_CRTBEGIN Enable the BSD licensed .Pa crtbegin.o and .Pa crtend.o . .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and riscv/riscv64. .It Va WITH_BSD_GREP Install BSD-licensed grep as '[ef]grep' instead of GNU grep. .It Va WITHOUT_BSNMP Set to not build or install .Xr bsnmpd 1 and related libraries and data files. .It Va WITHOUT_BZIP2 Set to not build contributed bzip2 software as a part of the base system. .Bf -symbolic The option has no effect yet. .Ef When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_BZIP2_SUPPORT (unless .Va WITH_BZIP2_SUPPORT is set explicitly) .El .It Va WITHOUT_BZIP2_SUPPORT Set to build some programs without optional bzip2 support. .It Va WITHOUT_CALENDAR Set to not build .Xr calendar 1 . .It Va WITHOUT_CAPSICUM Set to not build Capsicum support into system programs. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CASPER .El .It Va WITHOUT_CASPER Set to not build Casper program and related libraries. .It Va WITH_CCACHE_BUILD Set to use .Xr ccache 1 for the build. No configuration is required except to install the .Sy devel/ccache package. When using with .Xr distcc 1 , set .Sy CCACHE_PREFIX=/usr/local/bin/distcc . The default cache directory of .Pa $HOME/.ccache will be used, which can be overridden by setting .Sy CCACHE_DIR . The .Sy CCACHE_COMPILERCHECK option defaults to .Sy content when using the in-tree bootstrap compiler, and .Sy mtime when using an external compiler. The .Sy CCACHE_CPP2 option is used for Clang but not GCC. .Pp Sharing a cache between multiple work directories requires using a layout similar to .Pa /some/prefix/src .Pa /some/prefix/obj and an environment such as: .Bd -literal -offset indent CCACHE_BASEDIR='${SRCTOP:H}' MAKEOBJDIRPREFIX='${SRCTOP:H}/obj' .Ed .Pp See .Xr ccache 1 for more configuration options. .It Va WITHOUT_CCD Set to not build .Xr geom_ccd 4 and related utilities. .It Va WITHOUT_CDDL Set to not build code licensed under Sun's CDDL. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CTF .It .Va WITHOUT_LOADER_ZFS .It .Va WITHOUT_ZFS .El .It Va WITHOUT_CLANG Set to not build the Clang C/C++ compiler during the regular phase of the build. 
.Pp This is a default setting on riscv/riscv64 and sparc64/sparc64. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CLANG_EXTRAS .It .Va WITHOUT_CLANG_FULL .It .Va WITHOUT_LLVM_COV .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_LLVM_TARGET_AARCH64 (unless .Va WITH_LLVM_TARGET_AARCH64 is set explicitly) .It Va WITHOUT_LLVM_TARGET_ALL (unless .Va WITH_LLVM_TARGET_ALL is set explicitly) .It Va WITHOUT_LLVM_TARGET_ARM (unless .Va WITH_LLVM_TARGET_ARM is set explicitly) .It Va WITHOUT_LLVM_TARGET_MIPS (unless .Va WITH_LLVM_TARGET_MIPS is set explicitly) .It Va WITHOUT_LLVM_TARGET_POWERPC (unless .Va WITH_LLVM_TARGET_POWERPC is set explicitly) .It Va WITHOUT_LLVM_TARGET_SPARC (unless .Va WITH_LLVM_TARGET_SPARC is set explicitly) .It Va WITHOUT_LLVM_TARGET_X86 (unless .Va WITH_LLVM_TARGET_X86 is set explicitly) .El .It Va WITH_CLANG Set to build the Clang C/C++ compiler during the normal phase of the build. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITHOUT_CLANG_BOOTSTRAP Set to not build the Clang C/C++ compiler during the bootstrap phase of the build. To be able to build the system, either gcc or clang bootstrap must be enabled unless an alternate compiler is provided via XCC. .Pp This is a default setting on mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITH_CLANG_BOOTSTRAP Set to build the Clang C/C++ compiler during the bootstrap phase of the build. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64 and i386/i386. .It Va WITH_CLANG_EXTRAS Set to build additional clang and llvm tools, such as bugpoint and clang-format. .It Va WITHOUT_CLANG_FULL Set to avoid building the ARCMigrate, Rewriter and StaticAnalyzer components of the Clang C/C++ compiler. .Pp This is a default setting on riscv/riscv64 and sparc64/sparc64. .It Va WITH_CLANG_FULL Set to build the ARCMigrate, Rewriter and StaticAnalyzer components of the Clang C/C++ compiler. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITHOUT_CLANG_IS_CC Set to install the GCC compiler as .Pa /usr/bin/cc , .Pa /usr/bin/c++ and .Pa /usr/bin/cpp . .Pp This is a default setting on mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITH_CLANG_IS_CC Set to install the Clang C/C++ compiler as .Pa /usr/bin/cc , .Pa /usr/bin/c++ and .Pa /usr/bin/cpp . .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64 and i386/i386. .It Va WITHOUT_CPP Set to not build .Xr cpp 1 . .It Va WITHOUT_CROSS_COMPILER Set to not build any cross compiler in the cross-tools stage of buildworld. 
When compiling a different version of .Fx than what is installed on the system, provide an alternate compiler with XCC to ensure success. When compiling with an identical version of .Fx to the host, this option may be safely used. This option may also be safe when the host version of .Fx is close to the sources being built, but all bets are off if there have been any changes to the toolchain between the versions. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_BINUTILS_BOOTSTRAP .It .Va WITHOUT_CLANG_BOOTSTRAP .It .Va WITHOUT_ELFTOOLCHAIN_BOOTSTRAP .It .Va WITHOUT_GCC_BOOTSTRAP .It .Va WITHOUT_LLD_BOOTSTRAP .El .It Va WITHOUT_CRYPT Set to not build any crypto code. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_KERBEROS .It .Va WITHOUT_OPENSSH .It .Va WITHOUT_OPENSSL .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GSSAPI (unless .Va WITH_GSSAPI is set explicitly) .El .It Va WITH_CTF Set to compile with CTF (Compact C Type Format) data. CTF data encapsulates a reduced form of debugging information similar to DWARF and the venerable stabs and is required for DTrace. .It Va WITHOUT_CUSE Set to not build CUSE-related programs and libraries. .It Va WITHOUT_CXGBETOOL Set to not build .Xr cxgbetool 8 .Pp This is a default setting on arm/arm, arm/armv6, arm/armv7, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpcspe and riscv/riscv64. .It Va WITH_CXGBETOOL Set to build .Xr cxgbetool 8 .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386, powerpc/powerpc64 and sparc64/sparc64. .It Va WITHOUT_CXX Set to not build .Xr c++ 1 and related libraries. It will also prevent building of .Xr gperf 1 and .Xr devd 8 . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_CLANG .It .Va WITHOUT_CLANG_EXTRAS .It .Va WITHOUT_CLANG_FULL .It .Va WITHOUT_DTRACE_TESTS .It .Va WITHOUT_GNUCXX .It .Va WITHOUT_LLVM_COV .It .Va WITHOUT_TESTS .El .It Va WITHOUT_DEBUG_FILES Set to avoid building or installing standalone debug files for each executable binary and shared library. .It Va WITHOUT_DIALOG Set to not build .Xr dialog 1 , .Xr dialog 3 , .Xr dpv 1 , and .Xr dpv 3 . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_BSDINSTALL .El .It Va WITHOUT_DICT Set to not build the Webster dictionary files. .It Va WITH_DIRDEPS_BUILD This is an experimental build system. For details see http://www.crufty.net/sjg/docs/freebsd-meta-mode.htm. Build commands can be seen from the top-level with: .Dl make show-valid-targets The build is driven by dirdeps.mk using .Va DIRDEPS stored in Makefile.depend files found in each directory. .Pp The build can be started from anywhere, and behaves the same. The initial instance of .Xr make 1 recursively reads .Va DIRDEPS from .Pa Makefile.depend , computing a graph of tree dependencies from the current origin. Setting .Va NO_DIRDEPS skips checking dirdep dependencies and will only build in the current and child directories. .Va NO_DIRDEPS_BELOW skips building any dirdeps and only build the current directory. .Pp This also utilizes the .Va WITH_META_MODE logic for incremental builds. .Pp The build hides commands executed unless .Va NO_SILENT is defined. .Pp Note that there is currently no mass install feature for this. 
.Pp When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITH_INSTALL_AS_USER .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_META_MODE (unless .Va WITHOUT_META_MODE is set explicitly) .It Va WITH_STAGING (unless .Va WITHOUT_STAGING is set explicitly) .It Va WITH_STAGING_MAN (unless .Va WITHOUT_STAGING_MAN is set explicitly) .It Va WITH_STAGING_PROG (unless .Va WITHOUT_STAGING_PROG is set explicitly) .It Va WITH_SYSROOT (unless .Va WITHOUT_SYSROOT is set explicitly) .El .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITH_DIRDEPS_CACHE Cache result of dirdeps.mk which can save significant time for subsequent builds. Depends on .Va WITH_DIRDEPS_BUILD . .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_DMAGENT Set to not build dma Mail Transport Agent. .It Va WITHOUT_DOCCOMPRESS Set to not install compressed system documentation. Only the uncompressed version will be installed. .It Va WITH_DTRACE_TESTS Set to build and install the DTrace test suite in .Pa /usr/tests/cddl/usr.sbin/dtrace . This test suite is considered experimental on architectures other than amd64/amd64 and running it may cause system instability. .It Va WITHOUT_DYNAMICROOT Set this if you do not want to link .Pa /bin and .Pa /sbin dynamically. .It Va WITHOUT_EE Set to not build and install .Xr edit 1 , .Xr ee 1 , and related programs. .It Va WITHOUT_EFI Set not to build .Xr efivar 3 and .Xr efivar 8 . .Pp This is a default setting on mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITH_EFI Set to build .Xr efivar 3 and .Xr efivar 8 . .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64 and i386/i386. .It Va WITHOUT_ELFTOOLCHAIN_BOOTSTRAP Set to not build ELF Tool Chain tools (addr2line, nm, size, strings and strip) as part of the bootstrap process. .Bf -symbolic An alternate bootstrap tool chain must be provided. .Ef .It Va WITHOUT_EXAMPLES Set to avoid installing examples to .Pa /usr/share/examples/ . .It Va WITH_EXPERIMENTAL Set to include experimental features in the build. .It Va WITH_EXTRA_TCP_STACKS Set to build extra TCP stack modules. .It Va WITHOUT_FDT Set to not build Flattened Device Tree support as part of the base system. This includes the device tree compiler (dtc) and libfdt support library. .It Va WITHOUT_FILE Set to not build .Xr file 1 and related programs. .It Va WITHOUT_FINGER Set to not build or install .Xr finger 1 and .Xr fingerd 8 . .It Va WITHOUT_FLOPPY Set to not build or install programs for operating floppy disk driver. .It Va WITHOUT_FMTREE Set to not build and install .Pa /usr/sbin/fmtree . .It Va WITHOUT_FORMAT_EXTENSIONS Set to not enable .Fl fformat-extensions when compiling the kernel. Also disables all format checking. .It Va WITHOUT_FORTH Set to build bootloaders without Forth support. .It Va WITHOUT_FP_LIBC Set to build .Nm libc without floating-point support. .It Va WITHOUT_FREEBSD_UPDATE Set to not build .Xr freebsd-update 8 . .It Va WITHOUT_FTP Set to not build or install .Xr ftp 1 and .Xr ftpd 8 . .It Va WITHOUT_GAMES Set to not build games. .It Va WITHOUT_GCC Set to not build and install gcc and g++ as part of the normal build process. 
.Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386 and riscv/riscv64. .It Va WITH_GCC Set to build and install gcc and g++. .Pp This is a default setting on mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and sparc64/sparc64. .It Va WITHOUT_GCC_BOOTSTRAP Set to not build gcc and g++ as part of the bootstrap process. You must enable either gcc or clang bootstrap to be able to build the system, unless an alternative compiler is provided via XCC. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386 and riscv/riscv64. .It Va WITH_GCC_BOOTSTRAP Set to build gcc and g++ as part of the bootstrap process. .Pp This is a default setting on mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and sparc64/sparc64. .It Va WITHOUT_GCOV Set to not build the .Xr gcov 1 tool. .It Va WITHOUT_GDB Set to not build .Xr gdb 1 . .Pp This is a default setting on arm64/aarch64 and riscv/riscv64. .It Va WITH_GDB Set to build .Xr gdb 1 . .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and sparc64/sparc64. .It Va WITHOUT_GDB_LIBEXEC Set to install .Xr gdb 1 into .Pa /usr/bin . .Pp This is a default setting on sparc64/sparc64. .It Va WITH_GDB_LIBEXEC Set to install .Xr gdb 1 into .Pa /usr/libexec . This permits .Xr gdb 1 to be used as a fallback for .Xr crashinfo 8 if a newer version is not installed. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and riscv/riscv64. .It Va WITHOUT_GNUCXX Do not build the GNU C++ stack (g++, libstdc++). This is the default on platforms where clang is the system compiler. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64 and i386/i386. .It Va WITH_GNUCXX Build the GNU C++ stack (g++, libstdc++). This is the default on platforms where gcc is the system compiler. .Pp This is a default setting on mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITHOUT_GNU_DIFF Set to not build GNU .Xr diff 1 and .Xr diff3 1 . .It Va WITHOUT_GNU_GREP Set to not build GNU .Xr grep 1 . .It Va WITH_GNU_GREP_COMPAT Set this option to include GNU extensions in .Xr bsdgrep 1 by linking against libgnuregex. .It Va WITHOUT_GOOGLETEST Set to neither build nor install .Lb libgmock , .Lb libgtest , and dependent tests. +.Pp +This is a default setting on +mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf and mips/mips64hf. +.It Va WITH_GOOGLETEST +Set to build and install +.Lb libgmock , +.Lb libgtest , +and dependent tests. 
+.Pp +This is a default setting on +amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITHOUT_GPIO Set to not build .Xr gpioctl 8 as part of the base system. .It Va WITHOUT_GPL_DTC Set to build the BSD licensed version of the device tree compiler rather than the GPLed one from elinux.org. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64 and i386/i386. .It Va WITH_GPL_DTC Set to build the GPL'd version of the device tree compiler from elinux.org, instead of the BSD licensed one. .Pp This is a default setting on mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITHOUT_GSSAPI Set to not build libgssapi. .It Va WITHOUT_HAST Set to not build .Xr hastd 8 and related utilities. .It Va WITH_HESIOD Set to build Hesiod support. .It Va WITHOUT_HTML Set to not build HTML docs. .It Va WITHOUT_HYPERV Set to not build or install HyperV utilities. .Pp This is a default setting on arm/arm, arm/armv6, arm/armv7, arm64/aarch64, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITH_HYPERV Set to build or install HyperV utilities. .Pp This is a default setting on amd64/amd64 and i386/i386. .It Va WITHOUT_ICONV Set to not build iconv as part of libc. .It Va WITHOUT_INCLUDES Set to not install header files. This option used to be spelled .Va NO_INCS . .Bf -symbolic The option does not work for build targets. .Ef .It Va WITHOUT_INET Set to not build programs and libraries related to IPv4 networking. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_INET_SUPPORT .El .It Va WITHOUT_INET6 Set to not build programs and libraries related to IPv6 networking. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_INET6_SUPPORT .El .It Va WITHOUT_INET6_SUPPORT Set to build libraries, programs, and kernel modules without IPv6 support. .It Va WITHOUT_INETD Set to not build .Xr inetd 8 . .It Va WITHOUT_INET_SUPPORT Set to build libraries, programs, and kernel modules without IPv4 support. .It Va WITHOUT_INSTALLLIB Set this to not install optional libraries. For example, when creating a .Xr nanobsd 8 image. .Bf -symbolic The option does not work for build targets. .Ef .It Va WITH_INSTALL_AS_USER Set to make install targets succeed for non-root users by installing files with owner and group attributes set to that of the user running the .Xr make 1 command. The user still must set the .Va DESTDIR variable to point to a directory where the user has write permissions. .It Va WITHOUT_IPFILTER Set to not build IP Filter package. .It Va WITHOUT_IPFW Set to not build IPFW tools. .It Va WITHOUT_IPSEC_SUPPORT Set to not build the kernel with .Xr ipsec 4 support. This option is needed for .Xr ipsec 4 and .Xr tcpmd5 4 . .It Va WITHOUT_ISCSI Set to not build .Xr iscsid 8 and related utilities. .It Va WITHOUT_JAIL Set to not build tools for the support of jails; e.g., .Xr jail 8 . .It Va WITHOUT_KDUMP Set to not build .Xr kdump 1 and .Xr truss 1 . .It Va WITHOUT_KERBEROS Set this to not build Kerberos 5 (KTH Heimdal). 
When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GSSAPI (unless .Va WITH_GSSAPI is set explicitly) .It Va WITHOUT_KERBEROS_SUPPORT (unless .Va WITH_KERBEROS_SUPPORT is set explicitly) .El .It Va WITHOUT_KERBEROS_SUPPORT Set to build some programs without Kerberos support, like .Xr ssh 1 , .Xr telnet 1 , .Xr sshd 8 , and .Xr telnetd 8 . .It Va WITH_KERNEL_RETPOLINE Set to enable the "retpoline" mitigation for CVE-2017-5715 in the kernel build. .It Va WITHOUT_KERNEL_SYMBOLS Set to not install kernel symbol files. .Bf -symbolic This option is recommended for those people who have small root partitions. .Ef .It Va WITHOUT_KVM Set to not build the .Nm libkvm library as a part of the base system. .Bf -symbolic The option has no effect yet. .Ef When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_KVM_SUPPORT (unless .Va WITH_KVM_SUPPORT is set explicitly) .El .It Va WITHOUT_KVM_SUPPORT Set to build some programs without optional .Nm libkvm support. .It Va WITHOUT_LDNS Setting this variable will prevent the LDNS library from being built. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_LDNS_UTILS .It .Va WITHOUT_UNBOUND .El .It Va WITHOUT_LDNS_UTILS Setting this variable will prevent building the LDNS utilities .Xr drill 1 and .Xr host 1 . .It Va WITHOUT_LEGACY_CONSOLE Set to not build programs that support a legacy PC console; e.g., .Xr kbdcontrol 1 and .Xr vidcontrol 1 . .It Va WITHOUT_LIB32 On 64-bit platforms, set to not build 32-bit library set and a .Nm ld-elf32.so.1 runtime linker. .It Va WITHOUT_LIBCPLUSPLUS Set to avoid building libcxxrt and libc++. .It Va WITHOUT_LIBPTHREAD Set to not build the .Nm libpthread providing library, .Nm libthr . When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_LIBTHR .El .It Va WITH_LIBSOFT On armv6 only, set to enable soft float ABI compatibility libraries. This option is for transitioning to the new hard float ABI. .It Va WITHOUT_LIBTHR Set to not build the .Nm libthr (1:1 threading) library. .It Va WITHOUT_LLD Set to not build LLVM's lld linker. .Pp This is a default setting on riscv/riscv64 and sparc64/sparc64. .It Va WITH_LLD Set to build LLVM's lld linker. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITHOUT_LLDB Set to not build the LLDB debugger. .Pp This is a default setting on arm/arm, arm/armv6, arm/armv7, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITH_LLDB Set to build the LLDB debugger. .Pp This is a default setting on amd64/amd64, arm64/aarch64 and i386/i386. .It Va WITHOUT_LLD_BOOTSTRAP Set to not build the LLD linker during the bootstrap phase of the build. To be able to build the system, either Binutils or LLD bootstrap must be enabled unless an alternate linker is provided via XLD. .Pp This is a default setting on arm/arm, arm/armv6, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. 
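.Pp
As an illustrative sketch only (the linker path below is hypothetical and
depends on what is installed on the host), an alternate linker can be named
via
.Va XLD
when both bootstrap linkers are disabled:
.Bd -literal -offset indent
make -DWITHOUT_LLD_BOOTSTRAP -DWITHOUT_BINUTILS_BOOTSTRAP XLD=/usr/local/bin/ld.lld buildworld
.Ed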
.It Va WITH_LLD_BOOTSTRAP Set to build the LLD linker during the bootstrap phase of the build, and use it during buildworld and buildkernel. .Pp This is a default setting on amd64/amd64, arm/armv7, arm64/aarch64 and i386/i386. .It Va WITHOUT_LLD_IS_LD Set to use GNU binutils ld as the system linker, instead of LLVM's LLD. .Pp This is a default setting on arm/arm, arm/armv6, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITH_LLD_IS_LD Set to use LLVM's LLD as the system linker, instead of GNU binutils ld. .Pp This is a default setting on amd64/amd64, arm/armv7, arm64/aarch64 and i386/i386. .It Va WITHOUT_LLVM_COV Set to not build the .Xr llvm-cov 1 tool. .Pp This is a default setting on riscv/riscv64 and sparc64/sparc64. .It Va WITH_LLVM_COV Set to build the .Xr llvm-cov 1 tool. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITHOUT_LLVM_LIBUNWIND Set to use GCC's stack unwinder (instead of LLVM's libunwind). .Pp This is a default setting on arm/arm, arm/armv6, arm/armv7, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and sparc64/sparc64. .It Va WITH_LLVM_LIBUNWIND Set to use LLVM's libunwind stack unwinder (instead of GCC's unwinder). .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf and riscv/riscv64. .It Va WITHOUT_LLVM_TARGET_AARCH64 Set to not build LLVM target support for AArch64. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on arm/arm, arm/armv6, riscv/riscv64 and sparc64/sparc64. .It Va WITH_LLVM_TARGET_AARCH64 Set to build LLVM target support for AArch64. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on amd64/amd64, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITHOUT_LLVM_TARGET_ALL Set to only build the required LLVM target support. This option is preferred to specific target support options. .Pp This is a default setting on riscv/riscv64 and sparc64/sparc64. When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_LLVM_TARGET_AARCH64 (unless .Va WITH_LLVM_TARGET_AARCH64 is set explicitly) .It Va WITHOUT_LLVM_TARGET_ARM (unless .Va WITH_LLVM_TARGET_ARM is set explicitly) .It Va WITHOUT_LLVM_TARGET_MIPS (unless .Va WITH_LLVM_TARGET_MIPS is set explicitly) .It Va WITHOUT_LLVM_TARGET_POWERPC (unless .Va WITH_LLVM_TARGET_POWERPC is set explicitly) .It Va WITHOUT_LLVM_TARGET_SPARC (unless .Va WITH_LLVM_TARGET_SPARC is set explicitly) .El .It Va WITH_LLVM_TARGET_ALL Set to build support for all LLVM targets. This option is always applied to the bootstrap compiler for buildworld when LLVM is used. 
.Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITHOUT_LLVM_TARGET_ARM Set to not build LLVM target support for ARM. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on riscv/riscv64 and sparc64/sparc64. .It Va WITH_LLVM_TARGET_ARM Set to build LLVM target support for ARM. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITH_LLVM_TARGET_BPF Set to build LLVM target support for BPF. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_MIPS Set to not build LLVM target support for MIPS. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on arm/arm, arm/armv6, riscv/riscv64 and sparc64/sparc64. .It Va WITH_LLVM_TARGET_MIPS Set to build LLVM target support for MIPS. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on amd64/amd64, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITHOUT_LLVM_TARGET_POWERPC Set to not build LLVM target support for PowerPC. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on arm/arm, arm/armv6, riscv/riscv64 and sparc64/sparc64. .It Va WITH_LLVM_TARGET_POWERPC Set to build LLVM target support for PowerPC. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on amd64/amd64, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITH_LLVM_TARGET_RISCV Set to build LLVM target support for RISC-V. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .It Va WITHOUT_LLVM_TARGET_SPARC Set to not build LLVM target support for SPARC. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on arm/arm, arm/armv6, riscv/riscv64 and sparc64/sparc64. .It Va WITH_LLVM_TARGET_SPARC Set to build LLVM target support for SPARC. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on amd64/amd64, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITHOUT_LLVM_TARGET_X86 Set to not build LLVM target support for X86. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on arm/arm, arm/armv6, riscv/riscv64 and sparc64/sparc64. 
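.Pp
As a sketch (the additional target kept here is only an example), a
.Nm
that trims LLVM to the required target plus one extra cross target might
contain:
.Bd -literal -offset indent
WITHOUT_LLVM_TARGET_ALL=yes
WITH_LLVM_TARGET_AARCH64=yes
.Ed
.Pp
Per the description of
.Va WITHOUT_LLVM_TARGET_ALL
above, the required target support is still built, and a target set
explicitly with its
.Va WITH_LLVM_TARGET_*
option remains enabled.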
.It Va WITH_LLVM_TARGET_X86 Set to build LLVM target support for X86. The .Va LLVM_TARGET_ALL option should be used rather than this in most cases. .Pp This is a default setting on amd64/amd64, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITH_LOADER_EFI_SECUREBOOT Enable building .Xr loader 8 with support for verification based on certificates obtained from UEFI. .Pp .It Va WITH_LOADER_FIREWIRE Enable firewire support in /boot/loader on x86. This option is a nop on all other platforms. .It Va WITH_LOADER_FORCE_LE Set to force the powerpc boot loader to launch the kernel in little endian mode. .It Va WITHOUT_LOADER_GELI Disable inclusion of GELI crypto support in the boot chain binaries. .Pp This is a default setting on powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and sparc64/sparc64. .It Va WITH_LOADER_GELI Set to build GELI bootloader support. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf and riscv/riscv64. .It Va WITHOUT_LOADER_LUA Set to not build LUA bindings for the boot loader. .Pp This is a default setting on powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and sparc64/sparc64. .It Va WITH_LOADER_LUA Set to build LUA bindings for the boot loader. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf and riscv/riscv64. .It Va WITHOUT_LOADER_OFW Disable building of openfirmware bootloader components. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf and riscv/riscv64. .It Va WITH_LOADER_OFW Set to build openfirmware bootloader components. .Pp This is a default setting on powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe and sparc64/sparc64. .It Va WITHOUT_LOADER_UBOOT Disable building of ubldr. .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386, riscv/riscv64 and sparc64/sparc64. .It Va WITH_LOADER_UBOOT Set to build ubldr. .Pp This is a default setting on arm/arm, arm/armv6, arm/armv7, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64 and powerpc/powerpcspe. .It Va WITH_LOADER_VERBOSE Set to build with extra verbose debugging in the loader. May explode already nearly too large loader over the limit. Use with care. .It Va WITH_LOADER_VERIEXEC Enable building .Xr loader 8 with support for verifcation similar to Verified Exec. .Pp It depends on .Va WITH_BEARSSL When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_LOADER_EFI_SECUREBOOT (unless .Va WITHOUT_LOADER_EFI_SECUREBOOT is set explicitly) .El .It Va WITHOUT_LOADER_ZFS Set to not build ZFS file system boot loader support. .It Va WITHOUT_LOCALES Set to not build localization files; see .Xr locale 1 . .It Va WITHOUT_LOCATE Set to not build .Xr locate 1 and related programs. .It Va WITHOUT_LPR Set to not build .Xr lpr 1 and related programs. 
.It Va WITHOUT_LS_COLORS Set to build .Xr ls 1 without support for colors to distinguish file types. .It Va WITHOUT_LZMA_SUPPORT Set to build some programs without optional lzma compression support. .It Va WITHOUT_MAIL Set to not build any mail support (MUA or MTA). When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_DMAGENT .It .Va WITHOUT_MAILWRAPPER .It .Va WITHOUT_SENDMAIL .El .It Va WITHOUT_MAILWRAPPER Set to not build the .Xr mailwrapper 8 MTA selector. .It Va WITHOUT_MAKE Set to not install .Xr make 1 and related support files. .It Va WITHOUT_MAKE_CHECK_USE_SANDBOX Set to not execute .Dq Li "make check" in limited sandbox mode. This option should be paired with .Va WITH_INSTALL_AS_USER if executed as an unprivileged user. See .Xr tests 7 for more details. .It Va WITHOUT_MAN Set to not build manual pages. When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_MAN_UTILS (unless .Va WITH_MAN_UTILS is set explicitly) .El .It Va WITHOUT_MANCOMPRESS Set to not to install compressed man pages. Only the uncompressed versions will be installed. .It Va WITHOUT_MAN_UTILS Set to not build utilities for manual pages, .Xr apropos 1 , .Xr makewhatis 1 , .Xr man 1 , .Xr whatis 1 , .Xr manctl 8 , and related support files. .It Va WITH_META_MODE Create .Xr make 1 meta files when building, which can provide a reliable incremental build when using .Xr filemon 4 . The meta file is created in OBJDIR as .Pa target.meta . These meta files track the command that was executed, its output, and the current directory. The .Xr filemon 4 module is required unless .Va NO_FILEMON is defined. When the module is loaded, any files used by the commands executed are tracked as dependencies for the target in its meta file. The target is considered out-of-date and rebuilt if any of these conditions are true compared to the last build: .Bl -bullet -compact .It The command to execute changes. .It The current working directory changes. .It The target's meta file is missing. .It The target's meta file is missing filemon data when filemon is loaded and a previous run did not have it loaded. .It [requires .Xr filemon 4 ] Files read, executed or linked to are newer than the target. .It [requires .Xr filemon 4 ] Files read, written, executed or linked are missing. .El The meta files can also be useful for debugging. .Pp The build hides commands that are executed unless .Va NO_SILENT is defined. Errors cause .Xr make 1 to show some of its environment for further debugging. .Pp The build operates as it normally would otherwise. This option originally invoked a different build system but that was renamed to .Va WITH_DIRDEPS_BUILD . .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_MLX5TOOL Set to not build .Xr mlx5tool 8 .Pp This is a default setting on arm/arm, arm/armv6, arm/armv7, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpcspe and riscv/riscv64. .It Va WITH_MLX5TOOL Set to build .Xr mlx5tool 8 .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386, powerpc/powerpc64 and sparc64/sparc64. .It Va WITHOUT_NDIS Set to not build programs and libraries related to NDIS emulation support. .It Va WITHOUT_NETCAT Set to not build .Xr nc 1 utility. .It Va WITHOUT_NETGRAPH Set to not build applications to support .Xr netgraph 4 . 
When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_ATM .It .Va WITHOUT_BLUETOOTH .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_NETGRAPH_SUPPORT (unless .Va WITH_NETGRAPH_SUPPORT is set explicitly) .El .It Va WITHOUT_NETGRAPH_SUPPORT Set to build libraries, programs, and kernel modules without netgraph support. .It Va WITHOUT_NIS Set to not build .Xr NIS 8 support and related programs. If set, you might need to adopt your .Xr nsswitch.conf 5 and remove .Sq nis entries. .It Va WITHOUT_NLS Set to not build NLS catalogs. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_NLS_CATALOGS .El .It Va WITHOUT_NLS_CATALOGS Set to not build NLS catalog support for .Xr csh 1 . .It Va WITHOUT_NS_CACHING Set to disable name caching in the .Pa nsswitch subsystem. The generic caching daemon, .Xr nscd 8 , will not be built either if this option is set. .It Va WITHOUT_NTP Set to not build .Xr ntpd 8 and related programs. .It Va WITHOUT_NVME Set to not build nvme related tools and kernel modules. .Pp This is a default setting on arm/arm, arm/armv6, arm/armv7, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITH_NVME Set to build nvme related tools and kernel modules. .Pp This is a default setting on amd64/amd64, arm64/aarch64, i386/i386 and powerpc/powerpc64. .It Va WITH_OFED Set to build the .Dq "OpenFabrics Enterprise Distribution" Infiniband software stack. .It Va WITH_OFED_EXTRA Set to build the non-essential components of the .Dq "OpenFabrics Enterprise Distribution" Infiniband software stack, mostly examples. .It Va WITH_OPENLDAP Enable building openldap support for kerberos. .It Va WITHOUT_OPENMP Set to not build LLVM's OpenMP runtime. .Pp This is a default setting on arm/arm, arm/armv6, arm/armv7, arm64/aarch64, mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf, mips/mips64hf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITH_OPENMP Set to build LLVM's OpenMP runtime. .Pp This is a default setting on amd64/amd64 and i386/i386. .It Va WITHOUT_OPENSSH Set to not build OpenSSH. .It Va WITHOUT_OPENSSL Set to not build OpenSSL. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_KERBEROS .It .Va WITHOUT_OPENSSH .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GSSAPI (unless .Va WITH_GSSAPI is set explicitly) .El .It Va WITHOUT_PAM Set to not build PAM library and modules. .Bf -symbolic This option is deprecated and does nothing. .Ef When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_PAM_SUPPORT (unless .Va WITH_PAM_SUPPORT is set explicitly) .El .It Va WITHOUT_PAM_SUPPORT Set to build some programs without PAM support, particularly .Xr ftpd 8 and .Xr ppp 8 . .It Va WITHOUT_PF Set to not build PF firewall package. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_AUTHPF .El .It Va WITH_PIE Build dynamically linked binaries as Position-Independent Executable (PIE). .It Va WITHOUT_PKGBOOTSTRAP Set to not build .Xr pkg 7 bootstrap tool. .It Va WITHOUT_PMC Set to not build .Xr pmccontrol 8 and related programs. .It Va WITHOUT_PORTSNAP Set to not build or install .Xr portsnap 8 and related files. 
When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_FREEBSD_UPDATE .El .It Va WITHOUT_PPP Set to not build .Xr ppp 8 and related programs. .It Va WITHOUT_PROFILE Set to not build profiled libraries for use with .Xr gprof 8 . .Pp This is a default setting on mips/mips64el, mips/mips64, mips/mips64elhf and mips/mips64hf. .It Va WITH_PROFILE Set to build profiled libraries for use with .Xr gprof 8 . .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, mips/mipsel, mips/mips, mips/mipsn32, mips/mipselhf, mips/mipshf, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITHOUT_QUOTAS Set to not build .Xr quota 1 and related programs. .It Va WITHOUT_RADIUS_SUPPORT Set to not build radius support into various applications, like .Xr pam_radius 8 and .Xr ppp 8 . .It Va WITH_RATELIMIT Set to build the system with rate limit support. .Pp This makes .Dv SO_MAX_PACING_RATE effective in .Xr getsockopt 2 , and .Ar txrlimit support in .Xr ifconfig 8 , by proxy. .It Va WITHOUT_RBOOTD Set to not build or install .Xr rbootd 8 . .It Va WITH_REPRODUCIBLE_BUILD Set to exclude build metadata (such as the build time, user, or host) from the kernel, boot loaders, and uname output, so that builds produce bit-for-bit identical output. .It Va WITHOUT_RESCUE Set to not build .Xr rescue 8 . .It Va WITH_RETPOLINE Set to build the base system with the retpoline speculative execution vulnerability mitigation for CVE-2017-5715. .It Va WITHOUT_ROUTED Set to not build .Xr routed 8 utility. .It Va WITH_RPCBIND_WARMSTART_SUPPORT Set to build .Xr rpcbind 8 with warmstart support. .It Va WITHOUT_SENDMAIL Set to not build .Xr sendmail 8 and related programs. .It Va WITHOUT_SERVICESDB Set to not install .Pa /var/db/services.db . .It Va WITHOUT_SETUID_LOGIN Set this to disable the installation of .Xr login 1 as a set-user-ID root program. .It Va WITHOUT_SHAREDOCS Set to not build the .Bx 4.4 legacy docs. .It Va WITH_SHARED_TOOLCHAIN Set to build the toolchain binaries shared. The set includes .Xr cc 1 , .Xr make 1 and necessary utilities like assembler, linker and library archive manager. .It Va WITH_SORT_THREADS Set to enable threads in .Xr sort 1 . .It Va WITHOUT_SOURCELESS Set to not build kernel modules that include sourceless code (either microcode or native code for host CPU). When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_SOURCELESS_HOST .It .Va WITHOUT_SOURCELESS_UCODE .El .It Va WITHOUT_SOURCELESS_HOST Set to not build kernel modules that include sourceless native code for host CPU. .It Va WITHOUT_SOURCELESS_UCODE Set to not build kernel modules that include sourceless microcode. .It Va WITHOUT_SSP Set to not build world with propolice stack smashing protection. .Pp This is a default setting on mips/mipsel, mips/mips, mips/mips64el, mips/mips64, mips/mipsn32, mips/mipselhf, mips/mipshf, mips/mips64elhf and mips/mips64hf. .It Va WITH_SSP Set to build world with propolice stack smashing protection. .Pp This is a default setting on amd64/amd64, arm/arm, arm/armv6, arm/armv7, arm64/aarch64, i386/i386, powerpc/powerpc, powerpc/powerpc64, powerpc/powerpcspe, riscv/riscv64 and sparc64/sparc64. .It Va WITH_STAGING Enable staging of files to a stage tree. This can be best thought of as auto-install to .Va DESTDIR with some extra meta data to ensure dependencies can be tracked. Depends on .Va WITH_DIRDEPS_BUILD . 
When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITH_STAGING_MAN (unless .Va WITHOUT_STAGING_MAN is set explicitly) .It Va WITH_STAGING_PROG (unless .Va WITHOUT_STAGING_PROG is set explicitly) .El .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITH_STAGING_MAN Enable staging of man pages to stage tree. .It Va WITH_STAGING_PROG Enable staging of PROGs to stage tree. .It Va WITH_STALE_STAGED Check staged files are not stale. .It Va WITH_SVN Set to install .Xr svnlite 1 as .Xr svn 1 . .It Va WITHOUT_SVNLITE Set to not build .Xr svnlite 1 and related programs. .It Va WITHOUT_SYMVER Set to disable symbol versioning when building shared libraries. .It Va WITHOUT_SYSCONS Set to not build .Xr syscons 4 support files such as keyboard maps, fonts, and screen output maps. .It Va WITH_SYSROOT Enable use of sysroot during build. Depends on .Va WITH_DIRDEPS_BUILD . .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_SYSTEM_COMPILER Set to not opportunistically skip building a cross-compiler during the bootstrap phase of the build. Normally, if the currently installed compiler matches the planned bootstrap compiler type and revision, then it will not be built. This does not prevent a compiler from being built for installation though, only for building one for the build itself. The .Va WITHOUT_CLANG and .Va WITHOUT_GCC options control those. .It Va WITHOUT_SYSTEM_LINKER Set to not opportunistically skip building a cross-linker during the bootstrap phase of the build. Normally, if the currently installed linker matches the planned bootstrap linker type and revision, then it will not be built. This does not prevent a linker from being built for installation though, only for building one for the build itself. The .Va WITHOUT_LLD and .Va WITHOUT_BINUTILS options control those. .Pp This option is only relevant when .Va WITH_LLD_BOOTSTRAP is set. .It Va WITHOUT_TALK Set to not build or install .Xr talk 1 and .Xr talkd 8 . .It Va WITHOUT_TCP_WRAPPERS Set to not build or install .Xr tcpd 8 , and related utilities. .It Va WITHOUT_TCSH Set to not build and install .Pa /bin/csh (which is .Xr tcsh 1 ) . .It Va WITHOUT_TELNET Set to not build .Xr telnet 1 and related programs. .It Va WITHOUT_TESTS Set to not build nor install the .Fx Test Suite in .Pa /usr/tests/ . See .Xr tests 7 for more details. This also disables the build of all test-related dependencies, including ATF. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_DTRACE_TESTS .El .Pp When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_GOOGLETEST (unless .Va WITH_GOOGLETEST is set explicitly) .It Va WITHOUT_TESTS_SUPPORT (unless .Va WITH_TESTS_SUPPORT is set explicitly) .El .It Va WITHOUT_TESTS_SUPPORT Set to disables the build of all test-related dependencies, including ATF. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_GOOGLETEST .El .It Va WITHOUT_TEXTPROC Set to not build programs used for text processing. .It Va WITHOUT_TFTP Set to not build or install .Xr tftp 1 and .Xr tftpd 8 . .It Va WITHOUT_TOOLCHAIN Set to not install header or programs used for program development, compilers, debuggers etc. 
When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_BINUTILS .It .Va WITHOUT_CLANG .It .Va WITHOUT_CLANG_EXTRAS .It .Va WITHOUT_CLANG_FULL .It .Va WITHOUT_GCC .It .Va WITHOUT_GDB .It .Va WITHOUT_INCLUDES .It .Va WITHOUT_LLD .It .Va WITHOUT_LLDB .It .Va WITHOUT_LLVM_COV .El .It Va WITHOUT_UNBOUND Set to not build .Xr unbound 8 and related programs. .It Va WITHOUT_UNIFIED_OBJDIR Set to use the historical object directory format for .Xr build 7 targets. For native-builds and builds done directly in sub-directories the format of .Pa ${MAKEOBJDIRPREFIX}/${.CURDIR} is used, while for cross-builds .Pa ${MAKEOBJDIRPREFIX}/${TARGET}.${TARGET_ARCH}/${.CURDIR} is used. .Pp This option is transitional and will be removed before the 12.0 release, at which time .va WITH_UNIFIED_OBJDIR will be enabled permanently. .Pp This must be set in the environment, make command line, or .Pa /etc/src-env.conf , not .Pa /etc/src.conf . .It Va WITHOUT_USB Set to not build USB-related programs and libraries. .It Va WITHOUT_USB_GADGET_EXAMPLES Set to not build USB gadget kernel modules. .It Va WITHOUT_UTMPX Set to not build user accounting tools such as .Xr last 1 , .Xr users 1 , .Xr who 1 , .Xr ac 8 , .Xr lastlogin 8 and .Xr utx 8 . .It Va WITH_VERIEXEC Enable building .Xr veriexec 8 which loads the contents of verified manifests into the kernel for use by .Xr mac_veriexec 4 .Pp It depends on .Va WITH_BEARSSL .It Va WITHOUT_VI Set to not build and install vi, view, ex and related programs. .It Va WITHOUT_VT Set to not build .Xr vt 4 support files (fonts and keymaps). .It Va WITHOUT_WARNS Set this to not add warning flags to the compiler invocations. Useful as a temporary workaround when code enters the tree which triggers warnings in environments that differ from the original developer. .It Va WITHOUT_WIRELESS Set to not build programs used for 802.11 wireless networks; especially .Xr wpa_supplicant 8 and .Xr hostapd 8 . When set, these options are also in effect: .Pp .Bl -inset -compact .It Va WITHOUT_WIRELESS_SUPPORT (unless .Va WITH_WIRELESS_SUPPORT is set explicitly) .El .It Va WITHOUT_WIRELESS_SUPPORT Set to build libraries, programs, and kernel modules without 802.11 wireless support. .It Va WITHOUT_WPA_SUPPLICANT_EAPOL Build .Xr wpa_supplicant 8 without support for the IEEE 802.1X protocol and without support for EAP-PEAP, EAP-TLS, EAP-LEAP, and EAP-TTLS protocols (usable only via 802.1X). .It Va WITHOUT_ZFS Set to not build ZFS file system kernel module, libraries, and user commands. .It Va WITHOUT_ZONEINFO Set to not build the timezone database. When set, it enforces these options: .Pp .Bl -item -compact .It .Va WITHOUT_ZONEINFO_LEAPSECONDS_SUPPORT .It .Va WITHOUT_ZONEINFO_OLD_TIMEZONES_SUPPORT .El .It Va WITH_ZONEINFO_LEAPSECONDS_SUPPORT Set to build leapsecond information in to the timezone database. .It Va WITH_ZONEINFO_OLD_TIMEZONES_SUPPORT Set to build backward compatibility timezone aliases in to the timezone database. .El .Sh FILES .Bl -tag -compact -width Pa .It Pa /etc/src.conf .It Pa /etc/src-env.conf .It Pa /usr/share/mk/bsd.own.mk .El .Sh SEE ALSO .Xr make 1 , .Xr make.conf 5 , .Xr build 7 , .Xr ports 7 .Sh HISTORY The .Nm file appeared in .Fx 7.0 . .Sh AUTHORS This manual page was autogenerated by .An tools/build/options/makeman . 
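(Editorial illustration; not part of src.conf.5.) The knobs documented in the list above are ordinary make(1) variables, and only whether a WITH_*/WITHOUT_* variable is defined matters, not its value. A minimal /etc/src.conf exercising a few of the options described above might look like the sketch below; the particular selection is hypothetical.

        # /etc/src.conf -- hypothetical example only
        WITHOUT_TESTS=yes               # also enforces WITHOUT_DTRACE_TESTS (see above)
        WITHOUT_PROFILE=yes             # skip profiled libraries for gprof(8)
        WITHOUT_SENDMAIL=yes
        WITH_REPRODUCIBLE_BUILD=yes     # omit build time/user/host metadata

The effective MK_* settings that result can usually be inspected with "make -C /usr/src showconfig".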
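(Editorial illustration; not part of src.conf.5.) WITH_META_MODE, as its entry above notes, must be set in the environment, on the make command line, or in /etc/src-env.conf rather than /etc/src.conf, and it relies on the filemon(4) module unless NO_FILEMON is defined. A sketch:

        # /etc/src-env.conf -- hypothetical example only
        WITH_META_MODE=yes

        # filemon(4) is needed for full dependency tracking; it is normally
        # loaded ahead of the build (as root):
        #   kldload filemon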
Index: projects/clang900-import/share/mk/src.libnames.mk =================================================================== --- projects/clang900-import/share/mk/src.libnames.mk (revision 352536) +++ projects/clang900-import/share/mk/src.libnames.mk (revision 352537) @@ -1,655 +1,655 @@ # $FreeBSD$ # # The include file define library names suitable # for INTERNALLIB and PRIVATELIB definition .if !target(____) .error src.libnames.mk cannot be included directly. .endif .if !target(____) ____: .include _PRIVATELIBS= \ atf_c \ atf_cxx \ bsdstat \ devdctl \ event \ gmock \ gtest \ gmock_main \ gtest_main \ heimipcc \ heimipcs \ ldns \ sqlite3 \ ssh \ ucl \ unbound \ zstd _INTERNALLIBS= \ amu \ bsnmptools \ c_nossp_pic \ cron \ elftc \ fifolog \ ifconfig \ ipf \ lpr \ netbsd \ ntp \ ntpevent \ openbsd \ opts \ parse \ pe \ pmcstat \ sl \ sm \ smdb \ smutil \ telnet \ vers _LIBRARIES= \ ${_PRIVATELIBS} \ ${_INTERNALLIBS} \ ${LOCAL_LIBRARIES} \ 80211 \ alias \ archive \ asn1 \ auditd \ avl \ be \ begemot \ bluetooth \ bsdxml \ bsm \ bsnmp \ bz2 \ c \ c_pic \ calendar \ cam \ casper \ cap_dns \ cap_fileargs \ cap_grp \ cap_pwd \ cap_sysctl \ cap_syslog \ com_err \ compiler_rt \ crypt \ crypto \ ctf \ cuse \ cxxrt \ devctl \ devdctl \ devinfo \ devstat \ dialog \ dl \ dpv \ dtrace \ dwarf \ edit \ efivar \ elf \ execinfo \ fetch \ figpar \ geom \ gnuregex \ gpio \ gssapi \ gssapi_krb5 \ hdb \ heimbase \ heimntlm \ heimsqlite \ hx509 \ ipsec \ ipt \ jail \ kadm5clnt \ kadm5srv \ kafs5 \ kdc \ kiconv \ krb5 \ kvm \ l \ lzma \ m \ magic \ md \ memstat \ mp \ mt \ ncurses \ ncursesw \ netgraph \ ngatm \ nv \ nvpair \ opencsd \ opie \ pam \ panel \ panelw \ pcap \ pcsclite \ pjdlog \ pmc \ proc \ procstat \ pthread \ radius \ regex \ roken \ rpcsec_gss \ rpcsvc \ rt \ rtld_db \ sbuf \ sdp \ sm \ smb \ ssl \ ssp_nonshared \ stdthreads \ supcplusplus \ sysdecode \ tacplus \ termcap \ termcapw \ ufs \ ugidfw \ ulog \ umem \ usb \ usbhid \ util \ uutil \ vmmapi \ wind \ wrap \ xo \ y \ ypclnt \ z \ zfs_core \ zfs \ zpool \ .if ${MK_BLACKLIST} != "no" _LIBRARIES+= \ blacklist \ .endif .if ${MK_OFED} != "no" _LIBRARIES+= \ cxgb4 \ ibcm \ ibmad \ ibnetdisc \ ibumad \ ibverbs \ mlx4 \ mlx5 \ rdmacm \ osmcomp \ opensm \ osmvendor .endif .if ${MK_BEARSSL} == "yes" _INTERNALLIBS+= \ bearssl \ secureboot \ LIBBEARSSL?= ${LIBBEARSSLDIR}/libbearssl${PIE_SUFFIX}.a LIBSECUREBOOT?= ${LIBSECUREBOOTDIR}/libsecureboot${PIE_SUFFIX}.a .endif .if ${MK_VERIEXEC} == "yes" _INTERNALLIBS+= veriexec LIBVERIEXEC?= ${LIBVERIEXECDIR}/libveriexec${PIE_SUFFIX}.a .endif # Each library's LIBADD needs to be duplicated here for static linkage of # 2nd+ order consumers. Auto-generating this would be better. 
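(Editorial sketch; not part of src.libnames.mk.) The comment above means that each _DP_<lib> list that follows is expected to repeat the LIBADD of that library's own Makefile, so that static (NO_SHARED) and internal consumers still pick up transitive dependencies. For the libarchive change in this revision the pairing is roughly as below; the Makefile excerpt is assumed rather than quoted, and the conditional additions (crypto or md) are omitted:

        # lib/libarchive/Makefile (assumed excerpt)
        LIBADD=         z bz2 lzma bsdxml zstd

        # share/mk/src.libnames.mk (matching entry added below)
        _DP_archive=    z bz2 lzma bsdxml zstd

A sanity check near the end of this file errors out when the two lists drift apart.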
_DP_80211= sbuf bsdxml -_DP_archive= z bz2 lzma bsdxml +_DP_archive= z bz2 lzma bsdxml zstd _DP_zstd= pthread .if ${MK_BLACKLIST} != "no" _DP_blacklist+= pthread .endif _DP_crypto= pthread .if ${MK_OPENSSL} != "no" _DP_archive+= crypto .else _DP_archive+= md .endif _DP_sqlite3= pthread _DP_ssl= crypto _DP_ssh= crypto crypt z .if ${MK_LDNS} != "no" _DP_ssh+= ldns .endif _DP_edit= ncursesw .if ${MK_OPENSSL} != "no" _DP_bsnmp= crypto .endif _DP_geom= bsdxml sbuf _DP_cam= sbuf _DP_kvm= elf _DP_casper= nv _DP_cap_dns= nv _DP_cap_fileargs= nv _DP_cap_grp= nv _DP_cap_pwd= nv _DP_cap_sysctl= nv _DP_cap_syslog= nv .if ${MK_OFED} != "no" _DP_pcap= ibverbs mlx5 .endif _DP_pjdlog= util _DP_opie= md _DP_usb= pthread _DP_unbound= ssl crypto pthread _DP_rt= pthread .if ${MK_OPENSSL} == "no" _DP_radius= md .else _DP_radius= crypto .endif _DP_rtld_db= elf procstat _DP_procstat= kvm util elf .if ${MK_CXX} == "yes" .if ${MK_LIBCPLUSPLUS} != "no" _DP_proc= cxxrt .else _DP_proc= supcplusplus .endif .endif .if ${MK_CDDL} != "no" _DP_proc+= ctf .endif _DP_proc+= elf procstat rtld_db util _DP_mp= crypto _DP_memstat= kvm _DP_magic= z _DP_mt= sbuf bsdxml _DP_ldns= ssl crypto .if ${MK_OPENSSL} != "no" _DP_fetch= ssl crypto .else _DP_fetch= md .endif _DP_execinfo= elf _DP_dwarf= elf _DP_dpv= dialog figpar util ncursesw _DP_dialog= ncursesw m _DP_cuse= pthread _DP_atf_cxx= atf_c _DP_gtest= pthread _DP_gmock= gtest _DP_gmock_main= gmock _DP_gtest_main= gtest _DP_devstat= kvm _DP_pam= radius tacplus opie md util .if ${MK_KERBEROS} != "no" _DP_pam+= krb5 .endif .if ${MK_OPENSSH} != "no" _DP_pam+= ssh .endif .if ${MK_NIS} != "no" _DP_pam+= ypclnt .endif _DP_roken= crypt _DP_kadm5clnt= com_err krb5 roken _DP_kadm5srv= com_err hdb krb5 roken _DP_heimntlm= crypto com_err krb5 roken _DP_hx509= asn1 com_err crypto roken wind _DP_hdb= asn1 com_err krb5 roken sqlite3 _DP_asn1= com_err roken _DP_kdc= roken hdb hx509 krb5 heimntlm asn1 crypto _DP_wind= com_err roken _DP_heimbase= pthread _DP_heimipcc= heimbase roken pthread _DP_heimipcs= heimbase roken pthread _DP_kafs5= asn1 krb5 roken _DP_krb5+= asn1 com_err crypt crypto hx509 roken wind heimbase heimipcc _DP_gssapi_krb5+= gssapi krb5 crypto roken asn1 com_err _DP_lzma= pthread _DP_ucl= m _DP_vmmapi= util _DP_opencsd= cxxrt _DP_ctf= z _DP_dtrace= ctf elf proc pthread rtld_db _DP_xo= util # The libc dependencies are not strictly needed but are defined to make the # assert happy. 
_DP_c= compiler_rt .if ${MK_SSP} != "no" _DP_c+= ssp_nonshared .endif _DP_stdthreads= pthread _DP_tacplus= md _DP_panel= ncurses _DP_panelw= ncursesw _DP_rpcsec_gss= gssapi _DP_smb= kiconv _DP_ulog= md _DP_fifolog= z _DP_ipf= kvm _DP_zfs= md pthread umem util uutil m nvpair avl bsdxml geom nvpair z \ zfs_core _DP_zfs_core= nvpair _DP_zpool= md pthread z nvpair avl umem _DP_be= zfs nvpair # OFED support .if ${MK_OFED} != "no" _DP_cxgb4= ibverbs pthread _DP_ibcm= ibverbs _DP_ibmad= ibumad _DP_ibnetdisc= osmcomp ibmad ibumad _DP_ibumad= _DP_ibverbs= _DP_mlx4= ibverbs pthread _DP_mlx5= ibverbs pthread _DP_rdmacm= ibverbs _DP_osmcomp= pthread _DP_opensm= pthread _DP_osmvendor= ibumad pthread .endif # Define special cases LDADD_supcplusplus= -lsupc++ LIBATF_C= ${LIBDESTDIR}${LIBDIR_BASE}/libprivateatf-c.a LIBATF_CXX= ${LIBDESTDIR}${LIBDIR_BASE}/libprivateatf-c++.a LDADD_atf_c= -lprivateatf-c LDADD_atf_cxx= -lprivateatf-c++ LIBGMOCK= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategmock.a LIBGMOCK_MAIN= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategmock_main.a LIBGTEST= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategtest.a LIBGTEST_MAIN= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategtest_main.a LDADD_gmock= -lprivategmock LDADD_gtest= -lprivategtest LDADD_gmock_main= -lprivategmock_main LDADD_gtest_main= -lprivategtest_main .for _l in ${_PRIVATELIBS} LIB${_l:tu}?= ${LIBDESTDIR}${LIBDIR_BASE}/libprivate${_l}.a .endfor .if ${MK_PIE} != "no" PIE_SUFFIX= _pie .endif .for _l in ${_LIBRARIES} .if ${_INTERNALLIBS:M${_l}} || !defined(SYSROOT) LDADD_${_l}_L+= -L${LIB${_l:tu}DIR} .endif DPADD_${_l}?= ${LIB${_l:tu}} .if ${_PRIVATELIBS:M${_l}} LDADD_${_l}?= -lprivate${_l} .elif ${_INTERNALLIBS:M${_l}} LDADD_${_l}?= ${LDADD_${_l}_L} -l${_l:S/${PIE_SUFFIX}//}${PIE_SUFFIX} .else LDADD_${_l}?= ${LDADD_${_l}_L} -l${_l} .endif # Add in all dependencies for static linkage. .if defined(_DP_${_l}) && (${_INTERNALLIBS:M${_l}} || \ (defined(NO_SHARED) && ${NO_SHARED:tl} != "no")) .for _d in ${_DP_${_l}} DPADD_${_l}+= ${DPADD_${_d}} LDADD_${_l}+= ${LDADD_${_d}} .endfor .endif .endfor # These are special cases where the library is broken and anything that uses # it needs to add more dependencies. Broken usually means that it has a # cyclic dependency and cannot link its own dependencies. This is bad, please # fix the library instead. # Unless the library itself is broken then the proper place to define # dependencies is _DP_* above. # libatf-c++ exposes libatf-c abi hence we need to explicit link to atf_c for # atf_cxx DPADD_atf_cxx+= ${DPADD_atf_c} LDADD_atf_cxx+= ${LDADD_atf_c} DPADD_gmock+= ${DPADD_gtest} LDADD_gmock+= ${LDADD_gtest} DPADD_gmock_main+= ${DPADD_gmock} LDADD_gmock_main+= ${LDADD_gmock} DPADD_gtest_main+= ${DPADD_gtest} LDADD_gtest_main+= ${LDADD_gtest} # Detect LDADD/DPADD that should be LIBADD, before modifying LDADD here. _BADLDADD= .for _l in ${LDADD:M-l*:N-l*/*:C,^-l,,} .if ${_LIBRARIES:M${_l}} && !${_PRIVATELIBS:M${_l}} _BADLDADD+= ${_l} .endif .endfor .if !empty(_BADLDADD) .error ${.CURDIR}: These libraries should be LIBADD+=foo rather than DPADD/LDADD+=-lfoo: ${_BADLDADD} .endif .for _l in ${LIBADD} DPADD+= ${DPADD_${_l}} LDADD+= ${LDADD_${_l}} .endfor # INTERNALLIB definitions. 
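(Editorial sketch; not part of src.libnames.mk.) Putting the loop above to work, a consumer Makefile names only its direct dependencies in LIBADD; DPADD/LDADD are then filled in automatically, and for internal libraries or NO_SHARED links the _DP_* entries are appended so that second-order dependencies are linked as well. Approximately, for a hypothetical program:

        # usr.bin/foo/Makefile (hypothetical consumer)
        PROG=   foo
        LIBADD= archive

        # Expansion performed by the machinery above (approximate):
        #   DPADD+= ${LIBARCHIVE}
        #   LDADD+= -larchive
        # plus, when linking statically (NO_SHARED), the _DP_archive list
        # (z bz2 lzma bsdxml zstd, and crypto or md) as well.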
LIBELFTCDIR= ${OBJTOP}/lib/libelftc LIBELFTC?= ${LIBELFTCDIR}/libelftc${PIE_SUFFIX}.a LIBPEDIR= ${OBJTOP}/lib/libpe LIBPE?= ${LIBPEDIR}/libpe${PIE_SUFFIX}.a LIBOPENBSDDIR= ${OBJTOP}/lib/libopenbsd LIBOPENBSD?= ${LIBOPENBSDDIR}/libopenbsd${PIE_SUFFIX}.a LIBSMDIR= ${OBJTOP}/lib/libsm LIBSM?= ${LIBSMDIR}/libsm${PIE_SUFFIX}.a LIBSMDBDIR= ${OBJTOP}/lib/libsmdb LIBSMDB?= ${LIBSMDBDIR}/libsmdb${PIE_SUFFIX}.a LIBSMUTILDIR= ${OBJTOP}/lib/libsmutil LIBSMUTIL?= ${LIBSMUTILDIR}/libsmutil${PIE_SUFFIX}.a LIBNETBSDDIR?= ${OBJTOP}/lib/libnetbsd LIBNETBSD?= ${LIBNETBSDDIR}/libnetbsd${PIE_SUFFIX}.a LIBVERSDIR?= ${OBJTOP}/kerberos5/lib/libvers LIBVERS?= ${LIBVERSDIR}/libvers${PIE_SUFFIX}.a LIBSLDIR= ${OBJTOP}/kerberos5/lib/libsl LIBSL?= ${LIBSLDIR}/libsl${PIE_SUFFIX}.a LIBIFCONFIGDIR= ${OBJTOP}/lib/libifconfig LIBIFCONFIG?= ${LIBIFCONFIGDIR}/libifconfig${PIE_SUFFIX}.a LIBIPFDIR= ${OBJTOP}/sbin/ipf/libipf LIBIPF?= ${LIBIPFDIR}/libipf${PIE_SUFFIX}.a LIBTELNETDIR= ${OBJTOP}/lib/libtelnet LIBTELNET?= ${LIBTELNETDIR}/libtelnet${PIE_SUFFIX}.a LIBCRONDIR= ${OBJTOP}/usr.sbin/cron/lib LIBCRON?= ${LIBCRONDIR}/libcron${PIE_SUFFIX}.a LIBNTPDIR= ${OBJTOP}/usr.sbin/ntp/libntp LIBNTP?= ${LIBNTPDIR}/libntp${PIE_SUFFIX}.a LIBNTPEVENTDIR= ${OBJTOP}/usr.sbin/ntp/libntpevent LIBNTPEVENT?= ${LIBNTPEVENTDIR}/libntpevent${PIE_SUFFIX}.a LIBOPTSDIR= ${OBJTOP}/usr.sbin/ntp/libopts LIBOPTS?= ${LIBOPTSDIR}/libopts${PIE_SUFFIX}.a LIBPARSEDIR= ${OBJTOP}/usr.sbin/ntp/libparse LIBPARSE?= ${LIBPARSEDIR}/libparse${PIE_SUFFIX}.a LIBLPRDIR= ${OBJTOP}/usr.sbin/lpr/common_source LIBLPR?= ${LIBLPRDIR}/liblpr${PIE_SUFFIX}.a LIBFIFOLOGDIR= ${OBJTOP}/usr.sbin/fifolog/lib LIBFIFOLOG?= ${LIBFIFOLOGDIR}/libfifolog${PIE_SUFFIX}.a LIBBSNMPTOOLSDIR= ${OBJTOP}/usr.sbin/bsnmpd/tools/libbsnmptools LIBBSNMPTOOLS?= ${LIBBSNMPTOOLSDIR}/libbsnmptools${PIE_SUFFIX}.a LIBAMUDIR= ${OBJTOP}/usr.sbin/amd/libamu LIBAMU?= ${LIBAMUDIR}/libamu${PIE_SUFFIX}.a LIBBE?= ${LIBBEDIR}/libbe${PIE_SUFFIX}.a LIBPMCSTATDIR= ${OBJTOP}/lib/libpmcstat LIBPMCSTAT?= ${LIBPMCSTATDIR}/libpmcstat${PIE_SUFFIX}.a LIBC_NOSSP_PICDIR= ${OBJTOP}/lib/libc LIBC_NOSSP_PIC?= ${LIBC_NOSSP_PICDIR}/libc_nossp_pic.a # Define a directory for each library. This is useful for adding -L in when # not using a --sysroot or for meta mode bootstrapping when there is no # Makefile.depend. These are sorted by directory. 
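(Editorial sketch; not part of src.libnames.mk.) The internal libraries above all follow the same two-line pattern, which the sanity check at the end of this file insists on: a LIB<NAME>DIR pointing into ${OBJTOP} and a LIB<NAME> naming the static archive, with the PIE suffix as in the existing entries. In the abstract, for a hypothetical libfoo:

        # Shape of an _INTERNALLIBS entry (libfoo is hypothetical)
        LIBFOODIR=      ${OBJTOP}/lib/libfoo
        LIBFOO?=        ${LIBFOODIR}/libfoo${PIE_SUFFIX}.a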
LIBAVLDIR= ${OBJTOP}/cddl/lib/libavl LIBCTFDIR= ${OBJTOP}/cddl/lib/libctf LIBDTRACEDIR= ${OBJTOP}/cddl/lib/libdtrace LIBNVPAIRDIR= ${OBJTOP}/cddl/lib/libnvpair LIBUMEMDIR= ${OBJTOP}/cddl/lib/libumem LIBUUTILDIR= ${OBJTOP}/cddl/lib/libuutil LIBZFSDIR= ${OBJTOP}/cddl/lib/libzfs LIBZFS_COREDIR= ${OBJTOP}/cddl/lib/libzfs_core LIBZPOOLDIR= ${OBJTOP}/cddl/lib/libzpool # OFED support LIBCXGB4DIR= ${OBJTOP}/lib/ofed/libcxgb4 LIBIBCMDIR= ${OBJTOP}/lib/ofed/libibcm LIBIBMADDIR= ${OBJTOP}/lib/ofed/libibmad LIBIBNETDISCDIR=${OBJTOP}/lib/ofed/libibnetdisc LIBIBUMADDIR= ${OBJTOP}/lib/ofed/libibumad LIBIBVERBSDIR= ${OBJTOP}/lib/ofed/libibverbs LIBMLX4DIR= ${OBJTOP}/lib/ofed/libmlx4 LIBMLX5DIR= ${OBJTOP}/lib/ofed/libmlx5 LIBRDMACMDIR= ${OBJTOP}/lib/ofed/librdmacm LIBOSMCOMPDIR= ${OBJTOP}/lib/ofed/complib LIBOPENSMDIR= ${OBJTOP}/lib/ofed/libopensm LIBOSMVENDORDIR=${OBJTOP}/lib/ofed/libvendor LIBDIALOGDIR= ${OBJTOP}/gnu/lib/libdialog LIBGCOVDIR= ${OBJTOP}/gnu/lib/libgcov LIBGOMPDIR= ${OBJTOP}/gnu/lib/libgomp LIBGNUREGEXDIR= ${OBJTOP}/gnu/lib/libregex LIBSSPDIR= ${OBJTOP}/gnu/lib/libssp LIBSSP_NONSHAREDDIR= ${OBJTOP}/gnu/lib/libssp/libssp_nonshared LIBSUPCPLUSPLUSDIR= ${OBJTOP}/gnu/lib/libsupc++ LIBASN1DIR= ${OBJTOP}/kerberos5/lib/libasn1 LIBGSSAPI_KRB5DIR= ${OBJTOP}/kerberos5/lib/libgssapi_krb5 LIBGSSAPI_NTLMDIR= ${OBJTOP}/kerberos5/lib/libgssapi_ntlm LIBGSSAPI_SPNEGODIR= ${OBJTOP}/kerberos5/lib/libgssapi_spnego LIBHDBDIR= ${OBJTOP}/kerberos5/lib/libhdb LIBHEIMBASEDIR= ${OBJTOP}/kerberos5/lib/libheimbase LIBHEIMIPCCDIR= ${OBJTOP}/kerberos5/lib/libheimipcc LIBHEIMIPCSDIR= ${OBJTOP}/kerberos5/lib/libheimipcs LIBHEIMNTLMDIR= ${OBJTOP}/kerberos5/lib/libheimntlm LIBHX509DIR= ${OBJTOP}/kerberos5/lib/libhx509 LIBKADM5CLNTDIR= ${OBJTOP}/kerberos5/lib/libkadm5clnt LIBKADM5SRVDIR= ${OBJTOP}/kerberos5/lib/libkadm5srv LIBKAFS5DIR= ${OBJTOP}/kerberos5/lib/libkafs5 LIBKDCDIR= ${OBJTOP}/kerberos5/lib/libkdc LIBKRB5DIR= ${OBJTOP}/kerberos5/lib/libkrb5 LIBROKENDIR= ${OBJTOP}/kerberos5/lib/libroken LIBWINDDIR= ${OBJTOP}/kerberos5/lib/libwind LIBATF_CDIR= ${OBJTOP}/lib/atf/libatf-c LIBATF_CXXDIR= ${OBJTOP}/lib/atf/libatf-c++ LIBGMOCKDIR= ${OBJTOP}/lib/googletest/gmock LIBGMOCK_MAINDIR= ${OBJTOP}/lib/googletest/gmock_main LIBGTESTDIR= ${OBJTOP}/lib/googletest/gtest LIBGTEST_MAINDIR= ${OBJTOP}/lib/googletest/gtest_main LIBALIASDIR= ${OBJTOP}/lib/libalias/libalias LIBBLACKLISTDIR= ${OBJTOP}/lib/libblacklist LIBBLOCKSRUNTIMEDIR= ${OBJTOP}/lib/libblocksruntime LIBBSNMPDIR= ${OBJTOP}/lib/libbsnmp/libbsnmp LIBCASPERDIR= ${OBJTOP}/lib/libcasper/libcasper LIBCAP_DNSDIR= ${OBJTOP}/lib/libcasper/services/cap_dns LIBCAP_GRPDIR= ${OBJTOP}/lib/libcasper/services/cap_grp LIBCAP_PWDDIR= ${OBJTOP}/lib/libcasper/services/cap_pwd LIBCAP_SYSCTLDIR= ${OBJTOP}/lib/libcasper/services/cap_sysctl LIBCAP_SYSLOGDIR= ${OBJTOP}/lib/libcasper/services/cap_syslog LIBBSDXMLDIR= ${OBJTOP}/lib/libexpat LIBKVMDIR= ${OBJTOP}/lib/libkvm LIBPTHREADDIR= ${OBJTOP}/lib/libthr LIBMDIR= ${OBJTOP}/lib/msun LIBFORMDIR= ${OBJTOP}/lib/ncurses/form LIBFORMLIBWDIR= ${OBJTOP}/lib/ncurses/formw LIBMENUDIR= ${OBJTOP}/lib/ncurses/menu LIBMENULIBWDIR= ${OBJTOP}/lib/ncurses/menuw LIBNCURSESDIR= ${OBJTOP}/lib/ncurses/ncurses LIBNCURSESWDIR= ${OBJTOP}/lib/ncurses/ncursesw LIBPANELDIR= ${OBJTOP}/lib/ncurses/panel LIBPANELWDIR= ${OBJTOP}/lib/ncurses/panelw LIBCRYPTODIR= ${OBJTOP}/secure/lib/libcrypto LIBSSHDIR= ${OBJTOP}/secure/lib/libssh LIBSSLDIR= ${OBJTOP}/secure/lib/libssl LIBTEKENDIR= ${OBJTOP}/sys/teken/libteken LIBEGACYDIR= ${OBJTOP}/tools/build LIBLNDIR= 
${OBJTOP}/usr.bin/lex/lib LIBTERMCAPDIR= ${LIBNCURSESDIR} LIBTERMCAPWDIR= ${LIBNCURSESWDIR} # Default other library directories to lib/libNAME. .for lib in ${_LIBRARIES} LIB${lib:tu}DIR?= ${OBJTOP}/lib/lib${lib} .endfor # Validate that listed LIBADD are valid. .for _l in ${LIBADD} .if empty(_LIBRARIES:M${_l}) _BADLIBADD+= ${_l} .endif .endfor .if !empty(_BADLIBADD) .error ${.CURDIR}: Invalid LIBADD used which may need to be added to ${_this:T}: ${_BADLIBADD} .endif # Sanity check that libraries are defined here properly when building them. .if defined(LIB) && ${_LIBRARIES:M${LIB}} != "" .if !empty(LIBADD) && \ (!defined(_DP_${LIB}) || ${LIBADD:O:u} != ${_DP_${LIB}:O:u}) .error ${.CURDIR}: Missing or incorrect _DP_${LIB} entry in ${_this:T}. Should match LIBADD for ${LIB} ('${LIBADD}' vs '${_DP_${LIB}}') .endif # Note that OBJTOP is not yet defined here but for the purpose of the check # it is fine as it resolves to the SRC directory. .if !defined(LIB${LIB:tu}DIR) || !exists(${SRCTOP}/${LIB${LIB:tu}DIR:S,^${OBJTOP}/,,}) .error ${.CURDIR}: Missing or incorrect value for LIB${LIB:tu}DIR in ${_this:T}: ${LIB${LIB:tu}DIR:S,^${OBJTOP}/,,} .endif .if ${_INTERNALLIBS:M${LIB}} != "" && !defined(LIB${LIB:tu}) .error ${.CURDIR}: Missing value for LIB${LIB:tu} in ${_this:T}. Likely should be: LIB${LIB:tu}?= $${LIB${LIB:tu}DIR}/lib${LIB}.a .endif .endif .endif # !target(____) Index: projects/clang900-import/share/mk/src.opts.mk =================================================================== --- projects/clang900-import/share/mk/src.opts.mk (revision 352536) +++ projects/clang900-import/share/mk/src.opts.mk (revision 352537) @@ -1,588 +1,596 @@ # $FreeBSD$ # # Option file for FreeBSD /usr/src builds. # # Users define WITH_FOO and WITHOUT_FOO on the command line or in /etc/src.conf # and /etc/make.conf files. These translate in the build system to MK_FOO={yes,no} # with sensible (usually) defaults. # # Makefiles must include bsd.opts.mk after defining specific MK_FOO options that # are applicable for that Makefile (typically there are none, but sometimes there # are exceptions). Recursive makes usually add MK_FOO=no for options that they wish # to omit from that make. # # Makefiles must include bsd.mkopt.mk before they test the value of any MK_FOO # variable. # # Makefiles may also assume that this file is included by src.opts.mk should it # need variables defined there prior to the end of the Makefile where # bsd.{subdir,lib.bin}.mk is traditionally included. # # The old-style YES_FOO and NO_FOO are being phased out. No new instances of them # should be added. Old instances should be removed since they were just to # bridge the gap between FreeBSD 4 and FreeBSD 5. # # Makefiles should never test WITH_FOO or WITHOUT_FOO directly (although an # exception is made for _WITHOUT_SRCONF which turns off this mechanism # completely inside bsd.*.mk files). # .if !target(____) ____: .include # # Define MK_* variables (which are either "yes" or "no") for users # to set via WITH_*/WITHOUT_* in /etc/src.conf and override in the # make(1) environment. # These should be tested with `== "no"' or `!= "no"' in makefiles. # The NO_* variables should only be set by makefiles for variables # that haven't been converted over. # # These options are used by the src builds. Those listed in # __DEFAULT_YES_OPTIONS default to 'yes' and will build unless turned # off. __DEFAULT_NO_OPTIONS will default to 'no' and won't build # unless turned on. Any options listed in 'BROKEN_OPTIONS' will be # hard-wired to 'no'. 
"Broken" here means not working or # not-appropriate and/or not supported. It doesn't imply something is # wrong with the code. There's not a single good word for this, so # BROKEN was selected as the least imperfect one considered at the # time. Options are added to BROKEN_OPTIONS list on a per-arch basis. # At this time, there's no provision for mutually incompatible options. __DEFAULT_YES_OPTIONS = \ ACCT \ ACPI \ AMD \ APM \ AT \ ATM \ AUDIT \ AUTHPF \ AUTOFS \ BHYVE \ BINUTILS \ BINUTILS_BOOTSTRAP \ BLACKLIST \ BLUETOOTH \ BOOT \ BOOTPARAMD \ BOOTPD \ BSD_CPIO \ BSD_CRTBEGIN \ BSDINSTALL \ BSNMP \ BZIP2 \ CALENDAR \ CAPSICUM \ CASPER \ CCD \ CDDL \ CPP \ CROSS_COMPILER \ CRYPT \ CUSE \ CXX \ CXGBETOOL \ DIALOG \ DICT \ DMAGENT \ DYNAMICROOT \ EE \ EFI \ ELFTOOLCHAIN_BOOTSTRAP \ EXAMPLES \ FDT \ FILE \ FINGER \ FLOPPY \ FMTREE \ FORTH \ FP_LIBC \ FREEBSD_UPDATE \ FTP \ GAMES \ GCOV \ GDB \ GNU_DIFF \ GNU_GREP \ - GOOGLETEST \ GPIO \ HAST \ HTML \ HYPERV \ ICONV \ INET \ INET6 \ INETD \ IPFILTER \ IPFW \ ISCSI \ JAIL \ KDUMP \ KVM \ LDNS \ LDNS_UTILS \ LEGACY_CONSOLE \ LIB32 \ LIBPTHREAD \ LIBTHR \ LLVM_COV \ LOADER_GELI \ LOADER_LUA \ LOADER_OFW \ LOADER_UBOOT \ LOCALES \ LOCATE \ LPR \ LS_COLORS \ LZMA_SUPPORT \ MAIL \ MAILWRAPPER \ MAKE \ MLX5TOOL \ NDIS \ NETCAT \ NETGRAPH \ NLS_CATALOGS \ NS_CACHING \ NTP \ NVME \ OFED \ OPENSSL \ PAM \ PF \ PKGBOOTSTRAP \ PMC \ PORTSNAP \ PPP \ QUOTAS \ RADIUS_SUPPORT \ RBOOTD \ RESCUE \ ROUTED \ SENDMAIL \ SERVICESDB \ SETUID_LOGIN \ SHAREDOCS \ SOURCELESS \ SOURCELESS_HOST \ SOURCELESS_UCODE \ SVNLITE \ SYSCONS \ SYSTEM_COMPILER \ SYSTEM_LINKER \ TALK \ TCP_WRAPPERS \ TCSH \ TELNET \ TEXTPROC \ TFTP \ UNBOUND \ USB \ UTMPX \ VI \ VT \ WIRELESS \ WPA_SUPPLICANT_EAPOL \ ZFS \ LOADER_ZFS \ ZONEINFO __DEFAULT_NO_OPTIONS = \ BEARSSL \ BSD_GREP \ CLANG_EXTRAS \ DTRACE_TESTS \ EXPERIMENTAL \ GNU_GREP_COMPAT \ HESIOD \ LIBSOFT \ LOADER_FIREWIRE \ LOADER_FORCE_LE \ LOADER_VERBOSE \ LOADER_VERIEXEC_PASS_MANIFEST \ OFED_EXTRA \ OPENLDAP \ REPRODUCIBLE_BUILD \ RPCBIND_WARMSTART_SUPPORT \ SHARED_TOOLCHAIN \ SORT_THREADS \ SVN \ ZONEINFO_LEAPSECONDS_SUPPORT \ ZONEINFO_OLD_TIMEZONES_SUPPORT \ # LEFT/RIGHT. Left options which default to "yes" unless their corresponding # RIGHT option is disabled. __DEFAULT_DEPENDENT_OPTIONS= \ CLANG_FULL/CLANG \ LLVM_TARGET_ALL/CLANG \ LOADER_VERIEXEC/BEARSSL \ LOADER_EFI_SECUREBOOT/LOADER_VERIEXEC \ VERIEXEC/BEARSSL \ # MK_*_SUPPORT options which default to "yes" unless their corresponding # MK_* variable is set to "no". # .for var in \ BLACKLIST \ BZIP2 \ INET \ INET6 \ KERBEROS \ KVM \ NETGRAPH \ PAM \ TESTS \ WIRELESS __DEFAULT_DEPENDENT_OPTIONS+= ${var}_SUPPORT/${var} .endfor # # Default behaviour of some options depends on the architecture. Unfortunately # this means that we have to test TARGET_ARCH (the buildworld case) as well # as MACHINE_ARCH (the non-buildworld case). Normally TARGET_ARCH is not # used at all in bsd.*.mk, but we have to make an exception here if we want # to allow defaults for some things like clang to vary by target architecture. # Additional, per-target behavior should be rarely added only after much # gnashing of teeth and grinding of gears. # .if defined(TARGET_ARCH) __T=${TARGET_ARCH} .else __T=${MACHINE_ARCH} .endif .if defined(TARGET) __TT=${TARGET} .else __TT=${MACHINE} +.endif + +# Default GOOGLETEST to off for MIPS while LLVM PR 43263 is active. Part +# of the fusefs tests trigger excessively long compile times. 
It does +# eventually succeed, but this shouldn't be forced on those building by default. +.if ${__TT} == "mips" +__DEFAULT_NO_OPTIONS+= GOOGLETEST +.else +__DEFAULT_YES_OPTIONS+= GOOGLETEST .endif # All supported backends for LLVM_TARGET_XXX __LLVM_TARGETS= \ aarch64 \ arm \ mips \ powerpc \ sparc \ x86 __LLVM_TARGET_FILT= C/(amd64|i386)/x86/:S/sparc64/sparc/:S/arm64/aarch64/:S/powerpc64/powerpc/ .for __llt in ${__LLVM_TARGETS} # Default the given TARGET's LLVM_TARGET support to the value of MK_CLANG. .if ${__TT:${__LLVM_TARGET_FILT}} == ${__llt} __DEFAULT_DEPENDENT_OPTIONS+= LLVM_TARGET_${__llt:${__LLVM_TARGET_FILT}:tu}/CLANG # Disable other targets for arm and armv6, to work around "relocation truncated # to fit" errors with BFD ld, since libllvm.a will get too large to link. .elif ${__T} == "arm" || ${__T} == "armv6" __DEFAULT_NO_OPTIONS+=LLVM_TARGET_${__llt:tu} # aarch64 needs arm for -m32 support. .elif ${__TT} == "arm64" && ${__llt} == "arm" __DEFAULT_DEPENDENT_OPTIONS+= LLVM_TARGET_ARM/LLVM_TARGET_AARCH64 # Default the rest of the LLVM_TARGETs to the value of MK_LLVM_TARGET_ALL # which is based on MK_CLANG. .else __DEFAULT_DEPENDENT_OPTIONS+= LLVM_TARGET_${__llt:${__LLVM_TARGET_FILT}:tu}/LLVM_TARGET_ALL .endif .endfor __DEFAULT_NO_OPTIONS+=LLVM_TARGET_BPF __DEFAULT_NO_OPTIONS+=LLVM_TARGET_RISCV .include # If the compiler is not C++11 capable, disable Clang and use GCC instead. # This means that architectures that have GCC 4.2 as default can not # build Clang without using an external compiler. .if ${COMPILER_FEATURES:Mc++11} && (${__T} == "aarch64" || \ ${__T} == "amd64" || ${__TT} == "arm" || ${__T} == "i386") # Clang is enabled, and will be installed as the default /usr/bin/cc. __DEFAULT_YES_OPTIONS+=CLANG CLANG_BOOTSTRAP CLANG_IS_CC LLD __DEFAULT_NO_OPTIONS+=GCC GCC_BOOTSTRAP GNUCXX GPL_DTC .elif ${COMPILER_FEATURES:Mc++11} && ${__T:Mriscv*} == "" && ${__T} != "sparc64" # If an external compiler that supports C++11 is used as ${CC} and Clang # supports the target, then Clang is enabled but GCC is installed as the # default /usr/bin/cc. __DEFAULT_YES_OPTIONS+=CLANG GCC GCC_BOOTSTRAP GNUCXX GPL_DTC LLD __DEFAULT_NO_OPTIONS+=CLANG_BOOTSTRAP CLANG_IS_CC .else # Everything else disables Clang, and uses GCC instead. __DEFAULT_YES_OPTIONS+=GCC GCC_BOOTSTRAP GNUCXX GPL_DTC __DEFAULT_NO_OPTIONS+=CLANG CLANG_BOOTSTRAP CLANG_IS_CC LLD .endif # In-tree binutils/gcc are older versions without modern architecture support. .if ${__T} == "aarch64" || ${__T:Mriscv*} != "" BROKEN_OPTIONS+=BINUTILS BINUTILS_BOOTSTRAP GCC GCC_BOOTSTRAP GDB .endif .if ${__T:Mriscv*} != "" BROKEN_OPTIONS+=OFED .endif .if ${__T} == "aarch64" || ${__T} == "amd64" || ${__T} == "i386" || \ ${__T:Mriscv*} != "" || ${__TT} == "mips" __DEFAULT_YES_OPTIONS+=LLVM_LIBUNWIND .else __DEFAULT_NO_OPTIONS+=LLVM_LIBUNWIND .endif .if ${__T} == "aarch64" || ${__T} == "amd64" || ${__T} == "armv7" || \ ${__T} == "i386" __DEFAULT_YES_OPTIONS+=LLD_BOOTSTRAP LLD_IS_LD .else __DEFAULT_NO_OPTIONS+=LLD_BOOTSTRAP LLD_IS_LD .endif .if ${__T} == "aarch64" || ${__T} == "amd64" || ${__T} == "i386" __DEFAULT_YES_OPTIONS+=LLDB .else __DEFAULT_NO_OPTIONS+=LLDB .endif # LLVM lacks support for FreeBSD 64-bit atomic operations for ARMv4/ARMv5 .if ${__T} == "arm" BROKEN_OPTIONS+=LLDB .endif # GDB in base is generally less functional than GDB in ports. Ports GDB # sparc64 kernel support has not been tested. 
.if ${__T} == "sparc64" __DEFAULT_NO_OPTIONS+=GDB_LIBEXEC .else __DEFAULT_YES_OPTIONS+=GDB_LIBEXEC .endif # Only doing soft float API stuff on armv6 and armv7 .if ${__T} != "armv6" && ${__T} != "armv7" BROKEN_OPTIONS+=LIBSOFT .endif .if ${__T:Mmips*} BROKEN_OPTIONS+=SSP .endif # EFI doesn't exist on mips, powerpc, sparc or riscv. .if ${__T:Mmips*} || ${__T:Mpowerpc*} || ${__T:Msparc64} || ${__T:Mriscv*} BROKEN_OPTIONS+=EFI .endif # OFW is only for powerpc and sparc64, exclude others .if ${__T:Mpowerpc*} == "" && ${__T:Msparc64} == "" BROKEN_OPTIONS+=LOADER_OFW .endif # UBOOT is only for arm, mips and powerpc, exclude others .if ${__T:Marm*} == "" && ${__T:Mmips*} == "" && ${__T:Mpowerpc*} == "" BROKEN_OPTIONS+=LOADER_UBOOT .endif # GELI and Lua in loader currently cause boot failures on sparc64 and powerpc. # Further debugging is required -- probably they are just broken on big # endian systems generically (they jump to null pointers or try to read # crazy high addresses, which is typical of endianness problems). .if ${__T} == "sparc64" || ${__T:Mpowerpc*} BROKEN_OPTIONS+=LOADER_GELI LOADER_LUA .endif .if ${__T:Mmips64*} # profiling won't work on MIPS64 because there is only assembly for o32 BROKEN_OPTIONS+=PROFILE .endif .if ${__T} != "aarch64" && ${__T} != "amd64" && ${__T} != "i386" && \ ${__T} != "powerpc64" && ${__T} != "sparc64" BROKEN_OPTIONS+=CXGBETOOL BROKEN_OPTIONS+=MLX5TOOL .endif # HyperV is currently x86-only .if ${__T} != "amd64" && ${__T} != "i386" BROKEN_OPTIONS+=HYPERV .endif # NVME is only aarch64, x86 and powerpc64 .if ${__T} != "aarch64" && ${__T} != "amd64" && ${__T} != "i386" && ${__T} != "powerpc64" BROKEN_OPTIONS+=NVME .endif # Sparc64 need extra crt*.o files .if ${__T:Msparc64} BROKEN_OPTIONS+=BSD_CRTBEGIN .endif .if ${COMPILER_FEATURES:Mc++11} && (${__T} == "amd64" || ${__T} == "i386") __DEFAULT_YES_OPTIONS+=OPENMP .else __DEFAULT_NO_OPTIONS+=OPENMP .endif .include # # MK_* options that default to "yes" if the compiler is a C++11 compiler. # .for var in \ LIBCPLUSPLUS .if !defined(MK_${var}) .if ${COMPILER_FEATURES:Mc++11} .if defined(WITHOUT_${var}) MK_${var}:= no .else MK_${var}:= yes .endif .else .if defined(WITH_${var}) MK_${var}:= yes .else MK_${var}:= no .endif .endif .endif .endfor # # Force some options off if their dependencies are off. # Order is somewhat important. 
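(Editorial sketch; not part of src.opts.mk.) The forcing rules that follow are what turn one user-visible knob into a consistent set of MK_* values. Under the rules below, a hypothetical /etc/src.conf containing only WITHOUT_CLANG would come out roughly as:

        # User input (hypothetical /etc/src.conf)
        WITHOUT_CLANG=yes

        # Effective result after src.opts.mk (approximate)
        #   MK_CLANG=no
        #   MK_CLANG_EXTRAS=no  MK_CLANG_FULL=no  MK_LLVM_COV=no
        #   (forced off below because they depend on MK_CLANG)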
# .if !${COMPILER_FEATURES:Mc++11} MK_GOOGLETEST:= no MK_LLVM_LIBUNWIND:= no .endif .if ${MK_CAPSICUM} == "no" MK_CASPER:= no .endif .if ${MK_LIBPTHREAD} == "no" MK_LIBTHR:= no .endif .if ${MK_LDNS} == "no" MK_LDNS_UTILS:= no MK_UNBOUND:= no .endif .if ${MK_SOURCELESS} == "no" MK_SOURCELESS_HOST:= no MK_SOURCELESS_UCODE:= no .endif .if ${MK_CDDL} == "no" MK_ZFS:= no MK_LOADER_ZFS:= no MK_CTF:= no .endif .if ${MK_CRYPT} == "no" MK_OPENSSL:= no MK_OPENSSH:= no MK_KERBEROS:= no .endif .if ${MK_CXX} == "no" MK_CLANG:= no MK_GNUCXX:= no MK_TESTS:= no .endif .if ${MK_DIALOG} == "no" MK_BSDINSTALL:= no .endif .if ${MK_MAIL} == "no" MK_MAILWRAPPER:= no MK_SENDMAIL:= no MK_DMAGENT:= no .endif .if ${MK_NETGRAPH} == "no" MK_ATM:= no MK_BLUETOOTH:= no .endif .if ${MK_NLS} == "no" MK_NLS_CATALOGS:= no .endif .if ${MK_OPENSSL} == "no" MK_OPENSSH:= no MK_KERBEROS:= no .endif .if ${MK_PF} == "no" MK_AUTHPF:= no .endif .if ${MK_OFED} == "no" MK_OFED_EXTRA:= no .endif .if ${MK_PORTSNAP} == "no" # freebsd-update depends on phttpget from portsnap MK_FREEBSD_UPDATE:= no .endif .if ${MK_TESTS} == "no" MK_DTRACE_TESTS:= no .endif .if ${MK_TESTS_SUPPORT} == "no" MK_GOOGLETEST:= no .endif .if ${MK_ZONEINFO} == "no" MK_ZONEINFO_LEAPSECONDS_SUPPORT:= no MK_ZONEINFO_OLD_TIMEZONES_SUPPORT:= no .endif .if ${MK_CROSS_COMPILER} == "no" MK_BINUTILS_BOOTSTRAP:= no MK_CLANG_BOOTSTRAP:= no MK_ELFTOOLCHAIN_BOOTSTRAP:= no MK_GCC_BOOTSTRAP:= no MK_LLD_BOOTSTRAP:= no .endif .if ${MK_TOOLCHAIN} == "no" MK_BINUTILS:= no MK_CLANG:= no MK_GCC:= no MK_GDB:= no MK_INCLUDES:= no MK_LLD:= no MK_LLDB:= no .endif .if ${MK_CLANG} == "no" MK_CLANG_EXTRAS:= no MK_CLANG_FULL:= no MK_LLVM_COV:= no .endif .if ${MK_LOADER_VERIEXEC} == "no" MK_LOADER_VERIEXEC_PASS_MANIFEST := no .endif # # MK_* options whose default value depends on another option. # .for vv in \ GSSAPI/KERBEROS \ MAN_UTILS/MAN .if defined(WITH_${vv:H}) MK_${vv:H}:= yes .elif defined(WITHOUT_${vv:H}) MK_${vv:H}:= no .else MK_${vv:H}:= ${MK_${vv:T}} .endif .endfor # # Set defaults for the MK_*_SUPPORT variables. # .if !${COMPILER_FEATURES:Mc++11} MK_LLDB:= no .endif # gcc 4.8 and newer supports libc++, so suppress gnuc++ in that case. # while in theory we could build it with that, we don't want to do # that since it creates too much confusion for too little gain. # XXX: This is incomplete and needs X_COMPILER_TYPE/VERSION checks too # to prevent Makefile.inc1 from bootstrapping unneeded dependencies # and to support 'make delete-old' when supplying an external toolchain. .if ${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} >= 40800 MK_GNUCXX:=no MK_GCC:=no .endif .endif # !target(____) Index: projects/clang900-import/stand/efi/libefi/efipart.c =================================================================== --- projects/clang900-import/stand/efi/libefi/efipart.c (revision 352536) +++ projects/clang900-import/stand/efi/libefi/efipart.c (revision 352537) @@ -1,1067 +1,1156 @@ /*- * Copyright (c) 2010 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include static EFI_GUID blkio_guid = BLOCK_IO_PROTOCOL; static int efipart_initfd(void); static int efipart_initcd(void); static int efipart_inithd(void); static int efipart_strategy(void *, int, daddr_t, size_t, char *, size_t *); static int efipart_realstrategy(void *, int, daddr_t, size_t, char *, size_t *); static int efipart_open(struct open_file *, ...); static int efipart_close(struct open_file *); static int efipart_ioctl(struct open_file *, u_long, void *); static int efipart_printfd(int); static int efipart_printcd(int); static int efipart_printhd(int); /* EISA PNP ID's for floppy controllers */ #define PNP0604 0x604 #define PNP0700 0x700 #define PNP0701 0x701 +/* Bounce buffer max size */ +#define BIO_BUFFER_SIZE 0x4000 + struct devsw efipart_fddev = { .dv_name = "fd", .dv_type = DEVT_FD, .dv_init = efipart_initfd, .dv_strategy = efipart_strategy, .dv_open = efipart_open, .dv_close = efipart_close, .dv_ioctl = efipart_ioctl, .dv_print = efipart_printfd, .dv_cleanup = NULL }; struct devsw efipart_cddev = { .dv_name = "cd", .dv_type = DEVT_CD, .dv_init = efipart_initcd, .dv_strategy = efipart_strategy, .dv_open = efipart_open, .dv_close = efipart_close, .dv_ioctl = efipart_ioctl, .dv_print = efipart_printcd, .dv_cleanup = NULL }; struct devsw efipart_hddev = { .dv_name = "disk", .dv_type = DEVT_DISK, .dv_init = efipart_inithd, .dv_strategy = efipart_strategy, .dv_open = efipart_open, .dv_close = efipart_close, .dv_ioctl = efipart_ioctl, .dv_print = efipart_printhd, .dv_cleanup = NULL }; static pdinfo_list_t fdinfo = STAILQ_HEAD_INITIALIZER(fdinfo); static pdinfo_list_t cdinfo = STAILQ_HEAD_INITIALIZER(cdinfo); static pdinfo_list_t hdinfo = STAILQ_HEAD_INITIALIZER(hdinfo); /* * efipart_inithandles() is used to build up the pdinfo list from * block device handles. Then each devsw init callback is used to * pick items from pdinfo and move to proper device list. * In ideal world, we should end up with empty pdinfo once all * devsw initializers are called. 
*/ static pdinfo_list_t pdinfo = STAILQ_HEAD_INITIALIZER(pdinfo); pdinfo_list_t * efiblk_get_pdinfo_list(struct devsw *dev) { if (dev->dv_type == DEVT_DISK) return (&hdinfo); if (dev->dv_type == DEVT_CD) return (&cdinfo); if (dev->dv_type == DEVT_FD) return (&fdinfo); return (NULL); } /* XXX this gets called way way too often, investigate */ pdinfo_t * efiblk_get_pdinfo(struct devdesc *dev) { pdinfo_list_t *pdi; pdinfo_t *pd = NULL; pdi = efiblk_get_pdinfo_list(dev->d_dev); if (pdi == NULL) return (pd); STAILQ_FOREACH(pd, pdi, pd_link) { if (pd->pd_unit == dev->d_unit) return (pd); } return (pd); } pdinfo_t * efiblk_get_pdinfo_by_device_path(EFI_DEVICE_PATH *path) { EFI_HANDLE h; EFI_STATUS status; EFI_DEVICE_PATH *devp = path; status = BS->LocateDevicePath(&blkio_guid, &devp, &h); if (EFI_ERROR(status)) return (NULL); return (efiblk_get_pdinfo_by_handle(h)); } static bool same_handle(pdinfo_t *pd, EFI_HANDLE h) { return (pd->pd_handle == h || pd->pd_alias == h); } pdinfo_t * efiblk_get_pdinfo_by_handle(EFI_HANDLE h) { pdinfo_t *dp, *pp; /* * Check hard disks, then cd, then floppy */ STAILQ_FOREACH(dp, &hdinfo, pd_link) { if (same_handle(dp, h)) return (dp); STAILQ_FOREACH(pp, &dp->pd_part, pd_link) { if (same_handle(pp, h)) return (pp); } } STAILQ_FOREACH(dp, &cdinfo, pd_link) { if (same_handle(dp, h)) return (dp); STAILQ_FOREACH(pp, &dp->pd_part, pd_link) { if (same_handle(pp, h)) return (pp); } } STAILQ_FOREACH(dp, &fdinfo, pd_link) { if (same_handle(dp, h)) return (dp); } return (NULL); } static int efiblk_pdinfo_count(pdinfo_list_t *pdi) { pdinfo_t *pd; int i = 0; STAILQ_FOREACH(pd, pdi, pd_link) { i++; } return (i); } int efipart_inithandles(void) { unsigned i, nin; UINTN sz; EFI_HANDLE *hin; EFI_DEVICE_PATH *devpath; EFI_BLOCK_IO *blkio; EFI_STATUS status; pdinfo_t *pd; if (!STAILQ_EMPTY(&pdinfo)) return (0); sz = 0; hin = NULL; status = BS->LocateHandle(ByProtocol, &blkio_guid, 0, &sz, hin); if (status == EFI_BUFFER_TOO_SMALL) { hin = malloc(sz); status = BS->LocateHandle(ByProtocol, &blkio_guid, 0, &sz, hin); if (EFI_ERROR(status)) free(hin); } if (EFI_ERROR(status)) return (efi_status_to_errno(status)); nin = sz / sizeof(*hin); #ifdef EFIPART_DEBUG printf("%s: Got %d BLOCK IO MEDIA handle(s)\n", __func__, nin); #endif for (i = 0; i < nin; i++) { /* * Get devpath and open protocol. * We should not get errors here */ if ((devpath = efi_lookup_devpath(hin[i])) == NULL) continue; status = OpenProtocolByHandle(hin[i], &blkio_guid, (void **)&blkio); if (EFI_ERROR(status)) { printf("error %lu\n", EFI_ERROR_CODE(status)); continue; } /* * We assume the block size 512 or greater power of 2. * Also skip devices with block size > 64k (16 is max * ashift supported by zfs). * iPXE is known to insert stub BLOCK IO device with * BlockSize 1. */ if (blkio->Media->BlockSize < 512 || blkio->Media->BlockSize > (1 << 16) || !powerof2(blkio->Media->BlockSize)) { continue; } + /* Allowed values are 0, 1 and power of 2. */ + if (blkio->Media->IoAlign > 1 && + !powerof2(blkio->Media->IoAlign)) { + continue; + } + /* This is bad. 
*/ if ((pd = calloc(1, sizeof(*pd))) == NULL) { printf("efipart_inithandles: Out of memory.\n"); free(hin); return (ENOMEM); } STAILQ_INIT(&pd->pd_part); pd->pd_handle = hin[i]; pd->pd_devpath = devpath; pd->pd_blkio = blkio; STAILQ_INSERT_TAIL(&pdinfo, pd, pd_link); } free(hin); return (0); } static ACPI_HID_DEVICE_PATH * efipart_floppy(EFI_DEVICE_PATH *node) { ACPI_HID_DEVICE_PATH *acpi; if (DevicePathType(node) == ACPI_DEVICE_PATH && DevicePathSubType(node) == ACPI_DP) { acpi = (ACPI_HID_DEVICE_PATH *) node; if (acpi->HID == EISA_PNP_ID(PNP0604) || acpi->HID == EISA_PNP_ID(PNP0700) || acpi->HID == EISA_PNP_ID(PNP0701)) { return (acpi); } } return (NULL); } static pdinfo_t * efipart_find_parent(pdinfo_list_t *pdi, EFI_DEVICE_PATH *devpath) { pdinfo_t *pd; STAILQ_FOREACH(pd, pdi, pd_link) { if (efi_devpath_is_prefix(pd->pd_devpath, devpath)) return (pd); } return (NULL); } static int efipart_initfd(void) { EFI_DEVICE_PATH *node; ACPI_HID_DEVICE_PATH *acpi; pdinfo_t *parent, *fd; restart: STAILQ_FOREACH(fd, &pdinfo, pd_link) { if ((node = efi_devpath_last_node(fd->pd_devpath)) == NULL) continue; if ((acpi = efipart_floppy(node)) == NULL) continue; STAILQ_REMOVE(&pdinfo, fd, pdinfo, pd_link); parent = efipart_find_parent(&pdinfo, fd->pd_devpath); if (parent != NULL) { STAILQ_REMOVE(&pdinfo, parent, pdinfo, pd_link); parent->pd_alias = fd->pd_handle; parent->pd_unit = acpi->UID; free(fd); fd = parent; } else { fd->pd_unit = acpi->UID; } fd->pd_devsw = &efipart_fddev; STAILQ_INSERT_TAIL(&fdinfo, fd, pd_link); goto restart; } bcache_add_dev(efiblk_pdinfo_count(&fdinfo)); return (0); } /* * Add or update entries with new handle data. */ static void efipart_cdinfo_add(pdinfo_t *cd) { pdinfo_t *pd, *last; STAILQ_FOREACH(pd, &cdinfo, pd_link) { if (efi_devpath_is_prefix(pd->pd_devpath, cd->pd_devpath)) { last = STAILQ_LAST(&pd->pd_part, pdinfo, pd_link); if (last != NULL) cd->pd_unit = last->pd_unit + 1; else cd->pd_unit = 0; cd->pd_parent = pd; cd->pd_devsw = &efipart_cddev; STAILQ_INSERT_TAIL(&pd->pd_part, cd, pd_link); return; } } last = STAILQ_LAST(&cdinfo, pdinfo, pd_link); if (last != NULL) cd->pd_unit = last->pd_unit + 1; else cd->pd_unit = 0; cd->pd_parent = NULL; cd->pd_devsw = &efipart_cddev; STAILQ_INSERT_TAIL(&cdinfo, cd, pd_link); } static bool efipart_testcd(EFI_DEVICE_PATH *node, EFI_BLOCK_IO *blkio) { if (DevicePathType(node) == MEDIA_DEVICE_PATH && DevicePathSubType(node) == MEDIA_CDROM_DP) { return (true); } /* cd drive without the media. */ if (blkio->Media->RemovableMedia && !blkio->Media->MediaPresent) { return (true); } return (false); } static void efipart_updatecd(void) { EFI_DEVICE_PATH *devpath, *node; EFI_STATUS status; pdinfo_t *parent, *cd; restart: STAILQ_FOREACH(cd, &pdinfo, pd_link) { if ((node = efi_devpath_last_node(cd->pd_devpath)) == NULL) continue; if (efipart_floppy(node) != NULL) continue; /* Is parent of this device already registered? 
*/ parent = efipart_find_parent(&cdinfo, cd->pd_devpath); if (parent != NULL) { STAILQ_REMOVE(&pdinfo, cd, pdinfo, pd_link); efipart_cdinfo_add(cd); goto restart; } if (!efipart_testcd(node, cd->pd_blkio)) continue; /* Find parent and unlink both parent and cd from pdinfo */ STAILQ_REMOVE(&pdinfo, cd, pdinfo, pd_link); parent = efipart_find_parent(&pdinfo, cd->pd_devpath); if (parent != NULL) { STAILQ_REMOVE(&pdinfo, parent, pdinfo, pd_link); efipart_cdinfo_add(parent); } if (parent == NULL) parent = efipart_find_parent(&cdinfo, cd->pd_devpath); /* * If we come across a logical partition of subtype CDROM * it doesn't refer to the CD filesystem itself, but rather * to any usable El Torito boot image on it. In this case * we try to find the parent device and add that instead as * that will be the CD filesystem. */ if (DevicePathType(node) == MEDIA_DEVICE_PATH && DevicePathSubType(node) == MEDIA_CDROM_DP && parent == NULL) { parent = calloc(1, sizeof(*parent)); if (parent == NULL) { printf("efipart_updatecd: out of memory\n"); /* this device is lost but try again. */ free(cd); goto restart; } devpath = efi_devpath_trim(cd->pd_devpath); if (devpath == NULL) { printf("efipart_updatecd: out of memory\n"); /* this device is lost but try again. */ free(parent); free(cd); goto restart; } parent->pd_devpath = devpath; status = BS->LocateDevicePath(&blkio_guid, &parent->pd_devpath, &parent->pd_handle); free(devpath); if (EFI_ERROR(status)) { printf("efipart_updatecd: error %lu\n", EFI_ERROR_CODE(status)); free(parent); free(cd); goto restart; } parent->pd_devpath = efi_lookup_devpath(parent->pd_handle); efipart_cdinfo_add(parent); } efipart_cdinfo_add(cd); goto restart; } } static int efipart_initcd(void) { efipart_updatecd(); bcache_add_dev(efiblk_pdinfo_count(&cdinfo)); return (0); } static void efipart_hdinfo_add(pdinfo_t *hd, HARDDRIVE_DEVICE_PATH *node) { pdinfo_t *pd, *last; STAILQ_FOREACH(pd, &hdinfo, pd_link) { if (efi_devpath_is_prefix(pd->pd_devpath, hd->pd_devpath)) { /* Add the partition. */ hd->pd_unit = node->PartitionNumber; hd->pd_parent = pd; hd->pd_devsw = &efipart_hddev; STAILQ_INSERT_TAIL(&pd->pd_part, hd, pd_link); return; } } last = STAILQ_LAST(&hdinfo, pdinfo, pd_link); if (last != NULL) hd->pd_unit = last->pd_unit + 1; else hd->pd_unit = 0; /* Add the disk. */ hd->pd_devsw = &efipart_hddev; STAILQ_INSERT_TAIL(&hdinfo, hd, pd_link); } /* * The MEDIA_FILEPATH_DP has device name. * From U-Boot sources it looks like names are in the form * of typeN:M, where type is interface type, N is disk id * and M is partition id. */ static void efipart_hdinfo_add_filepath(pdinfo_t *hd, FILEPATH_DEVICE_PATH *node) { char *pathname, *p; int len; pdinfo_t *last; last = STAILQ_LAST(&hdinfo, pdinfo, pd_link); if (last != NULL) hd->pd_unit = last->pd_unit + 1; else hd->pd_unit = 0; /* FILEPATH_DEVICE_PATH has 0 terminated string */ len = ucs2len(node->PathName); if ((pathname = malloc(len + 1)) == NULL) { printf("Failed to add disk, out of memory\n"); free(hd); return; } cpy16to8(node->PathName, pathname, len + 1); p = strchr(pathname, ':'); /* * Assume we are receiving handles in order, first disk handle, * then partitions for this disk. If this assumption proves * false, this code would need update. 
*/ if (p == NULL) { /* no colon, add the disk */ hd->pd_devsw = &efipart_hddev; STAILQ_INSERT_TAIL(&hdinfo, hd, pd_link); free(pathname); return; } p++; /* skip the colon */ errno = 0; hd->pd_unit = (int)strtol(p, NULL, 0); if (errno != 0) { printf("Bad unit number for partition \"%s\"\n", pathname); free(pathname); free(hd); return; } /* * We should have disk registered, if not, we are receiving * handles out of order, and this code should be reworked * to create "blank" disk for partition, and to find the * disk based on PathName compares. */ if (last == NULL) { printf("BUG: No disk for partition \"%s\"\n", pathname); free(pathname); free(hd); return; } /* Add the partition. */ hd->pd_parent = last; hd->pd_devsw = &efipart_hddev; STAILQ_INSERT_TAIL(&last->pd_part, hd, pd_link); free(pathname); } static void efipart_updatehd(void) { EFI_DEVICE_PATH *devpath, *node; EFI_STATUS status; pdinfo_t *parent, *hd; restart: STAILQ_FOREACH(hd, &pdinfo, pd_link) { if ((node = efi_devpath_last_node(hd->pd_devpath)) == NULL) continue; if (efipart_floppy(node) != NULL) continue; if (efipart_testcd(node, hd->pd_blkio)) continue; if (DevicePathType(node) == HARDWARE_DEVICE_PATH && (DevicePathSubType(node) == HW_PCI_DP || DevicePathSubType(node) == HW_VENDOR_DP)) { STAILQ_REMOVE(&pdinfo, hd, pdinfo, pd_link); efipart_hdinfo_add(hd, NULL); goto restart; } if (DevicePathType(node) == MEDIA_DEVICE_PATH && DevicePathSubType(node) == MEDIA_FILEPATH_DP) { STAILQ_REMOVE(&pdinfo, hd, pdinfo, pd_link); efipart_hdinfo_add_filepath(hd, (FILEPATH_DEVICE_PATH *)node); goto restart; } STAILQ_REMOVE(&pdinfo, hd, pdinfo, pd_link); parent = efipart_find_parent(&pdinfo, hd->pd_devpath); if (parent != NULL) { STAILQ_REMOVE(&pdinfo, parent, pdinfo, pd_link); efipart_hdinfo_add(parent, NULL); } else { parent = efipart_find_parent(&hdinfo, hd->pd_devpath); } if (DevicePathType(node) == MEDIA_DEVICE_PATH && DevicePathSubType(node) == MEDIA_HARDDRIVE_DP && parent == NULL) { parent = calloc(1, sizeof(*parent)); if (parent == NULL) { printf("efipart_updatehd: out of memory\n"); /* this device is lost but try again. */ free(hd); goto restart; } devpath = efi_devpath_trim(hd->pd_devpath); if (devpath == NULL) { printf("efipart_updatehd: out of memory\n"); /* this device is lost but try again. */ free(parent); free(hd); goto restart; } parent->pd_devpath = devpath; status = BS->LocateDevicePath(&blkio_guid, &parent->pd_devpath, &parent->pd_handle); free(devpath); if (EFI_ERROR(status)) { printf("efipart_updatehd: error %lu\n", EFI_ERROR_CODE(status)); free(parent); free(hd); goto restart; } parent->pd_devpath = efi_lookup_devpath(&parent->pd_handle); efipart_hdinfo_add(parent, NULL); } efipart_hdinfo_add(hd, (HARDDRIVE_DEVICE_PATH *)node); goto restart; } } static int efipart_inithd(void) { efipart_updatehd(); bcache_add_dev(efiblk_pdinfo_count(&hdinfo)); return (0); } static int efipart_print_common(struct devsw *dev, pdinfo_list_t *pdlist, int verbose) { int ret = 0; EFI_BLOCK_IO *blkio; EFI_STATUS status; EFI_HANDLE h; pdinfo_t *pd; CHAR16 *text; struct disk_devdesc pd_dev; char line[80]; if (STAILQ_EMPTY(pdlist)) return (0); printf("%s devices:", dev->dv_name); if ((ret = pager_output("\n")) != 0) return (ret); STAILQ_FOREACH(pd, pdlist, pd_link) { h = pd->pd_handle; if (verbose) { /* Output the device path. 
*/ text = efi_devpath_name(efi_lookup_devpath(h)); if (text != NULL) { printf(" %S", text); efi_free_devpath_name(text); if ((ret = pager_output("\n")) != 0) break; } } snprintf(line, sizeof(line), " %s%d", dev->dv_name, pd->pd_unit); printf("%s:", line); status = OpenProtocolByHandle(h, &blkio_guid, (void **)&blkio); if (!EFI_ERROR(status)) { printf(" %llu", blkio->Media->LastBlock == 0? 0: (unsigned long long) (blkio->Media->LastBlock + 1)); if (blkio->Media->LastBlock != 0) { printf(" X %u", blkio->Media->BlockSize); } printf(" blocks"); if (blkio->Media->MediaPresent) { if (blkio->Media->RemovableMedia) printf(" (removable)"); } else { printf(" (no media)"); } if ((ret = pager_output("\n")) != 0) break; if (!blkio->Media->MediaPresent) continue; pd->pd_blkio = blkio; pd_dev.dd.d_dev = dev; pd_dev.dd.d_unit = pd->pd_unit; pd_dev.d_slice = D_SLICENONE; pd_dev.d_partition = D_PARTNONE; ret = disk_open(&pd_dev, blkio->Media->BlockSize * (blkio->Media->LastBlock + 1), blkio->Media->BlockSize); if (ret == 0) { ret = disk_print(&pd_dev, line, verbose); disk_close(&pd_dev); if (ret != 0) return (ret); } else { /* Do not fail from disk_open() */ ret = 0; } } else { if ((ret = pager_output("\n")) != 0) break; } } return (ret); } static int efipart_printfd(int verbose) { return (efipart_print_common(&efipart_fddev, &fdinfo, verbose)); } static int efipart_printcd(int verbose) { return (efipart_print_common(&efipart_cddev, &cdinfo, verbose)); } static int efipart_printhd(int verbose) { return (efipart_print_common(&efipart_hddev, &hdinfo, verbose)); } static int efipart_open(struct open_file *f, ...) { va_list args; struct disk_devdesc *dev; pdinfo_t *pd; EFI_BLOCK_IO *blkio; EFI_STATUS status; va_start(args, f); dev = va_arg(args, struct disk_devdesc *); va_end(args); if (dev == NULL) return (EINVAL); pd = efiblk_get_pdinfo((struct devdesc *)dev); if (pd == NULL) return (EIO); if (pd->pd_blkio == NULL) { status = OpenProtocolByHandle(pd->pd_handle, &blkio_guid, (void **)&pd->pd_blkio); if (EFI_ERROR(status)) return (efi_status_to_errno(status)); } blkio = pd->pd_blkio; if (!blkio->Media->MediaPresent) return (EAGAIN); pd->pd_open++; if (pd->pd_bcache == NULL) pd->pd_bcache = bcache_allocate(); if (dev->dd.d_dev->dv_type == DEVT_DISK) { int rc; rc = disk_open(dev, blkio->Media->BlockSize * (blkio->Media->LastBlock + 1), blkio->Media->BlockSize); if (rc != 0) { pd->pd_open--; if (pd->pd_open == 0) { pd->pd_blkio = NULL; bcache_free(pd->pd_bcache); pd->pd_bcache = NULL; } } return (rc); } return (0); } static int efipart_close(struct open_file *f) { struct disk_devdesc *dev; pdinfo_t *pd; dev = (struct disk_devdesc *)(f->f_devdata); if (dev == NULL) return (EINVAL); pd = efiblk_get_pdinfo((struct devdesc *)dev); if (pd == NULL) return (EINVAL); pd->pd_open--; if (pd->pd_open == 0) { pd->pd_blkio = NULL; bcache_free(pd->pd_bcache); pd->pd_bcache = NULL; } if (dev->dd.d_dev->dv_type == DEVT_DISK) return (disk_close(dev)); return (0); } static int efipart_ioctl(struct open_file *f, u_long cmd, void *data) { struct disk_devdesc *dev; pdinfo_t *pd; int rc; dev = (struct disk_devdesc *)(f->f_devdata); if (dev == NULL) return (EINVAL); pd = efiblk_get_pdinfo((struct devdesc *)dev); if (pd == NULL) return (EINVAL); if (dev->dd.d_dev->dv_type == DEVT_DISK) { rc = disk_ioctl(dev, cmd, data); if (rc != ENOTTY) return (rc); } switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = pd->pd_blkio->Media->BlockSize; break; case DIOCGMEDIASIZE: *(uint64_t *)data = pd->pd_blkio->Media->BlockSize * 
(pd->pd_blkio->Media->LastBlock + 1); break; default: return (ENOTTY); } return (0); } /* * efipart_readwrite() * Internal equivalent of efipart_strategy(), which operates on the * media-native block size. This function expects all I/O requests * to be within the media size and returns an error if such is not * the case. */ static int efipart_readwrite(EFI_BLOCK_IO *blkio, int rw, daddr_t blk, daddr_t nblks, char *buf) { EFI_STATUS status; if (blkio == NULL) return (ENXIO); if (blk < 0 || blk > blkio->Media->LastBlock) return (EIO); if ((blk + nblks - 1) > blkio->Media->LastBlock) return (EIO); switch (rw & F_MASK) { case F_READ: status = blkio->ReadBlocks(blkio, blkio->Media->MediaId, blk, nblks * blkio->Media->BlockSize, buf); break; case F_WRITE: if (blkio->Media->ReadOnly) return (EROFS); status = blkio->WriteBlocks(blkio, blkio->Media->MediaId, blk, nblks * blkio->Media->BlockSize, buf); break; default: return (ENOSYS); } if (EFI_ERROR(status)) { printf("%s: rw=%d, blk=%ju size=%ju status=%lu\n", __func__, rw, blk, nblks, EFI_ERROR_CODE(status)); } return (efi_status_to_errno(status)); } static int efipart_strategy(void *devdata, int rw, daddr_t blk, size_t size, char *buf, size_t *rsize) { struct bcache_devdata bcd; struct disk_devdesc *dev; pdinfo_t *pd; dev = (struct disk_devdesc *)devdata; if (dev == NULL) return (EINVAL); pd = efiblk_get_pdinfo((struct devdesc *)dev); if (pd == NULL) return (EINVAL); if (pd->pd_blkio->Media->RemovableMedia && !pd->pd_blkio->Media->MediaPresent) return (ENXIO); bcd.dv_strategy = efipart_realstrategy; bcd.dv_devdata = devdata; bcd.dv_cache = pd->pd_bcache; if (dev->dd.d_dev->dv_type == DEVT_DISK) { daddr_t offset; offset = dev->d_offset * pd->pd_blkio->Media->BlockSize; offset /= 512; return (bcache_strategy(&bcd, rw, blk + offset, size, buf, rsize)); } return (bcache_strategy(&bcd, rw, blk, size, buf, rsize)); } static int efipart_realstrategy(void *devdata, int rw, daddr_t blk, size_t size, char *buf, size_t *rsize) { struct disk_devdesc *dev = (struct disk_devdesc *)devdata; pdinfo_t *pd; EFI_BLOCK_IO *blkio; uint64_t off, disk_blocks, d_offset = 0; char *blkbuf; - size_t blkoff, blksz; - int error; + size_t blkoff, blksz, bio_size; + unsigned ioalign; + bool need_buf; + int rc; uint64_t diskend, readstart; if (dev == NULL || blk < 0) return (EINVAL); pd = efiblk_get_pdinfo((struct devdesc *)dev); if (pd == NULL) return (EINVAL); blkio = pd->pd_blkio; if (blkio == NULL) return (ENXIO); if (size == 0 || (size % 512) != 0) return (EIO); off = blk * 512; /* * Get disk blocks, this value is either for whole disk or for * partition. */ disk_blocks = 0; if (dev->dd.d_dev->dv_type == DEVT_DISK) { if (disk_ioctl(dev, DIOCGMEDIASIZE, &disk_blocks) == 0) { /* DIOCGMEDIASIZE does return bytes. */ disk_blocks /= blkio->Media->BlockSize; } d_offset = dev->d_offset; } if (disk_blocks == 0) disk_blocks = blkio->Media->LastBlock + 1 - d_offset; /* make sure we don't read past disk end */ if ((off + size) / blkio->Media->BlockSize > d_offset + disk_blocks) { diskend = d_offset + disk_blocks; readstart = off / blkio->Media->BlockSize; if (diskend <= readstart) { if (rsize != NULL) *rsize = 0; return (EIO); } size = diskend - readstart; size = size * blkio->Media->BlockSize; } - if (rsize != NULL) - *rsize = size; - + need_buf = true; + /* Do we need bounce buffer? 
*/ if ((size % blkio->Media->BlockSize == 0) && (off % blkio->Media->BlockSize == 0)) - return (efipart_readwrite(blkio, rw, - off / blkio->Media->BlockSize, - size / blkio->Media->BlockSize, buf)); + need_buf = false; - /* - * The buffer size is not a multiple of the media block size. - */ - blkbuf = malloc(blkio->Media->BlockSize); + /* Do we have IO alignment requirement? */ + ioalign = blkio->Media->IoAlign; + if (ioalign == 0) + ioalign++; + + if (ioalign > 1 && (uintptr_t)buf != roundup2((uintptr_t)buf, ioalign)) + need_buf = true; + + if (need_buf) { + for (bio_size = BIO_BUFFER_SIZE; bio_size > 0; + bio_size -= blkio->Media->BlockSize) { + blkbuf = memalign(ioalign, bio_size); + if (blkbuf != NULL) + break; + } + } else { + blkbuf = buf; + bio_size = size; + } + if (blkbuf == NULL) return (ENOMEM); - error = 0; + if (rsize != NULL) + *rsize = size; + + rc = 0; blk = off / blkio->Media->BlockSize; blkoff = off % blkio->Media->BlockSize; - blksz = blkio->Media->BlockSize - blkoff; + while (size > 0) { - error = efipart_readwrite(blkio, rw, blk, 1, blkbuf); - if (error) + size_t x = min(size, bio_size); + + if (x < blkio->Media->BlockSize) + x = 1; + else + x /= blkio->Media->BlockSize; + + switch (rw & F_MASK) { + case F_READ: + blksz = blkio->Media->BlockSize * x - blkoff; + if (size < blksz) + blksz = size; + + rc = efipart_readwrite(blkio, rw, blk, x, blkbuf); + if (rc != 0) + goto error; + + if (need_buf) + bcopy(blkbuf + blkoff, buf, blksz); break; - if (size < blksz) - blksz = size; - bcopy(blkbuf + blkoff, buf, blksz); + case F_WRITE: + rc = 0; + if (blkoff != 0) { + /* + * We got offset to sector, read 1 sector to + * blkbuf. + */ + x = 1; + blksz = blkio->Media->BlockSize - blkoff; + blksz = min(blksz, size); + rc = efipart_readwrite(blkio, F_READ, blk, x, + blkbuf); + } else if (size < blkio->Media->BlockSize) { + /* + * The remaining block is not full + * sector. Read 1 sector to blkbuf. + */ + x = 1; + blksz = size; + rc = efipart_readwrite(blkio, F_READ, blk, x, + blkbuf); + } else { + /* We can write full sector(s). */ + blksz = blkio->Media->BlockSize * x; + } + + if (rc != 0) + goto error; + /* + * Put your Data In, Put your Data out, + * Put your Data In, and shake it all about + */ + if (need_buf) + bcopy(buf, blkbuf + blkoff, blksz); + rc = efipart_readwrite(blkio, F_WRITE, blk, x, blkbuf); + if (rc != 0) + goto error; + break; + default: + /* DO NOTHING */ + rc = EROFS; + goto error; + } + + blkoff = 0; buf += blksz; size -= blksz; - blk++; - blkoff = 0; - blksz = blkio->Media->BlockSize; + blk += x; } - free(blkbuf); - return (error); +error: + if (rsize != NULL) + *rsize -= size; + + if (need_buf) + free(blkbuf); + return (rc); } Index: projects/clang900-import/stand/forth/loader.4th =================================================================== --- projects/clang900-import/stand/forth/loader.4th (revision 352536) +++ projects/clang900-import/stand/forth/loader.4th (revision 352537) @@ -1,263 +1,286 @@ \ Copyright (c) 1999 Daniel C. Sobral \ Copyright (c) 2011-2015 Devin Teske \ All rights reserved. \ \ Redistribution and use in source and binary forms, with or without \ modification, are permitted provided that the following conditions \ are met: \ 1. Redistributions of source code must retain the above copyright \ notice, this list of conditions and the following disclaimer. \ 2. 
Redistributions in binary form must reproduce the above copyright \ notice, this list of conditions and the following disclaimer in the \ documentation and/or other materials provided with the distribution. \ \ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND \ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE \ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE \ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE \ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL \ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS \ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) \ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT \ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY \ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF \ SUCH DAMAGE. \ \ $FreeBSD$ only forth definitions +\ provide u> if needed +s" u>" sfind [if] drop [else] + drop +: u> + 2dup u< if 2drop 0 exit then + swap u< if -1 exit then + 0 +; +[then] + +\ provide xemit if needed +s" xemit" sfind [if] drop [else] + drop +: xemit + dup 0x80 u< if emit exit then + 0 swap 0x3F + begin 2dup u> while + 2/ >r dup 0x3F and 0x80 or swap 6 rshift r> + repeat 0x7F xor 2* or + begin dup 0x80 u< 0= while emit repeat drop +; +[then] + s" arch-i386" environment? [if] [if] s" loader_version" environment? [if] 11 < [if] .( Loader version 1.1+ required) cr abort [then] [else] .( Could not get loader version!) cr abort [then] [then] [then] 256 dictthreshold ! \ 256 cells minimum free space 2048 dictincrease ! \ 2048 additional cells each time include /boot/support.4th include /boot/color.4th include /boot/delay.4th include /boot/check-password.4th only forth definitions : bootmsg ( -- ) loader_color? dup ( -- bool bool ) if 7 fg 4 bg then ." Booting..." if me then cr ; : try-menu-unset \ menu-unset may not be present s" beastie_disable" getenv dup -1 <> if s" YES" compare-insensitive 0= if exit then else drop then s" menu-unset" sfind if execute else drop then s" menusets-unset" sfind if execute else drop then ; only forth also support-functions also builtins definitions : boot 0= if ( interpreted ) get_arguments then \ Unload only if a path was passed dup if >r over r> swap c@ [char] - <> if 0 1 unload drop else s" kernelname" getenv? if ( a kernel has been loaded ) try-menu-unset bootmsg 1 boot exit then load_kernel_and_modules ?dup if exit then try-menu-unset bootmsg 0 1 boot exit then else s" kernelname" getenv? if ( a kernel has been loaded ) try-menu-unset bootmsg 1 boot exit then load_kernel_and_modules ?dup if exit then try-menu-unset bootmsg 0 1 boot exit then load_kernel_and_modules ?dup 0= if bootmsg 0 1 boot then ; \ ***** boot-conf \ \ Prepares to boot as specified by loaded configuration files. : boot-conf 0= if ( interpreted ) get_arguments then 0 1 unload drop load_kernel_and_modules ?dup 0= if 0 1 autoboot then ; also forth definitions previous builtin: boot builtin: boot-conf only forth definitions also support-functions \ ***** start \ \ Initializes support.4th global variables, sets loader_conf_files, \ processes conf files, and, if any one such file was successfully \ read to the end, loads kernel and modules. 
: start ( -- ) ( throws: abort & user-defined ) s" /boot/defaults/loader.conf" initialize include_conf_files include_nextboot_file \ If the user defined a post-initialize hook, call it now s" post-initialize" sfind if execute else drop then \ Will *NOT* try to load kernel and modules if no configuration file \ was successfully loaded! any_conf_read? if s" loader_delay" getenv -1 = if load_xen_throw load_kernel load_modules else drop ." Loading Kernel and Modules (Ctrl-C to Abort)" cr s" also support-functions" evaluate s" set delay_command='load_xen_throw load_kernel load_modules'" evaluate s" set delay_showdots" evaluate delay_execute then then ; \ ***** initialize \ \ Overrides support.4th initialization word with one that does \ everything start one does, short of loading the kernel and \ modules. Returns a flag. : initialize ( -- flag ) s" /boot/defaults/loader.conf" initialize include_conf_files include_nextboot_file \ If the user defined a post-initialize hook, call it now s" post-initialize" sfind if execute else drop then any_conf_read? ; \ ***** read-conf \ \ Read a configuration file, whose name was specified on the command \ line, if interpreted, or given on the stack, if compiled in. : (read-conf) ( addr len -- ) conf_files string= include_conf_files \ Will recurse on new loader_conf_files definitions ; : read-conf ( | addr len -- ) ( throws: abort & user-defined ) state @ if \ Compiling postpone (read-conf) else \ Interpreting bl parse (read-conf) then ; immediate \ show, enable, disable, toggle module loading. They all take module from \ the next word : set-module-flag ( module_addr val -- ) \ set and print flag over module.flag ! dup module.name strtype module.flag @ if ." will be loaded" else ." will not be loaded" then cr ; : enable-module find-module ?dup if true set-module-flag then ; : disable-module find-module ?dup if false set-module-flag then ; : toggle-module find-module ?dup if dup module.flag @ 0= set-module-flag then ; \ ***** show-module \ \ Show loading information about a module. : show-module ( -- ) find-module ?dup if show-one-module then ; \ Words to be used inside configuration files : retry false ; \ For use in load error commands : ignore true ; \ For use in load error commands \ Return to strict forth vocabulary : #type over - >r type r> spaces ; : .? 2 spaces 2swap 15 #type 2 spaces type cr ; \ Execute the ? command to print all the commands defined in \ C, then list the ones we support here. Please note that this \ doesn't use pager_* routines that the C implementation of ? \ does, so these will always appear, even if you stop early \ there. And they may cause the commands to scroll off the \ screen if the number of commands modulus LINES is close \ to LINEs.... : ? ['] ? execute s" boot-conf" s" load kernel and modules, then autoboot" .? s" read-conf" s" read a configuration file" .? s" enable-module" s" enable loading of a module" .? s" disable-module" s" disable loading of a module" .? s" toggle-module" s" toggle loading of a module" .? s" show-module" s" show module load data" .? s" try-include" s" try to load/interpret files" .? ; : try-include ( -- ) \ see loader.4th(8) ['] include ( -- xt ) \ get the execution token of `include' catch ( xt -- exception# | 0 ) if \ failed LF parse ( c -- s-addr/u ) 2drop \ advance >in to EOL (drop data) \ ... 
prevents words unused by `include' from being interpreted then ; immediate \ interpret immediately for access to `source' (aka tib) only forth definitions Index: projects/clang900-import/stand/libsa/stand.h =================================================================== --- projects/clang900-import/stand/libsa/stand.h (revision 352536) +++ projects/clang900-import/stand/libsa/stand.h (revision 352537) @@ -1,449 +1,453 @@ /* * Copyright (c) 1998 Michael Smith. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * From $NetBSD: stand.h,v 1.22 1997/06/26 19:17:40 drochner Exp $ */ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)stand.h 8.1 (Berkeley) 6/11/93 */ #ifndef STAND_H #define STAND_H #include #include #include #include /* this header intentionally exports NULL from */ #include #define strcoll(a, b) strcmp((a), (b)) #define CHK(fmt, args...) printf("%s(%d): " fmt "\n", __func__, __LINE__ , ##args) #define PCHK(fmt, args...) {printf("%s(%d): " fmt "\n", __func__, __LINE__ , ##args); getchar();} #include /* special stand error codes */ #define EADAPT (ELAST+1) /* bad adaptor */ #define ECTLR (ELAST+2) /* bad controller */ #define EUNIT (ELAST+3) /* bad unit */ #define ESLICE (ELAST+4) /* bad slice */ #define EPART (ELAST+5) /* bad partition */ #define ERDLAB (ELAST+6) /* can't read disk label */ #define EUNLAB (ELAST+7) /* unlabeled disk */ #define EOFFSET (ELAST+8) /* relative seek not supported */ #define ESALAST (ELAST+8) /* */ /* Partial signal emulation for sig_atomic_t */ #include struct open_file; /* * This structure is used to define file system operations in a file system * independent way. * * XXX note that filesystem providers should export a pointer to their fs_ops * struct, so that consumers can reference this and thus include the * filesystems that they require. */ struct fs_ops { const char *fs_name; int (*fo_open)(const char *path, struct open_file *f); int (*fo_close)(struct open_file *f); int (*fo_read)(struct open_file *f, void *buf, size_t size, size_t *resid); int (*fo_write)(struct open_file *f, const void *buf, size_t size, size_t *resid); off_t (*fo_seek)(struct open_file *f, off_t offset, int where); int (*fo_stat)(struct open_file *f, struct stat *sb); int (*fo_readdir)(struct open_file *f, struct dirent *d); }; /* * libstand-supplied filesystems */ extern struct fs_ops ufs_fsops; extern struct fs_ops tftp_fsops; extern struct fs_ops nfs_fsops; extern struct fs_ops cd9660_fsops; extern struct fs_ops gzipfs_fsops; extern struct fs_ops bzipfs_fsops; extern struct fs_ops dosfs_fsops; extern struct fs_ops ext2fs_fsops; extern struct fs_ops splitfs_fsops; extern struct fs_ops pkgfs_fsops; extern struct fs_ops efihttp_fsops; /* where values for lseek(2) */ #define SEEK_SET 0 /* set file offset to offset */ #define SEEK_CUR 1 /* set file offset to current plus offset */ #define SEEK_END 2 /* set file offset to EOF plus offset */ /* * Device switch */ struct devsw { const char dv_name[8]; int dv_type; /* opaque type constant, arch-dependant */ #define DEVT_NONE 0 #define DEVT_DISK 1 #define DEVT_NET 2 #define DEVT_CD 3 #define DEVT_ZFS 4 #define DEVT_FD 5 int (*dv_init)(void); /* early probe call */ int (*dv_strategy)(void *devdata, int rw, daddr_t blk, size_t size, char *buf, size_t *rsize); int (*dv_open)(struct open_file *f, ...); int (*dv_close)(struct open_file *f); int (*dv_ioctl)(struct open_file *f, u_long cmd, void *data); int (*dv_print)(int verbose); /* print device information */ void (*dv_cleanup)(void); }; /* * libstand-supplied device switch */ extern struct devsw netdev; extern int errno; /* * Generic device specifier; architecture-dependent * versions may be larger, but should be allowed to * overlap. 
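The "overlap" remark above is the key to the descriptor layering used throughout the loader: a device-class descriptor embeds struct devdesc as its first member, so a pointer to the larger structure can be passed wherever a struct devdesc * is expected, which is what efipart_open() and friends do with their (struct devdesc *)dev casts. A minimal sketch, with the field types assumed here for illustration only (the real disk descriptor lives in the loader's disk support headers):

struct example_disk_devdesc {
	struct devdesc	dd;		/* generic descriptor; must come first */
	int		d_slice;	/* D_SLICENONE when unused */
	int		d_partition;	/* D_PARTNONE when unused */
	uint64_t	d_offset;	/* partition start, in media blocks */
};

/* Valid only because the generic part is the first member. */
#define	EXAMPLE_DEVDESC(p)	((struct devdesc *)(p))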
*/ struct devdesc { struct devsw *d_dev; int d_unit; void *d_opendata; }; struct open_file { int f_flags; /* see F_* below */ struct devsw *f_dev; /* pointer to device operations */ void *f_devdata; /* device specific data */ struct fs_ops *f_ops; /* pointer to file system operations */ void *f_fsdata; /* file system specific data */ off_t f_offset; /* current file offset */ char *f_rabuf; /* readahead buffer pointer */ size_t f_ralen; /* valid data in readahead buffer */ off_t f_raoffset; /* consumer offset in readahead buffer */ #define SOPEN_RASIZE 512 }; #define SOPEN_MAX 64 extern struct open_file files[]; /* f_flags values */ #define F_READ 0x0001 /* file opened for reading */ #define F_WRITE 0x0002 /* file opened for writing */ #define F_RAW 0x0004 /* raw device open - no file system */ #define F_NODEV 0x0008 /* network open - no device */ #define F_MASK 0xFFFF /* Mode modifier for strategy() */ #define F_NORA (0x01 << 16) /* Disable Read-Ahead */ #define isascii(c) (((c) & ~0x7F) == 0) static __inline int isupper(int c) { return c >= 'A' && c <= 'Z'; } static __inline int islower(int c) { return c >= 'a' && c <= 'z'; } static __inline int isspace(int c) { return c == ' ' || (c >= 0x9 && c <= 0xd); } static __inline int isdigit(int c) { return c >= '0' && c <= '9'; } static __inline int isxdigit(int c) { return isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } static __inline int isalpha(int c) { return isupper(c) || islower(c); } static __inline int isalnum(int c) { return isalpha(c) || isdigit(c); } static __inline int iscntrl(int c) { return (c >= 0 && c < ' ') || c == 127; } static __inline int isgraph(int c) { return c >= '!' && c <= '~'; } static __inline int ispunct(int c) { return (c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~'); } static __inline int toupper(int c) { return islower(c) ? c - 'a' + 'A' : c; } static __inline int tolower(int c) { return isupper(c) ? c - 'A' + 'a' : c; } /* sbrk emulation */ extern void setheap(void *base, void *top); extern char *sbrk(int incr); -extern void *reallocf(void *ptr, size_t size); -extern void mallocstats(void); - extern int printf(const char *fmt, ...) __printflike(1, 2); extern int asprintf(char **buf, const char *cfmt, ...) __printflike(2, 3); extern int sprintf(char *buf, const char *cfmt, ...) __printflike(2, 3); extern int snprintf(char *buf, size_t size, const char *cfmt, ...) 
__printflike(3, 4); extern int vprintf(const char *fmt, __va_list); extern int vsprintf(char *buf, const char *cfmt, __va_list); extern int vsnprintf(char *buf, size_t size, const char *cfmt, __va_list); extern void twiddle(u_int callerdiv); extern void twiddle_divisor(u_int globaldiv); extern void ngets(char *, int); #define gets(x) ngets((x), 0) extern int fgetstr(char *buf, int size, int fd); extern int open(const char *, int); #define O_RDONLY 0x0 #define O_WRONLY 0x1 #define O_RDWR 0x2 #define O_ACCMODE 0x3 /* NOT IMPLEMENTED */ #define O_CREAT 0x0200 /* create if nonexistent */ #define O_TRUNC 0x0400 /* truncate to zero length */ extern int close(int); extern void closeall(void); extern ssize_t read(int, void *, size_t); extern ssize_t write(int, const void *, size_t); extern struct dirent *readdirfd(int); extern void srandom(unsigned int); extern long random(void); /* imports from stdlib, locally modified */ extern char *optarg; /* getopt(3) external variables */ extern int optind, opterr, optopt, optreset; extern int getopt(int, char * const [], const char *); /* pager.c */ extern void pager_open(void); extern void pager_close(void); extern int pager_output(const char *lines); extern int pager_file(const char *fname); /* No signal state to preserve */ #define setjmp _setjmp #define longjmp _longjmp /* environment.c */ #define EV_DYNAMIC (1<<0) /* value was dynamically allocated, free if changed/unset */ #define EV_VOLATILE (1<<1) /* value is volatile, make a copy of it */ #define EV_NOHOOK (1<<2) /* don't call hook when setting */ struct env_var; typedef char *(ev_format_t)(struct env_var *ev); typedef int (ev_sethook_t)(struct env_var *ev, int flags, const void *value); typedef int (ev_unsethook_t)(struct env_var *ev); struct env_var { char *ev_name; int ev_flags; void *ev_value; ev_sethook_t *ev_sethook; ev_unsethook_t *ev_unsethook; struct env_var *ev_next, *ev_prev; }; extern struct env_var *environ; extern struct env_var *env_getenv(const char *name); extern int env_setenv(const char *name, int flags, const void *value, ev_sethook_t sethook, ev_unsethook_t unsethook); extern char *getenv(const char *name); extern int setenv(const char *name, const char *value, int overwrite); extern int putenv(char *string); extern int unsetenv(const char *name); extern ev_sethook_t env_noset; /* refuse set operation */ extern ev_unsethook_t env_nounset; /* refuse unset operation */ /* stdlib.h routines */ extern int abs(int a); extern void abort(void) __dead2; extern long strtol(const char * __restrict, char ** __restrict, int); extern long long strtoll(const char * __restrict, char ** __restrict, int); extern unsigned long strtoul(const char * __restrict, char ** __restrict, int); extern unsigned long long strtoull(const char * __restrict, char ** __restrict, int); /* BCD conversions (undocumented) */ extern u_char const bcd2bin_data[]; extern u_char const bin2bcd_data[]; extern char const hex2ascii_data[]; #define bcd2bin(bcd) (bcd2bin_data[bcd]) #define bin2bcd(bin) (bin2bcd_data[bin]) #define hex2ascii(hex) (hex2ascii_data[hex]) #define validbcd(bcd) (bcd == 0 || (bcd > 0 && bcd <= 0x99 && bcd2bin_data[bcd] != 0)) /* min/max (undocumented) */ static __inline int imax(int a, int b) { return (a > b ? a : b); } static __inline int imin(int a, int b) { return (a < b ? a : b); } static __inline long lmax(long a, long b) { return (a > b ? a : b); } static __inline long lmin(long a, long b) { return (a < b ? a : b); } static __inline u_int max(u_int a, u_int b) { return (a > b ? 
a : b); } static __inline u_int min(u_int a, u_int b) { return (a < b ? a : b); } static __inline quad_t qmax(quad_t a, quad_t b) { return (a > b ? a : b); } static __inline quad_t qmin(quad_t a, quad_t b) { return (a < b ? a : b); } static __inline u_long ulmax(u_long a, u_long b) { return (a > b ? a : b); } static __inline u_long ulmin(u_long a, u_long b) { return (a < b ? a : b); } /* null functions for device/filesystem switches (undocumented) */ extern int nodev(void); extern int noioctl(struct open_file *, u_long, void *); extern void nullsys(void); extern int null_open(const char *path, struct open_file *f); extern int null_close(struct open_file *f); extern int null_read(struct open_file *f, void *buf, size_t size, size_t *resid); extern int null_write(struct open_file *f, const void *buf, size_t size, size_t *resid); extern off_t null_seek(struct open_file *f, off_t offset, int where); extern int null_stat(struct open_file *f, struct stat *sb); extern int null_readdir(struct open_file *f, struct dirent *d); /* * Machine dependent functions and data, must be provided or stubbed by * the consumer */ extern void exit(int) __dead2; extern int getchar(void); extern int ischar(void); extern void putchar(int); extern int devopen(struct open_file *, const char *, const char **); extern int devclose(struct open_file *f); extern void panic(const char *, ...) __dead2 __printflike(1, 2); extern void panic_action(void) __weak_symbol __dead2; extern time_t getsecs(void); extern struct fs_ops *file_system[]; extern struct fs_ops *exclusive_file_system; extern struct devsw *devsw[]; /* * Expose byteorder(3) functions. */ #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED extern uint32_t htonl(uint32_t); extern uint16_t htons(uint16_t); extern uint32_t ntohl(uint32_t); extern uint16_t ntohs(uint16_t); #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif void *Malloc(size_t, const char *, int); +void *Memalign(size_t, size_t, const char *, int); void *Calloc(size_t, size_t, const char *, int); void *Realloc(void *, size_t, const char *, int); +void *Reallocf(void *, size_t, const char *, int); void Free(void *, const char *, int); +extern void mallocstats(void); #ifdef DEBUG_MALLOC #define malloc(x) Malloc(x, __FILE__, __LINE__) +#define memalign(x, y) Memalign(x, y, __FILE__, __LINE__) #define calloc(x, y) Calloc(x, y, __FILE__, __LINE__) #define free(x) Free(x, __FILE__, __LINE__) #define realloc(x, y) Realloc(x, y, __FILE__, __LINE__) +#define reallocf(x, y) Reallocf(x, y, __FILE__, __LINE__) #else #define malloc(x) Malloc(x, NULL, 0) +#define memalign(x, y) Memalign(x, y, NULL, 0) #define calloc(x, y) Calloc(x, y, NULL, 0) #define free(x) Free(x, NULL, 0) #define realloc(x, y) Realloc(x, y, NULL, 0) +#define reallocf(x, y) Reallocf(x, y, NULL, 0) #endif #endif /* STAND_H */ Index: projects/clang900-import/stand/libsa/zalloc.c =================================================================== --- projects/clang900-import/stand/libsa/zalloc.c (revision 352536) +++ projects/clang900-import/stand/libsa/zalloc.c (revision 352537) @@ -1,316 +1,338 @@ /* - * This module derived from code donated to the FreeBSD Project by + * This module derived from code donated to the FreeBSD Project by * Matthew Dillon * * Copyright (c) 1998 The FreeBSD Project * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include + /* - * LIB/MEMORY/ZALLOC.C - self contained low-overhead memory pool/allocation + * LIB/MEMORY/ZALLOC.C - self contained low-overhead memory pool/allocation * subsystem * - * This subsystem implements memory pools and memory allocation + * This subsystem implements memory pools and memory allocation * routines. * * Pools are managed via a linked list of 'free' areas. Allocating * memory creates holes in the freelist, freeing memory fills them. * Since the freelist consists only of free memory areas, it is possible * to allocate the entire pool without incuring any structural overhead. * * The system works best when allocating similarly-sized chunks of - * memory. Care must be taken to avoid fragmentation when + * memory. Care must be taken to avoid fragmentation when * allocating/deallocating dissimilar chunks. * * When a memory pool is first allocated, the entire pool is marked as * allocated. This is done mainly because we do not want to modify any * portion of a pool's data area until we are given permission. The * caller must explicitly deallocate portions of the pool to make them * available. * * z[n]xalloc() works like z[n]alloc() but the allocation is made from - * within the specified address range. If the segment could not be + * within the specified address range. If the segment could not be * allocated, NULL is returned. WARNING! The address range will be * aligned to an 8 or 16 byte boundry depending on the cpu so if you * give an unaligned address range, unexpected results may occur. * * If a standard allocation fails, the reclaim function will be called * to recover some space. This usually causes other portions of the * same pool to be released. Memory allocations at this low level * should not block but you can do that too in your reclaim function * if you want. Reclaim does not function when z[n]xalloc() is used, * only for z[n]alloc(). * * Allocation and frees of 0 bytes are valid operations. */ #include "zalloc_defs.h" /* * Objects in the pool must be aligned to at least the size of struct MemNode. * They must also be aligned to MALLOCALIGN, which should normally be larger * than the struct, so assert that to be so at compile time. 
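The align argument threaded through this file exists mainly so that memalign() in libsa can hand out buffers satisfying a device's I/O alignment requirement; that is how the reworked efipart_realstrategy() earlier in this change obtains its bounce buffer. A minimal usage sketch, assuming a hypothetical 4 KiB requirement (the function name and values below are illustrative, not part of this change):

#include "stand.h"

void *
example_bounce_buffer(size_t io_size)
{
	size_t ioalign = 4096;	/* e.g. a device's Media->IoAlign */

	/* memalign(alignment, size); an alignment of 0 or 1 means "don't care". */
	return (memalign(ioalign, io_size));
}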
*/ typedef char assert_align[(sizeof(struct MemNode) <= MALLOCALIGN) ? 1 : -1]; #define MEMNODE_SIZE_MASK MALLOCALIGN_MASK /* * znalloc() - allocate memory (without zeroing) from pool. Call reclaim * and retry if appropriate, return NULL if unable to allocate * memory. */ void * -znalloc(MemPool *mp, uintptr_t bytes) +znalloc(MemPool *mp, uintptr_t bytes, size_t align) { - /* - * align according to pool object size (can be 0). This is - * inclusive of the MEMNODE_SIZE_MASK minimum alignment. - * - */ - bytes = (bytes + MEMNODE_SIZE_MASK) & ~MEMNODE_SIZE_MASK; - - if (bytes == 0) - return((void *)-1); - - /* - * locate freelist entry big enough to hold the object. If all objects - * are the same size, this is a constant-time function. - */ - - if (bytes <= mp->mp_Size - mp->mp_Used) { MemNode **pmn; MemNode *mn; - for (pmn = &mp->mp_First; (mn=*pmn) != NULL; pmn = &mn->mr_Next) { - if (bytes > mn->mr_Bytes) - continue; + /* + * align according to pool object size (can be 0). This is + * inclusive of the MEMNODE_SIZE_MASK minimum alignment. + * + */ + bytes = (bytes + MEMNODE_SIZE_MASK) & ~MEMNODE_SIZE_MASK; - /* - * Cut a chunk of memory out of the beginning of this - * block and fixup the link appropriately. - */ + if (bytes == 0) + return ((void *)-1); - { + /* + * locate freelist entry big enough to hold the object. If all objects + * are the same size, this is a constant-time function. + */ + + if (bytes > mp->mp_Size - mp->mp_Used) + return (NULL); + + for (pmn = &mp->mp_First; (mn = *pmn) != NULL; pmn = &mn->mr_Next) { char *ptr = (char *)mn; + uintptr_t dptr; + char *aligned; + size_t extra; + dptr = (uintptr_t)(ptr + MALLOCALIGN); /* pointer to data */ + aligned = (char *)(roundup2(dptr, align) - MALLOCALIGN); + extra = aligned - ptr; + + if (bytes + extra > mn->mr_Bytes) + continue; + + /* + * Cut extra from head and create new memory node from reminder. + */ + + if (extra != 0) { + MemNode *new; + + new = (MemNode *)aligned; + new->mr_Next = mn->mr_Next; + new->mr_Bytes = mn->mr_Bytes - extra; + + /* And update current memory node */ + mn->mr_Bytes = extra; + mn->mr_Next = new; + /* In next iteration, we will get our aligned address */ + continue; + } + + /* + * Cut a chunk of memory out of the beginning of this + * block and fixup the link appropriately. + */ + if (mn->mr_Bytes == bytes) { - *pmn = mn->mr_Next; + *pmn = mn->mr_Next; } else { - mn = (MemNode *)((char *)mn + bytes); - mn->mr_Next = ((MemNode *)ptr)->mr_Next; - mn->mr_Bytes = ((MemNode *)ptr)->mr_Bytes - bytes; - *pmn = mn; + mn = (MemNode *)((char *)mn + bytes); + mn->mr_Next = ((MemNode *)ptr)->mr_Next; + mn->mr_Bytes = ((MemNode *)ptr)->mr_Bytes - bytes; + *pmn = mn; } mp->mp_Used += bytes; return(ptr); - } } - } - /* - * Memory pool is full, return NULL. - */ + /* + * Memory pool is full, return NULL. + */ - return(NULL); + return (NULL); } /* * zfree() - free previously allocated memory */ void zfree(MemPool *mp, void *ptr, uintptr_t bytes) { - /* - * align according to pool object size (can be 0). This is - * inclusive of the MEMNODE_SIZE_MASK minimum alignment. - */ - bytes = (bytes + MEMNODE_SIZE_MASK) & ~MEMNODE_SIZE_MASK; + MemNode **pmn; + MemNode *mn; - if (bytes == 0) - return; + /* + * align according to pool object size (can be 0). This is + * inclusive of the MEMNODE_SIZE_MASK minimum alignment. 
+ */ + bytes = (bytes + MEMNODE_SIZE_MASK) & ~MEMNODE_SIZE_MASK; - /* - * panic if illegal pointer - */ + if (bytes == 0) + return; - if ((char *)ptr < (char *)mp->mp_Base || - (char *)ptr + bytes > (char *)mp->mp_End || - ((uintptr_t)ptr & MEMNODE_SIZE_MASK) != 0) - panic("zfree(%p,%ju): wild pointer", ptr, (uintmax_t)bytes); + /* + * panic if illegal pointer + */ - /* - * free the segment - */ + if ((char *)ptr < (char *)mp->mp_Base || + (char *)ptr + bytes > (char *)mp->mp_End || + ((uintptr_t)ptr & MEMNODE_SIZE_MASK) != 0) + panic("zfree(%p,%ju): wild pointer", ptr, (uintmax_t)bytes); - { - MemNode **pmn; - MemNode *mn; - + /* + * free the segment + */ mp->mp_Used -= bytes; for (pmn = &mp->mp_First; (mn = *pmn) != NULL; pmn = &mn->mr_Next) { - /* - * If area between last node and current node - * - check range - * - check merge with next area - * - check merge with previous area - */ - if ((char *)ptr <= (char *)mn) { /* - * range check + * If area between last node and current node + * - check range + * - check merge with next area + * - check merge with previous area */ - if ((char *)ptr + bytes > (char *)mn) { - panic("zfree(%p,%ju): corrupt memlist1", ptr, - (uintmax_t)bytes); - } + if ((char *)ptr <= (char *)mn) { + /* + * range check + */ + if ((char *)ptr + bytes > (char *)mn) { + panic("zfree(%p,%ju): corrupt memlist1", ptr, + (uintmax_t)bytes); + } - /* - * merge against next area or create independant area - */ + /* + * merge against next area or create independant area + */ - if ((char *)ptr + bytes == (char *)mn) { - ((MemNode *)ptr)->mr_Next = mn->mr_Next; - ((MemNode *)ptr)->mr_Bytes= bytes + mn->mr_Bytes; - } else { - ((MemNode *)ptr)->mr_Next = mn; - ((MemNode *)ptr)->mr_Bytes= bytes; - } - *pmn = mn = (MemNode *)ptr; + if ((char *)ptr + bytes == (char *)mn) { + ((MemNode *)ptr)->mr_Next = mn->mr_Next; + ((MemNode *)ptr)->mr_Bytes = + bytes + mn->mr_Bytes; + } else { + ((MemNode *)ptr)->mr_Next = mn; + ((MemNode *)ptr)->mr_Bytes = bytes; + } + *pmn = mn = (MemNode *)ptr; - /* - * merge against previous area (if there is a previous - * area). - */ + /* + * merge against previous area (if there is a previous + * area). + */ - if (pmn != &mp->mp_First) { - if ((char*)pmn + ((MemNode*)pmn)->mr_Bytes == (char*)ptr) { - ((MemNode *)pmn)->mr_Next = mn->mr_Next; - ((MemNode *)pmn)->mr_Bytes += mn->mr_Bytes; - mn = (MemNode *)pmn; - } + if (pmn != &mp->mp_First) { + if ((char *)pmn + ((MemNode*)pmn)->mr_Bytes == + (char *)ptr) { + ((MemNode *)pmn)->mr_Next = mn->mr_Next; + ((MemNode *)pmn)->mr_Bytes += + mn->mr_Bytes; + mn = (MemNode *)pmn; + } + } + return; } - return; - /* NOT REACHED */ - } - if ((char *)ptr < (char *)mn + mn->mr_Bytes) { - panic("zfree(%p,%ju): corrupt memlist2", ptr, - (uintmax_t)bytes); - } + if ((char *)ptr < (char *)mn + mn->mr_Bytes) { + panic("zfree(%p,%ju): corrupt memlist2", ptr, + (uintmax_t)bytes); + } } /* * We are beyond the last MemNode, append new MemNode. Merge against * previous area if possible. 
*/ - if (pmn == &mp->mp_First || - (char *)pmn + ((MemNode *)pmn)->mr_Bytes != (char *)ptr - ) { - ((MemNode *)ptr)->mr_Next = NULL; - ((MemNode *)ptr)->mr_Bytes = bytes; - *pmn = (MemNode *)ptr; - mn = (MemNode *)ptr; + if (pmn == &mp->mp_First || + (char *)pmn + ((MemNode *)pmn)->mr_Bytes != (char *)ptr) { + ((MemNode *)ptr)->mr_Next = NULL; + ((MemNode *)ptr)->mr_Bytes = bytes; + *pmn = (MemNode *)ptr; + mn = (MemNode *)ptr; } else { - ((MemNode *)pmn)->mr_Bytes += bytes; - mn = (MemNode *)pmn; + ((MemNode *)pmn)->mr_Bytes += bytes; + mn = (MemNode *)pmn; } - } } /* * zextendPool() - extend memory pool to cover additional space. * * Note: the added memory starts out as allocated, you * must free it to make it available to the memory subsystem. * * Note: mp_Size may not reflect (mp_End - mp_Base) range * due to other parts of the system doing their own sbrk() * calls. */ void zextendPool(MemPool *mp, void *base, uintptr_t bytes) { - if (mp->mp_Size == 0) { - mp->mp_Base = base; - mp->mp_Used = bytes; - mp->mp_End = (char *)base + bytes; - mp->mp_Size = bytes; - } else { - void *pend = (char *)mp->mp_Base + mp->mp_Size; + if (mp->mp_Size == 0) { + mp->mp_Base = base; + mp->mp_Used = bytes; + mp->mp_End = (char *)base + bytes; + mp->mp_Size = bytes; + } else { + void *pend = (char *)mp->mp_Base + mp->mp_Size; - if (base < mp->mp_Base) { - mp->mp_Size += (char *)mp->mp_Base - (char *)base; - mp->mp_Used += (char *)mp->mp_Base - (char *)base; - mp->mp_Base = base; + if (base < mp->mp_Base) { + mp->mp_Size += (char *)mp->mp_Base - (char *)base; + mp->mp_Used += (char *)mp->mp_Base - (char *)base; + mp->mp_Base = base; + } + base = (char *)base + bytes; + if (base > pend) { + mp->mp_Size += (char *)base - (char *)pend; + mp->mp_Used += (char *)base - (char *)pend; + mp->mp_End = (char *)base; + } } - base = (char *)base + bytes; - if (base > pend) { - mp->mp_Size += (char *)base - (char *)pend; - mp->mp_Used += (char *)base - (char *)pend; - mp->mp_End = (char *)base; - } - } } #ifdef ZALLOCDEBUG void zallocstats(MemPool *mp) { - int abytes = 0; - int hbytes = 0; - int fcount = 0; - MemNode *mn; + int abytes = 0; + int hbytes = 0; + int fcount = 0; + MemNode *mn; - printf("%d bytes reserved", (int) mp->mp_Size); + printf("%d bytes reserved", (int)mp->mp_Size); - mn = mp->mp_First; + mn = mp->mp_First; - if ((void *)mn != (void *)mp->mp_Base) { - abytes += (char *)mn - (char *)mp->mp_Base; - } + if ((void *)mn != (void *)mp->mp_Base) { + abytes += (char *)mn - (char *)mp->mp_Base; + } - while (mn) { - if ((char *)mn + mn->mr_Bytes != mp->mp_End) { - hbytes += mn->mr_Bytes; - ++fcount; + while (mn != NULL) { + if ((char *)mn + mn->mr_Bytes != mp->mp_End) { + hbytes += mn->mr_Bytes; + ++fcount; + } + if (mn->mr_Next != NULL) { + abytes += (char *)mn->mr_Next - + ((char *)mn + mn->mr_Bytes); + } + mn = mn->mr_Next; } - if (mn->mr_Next) - abytes += (char *)mn->mr_Next - ((char *)mn + mn->mr_Bytes); - mn = mn->mr_Next; - } - printf(" %d bytes allocated\n%d fragments (%d bytes fragmented)\n", - abytes, - fcount, - hbytes - ); + printf(" %d bytes allocated\n%d fragments (%d bytes fragmented)\n", + abytes, fcount, hbytes); } #endif - Index: projects/clang900-import/stand/libsa/zalloc_defs.h =================================================================== --- projects/clang900-import/stand/libsa/zalloc_defs.h (revision 352536) +++ projects/clang900-import/stand/libsa/zalloc_defs.h (revision 352537) @@ -1,78 +1,83 @@ /* - * This module derived from code donated to the FreeBSD Project by + * This 
module derived from code donated to the FreeBSD Project by * Matthew Dillon * * Copyright (c) 1998 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * DEFS.H */ -#define USEGUARD /* use stard/end guard bytes */ -#define USEENDGUARD -#define DMALLOCDEBUG /* add debugging code to gather stats */ -#define ZALLOCDEBUG +#ifndef _ZALLOC_DEFS_H +#define _ZALLOC_DEFS_H +#define USEGUARD /* use stard/end guard bytes */ +#define USEENDGUARD +#define DMALLOCDEBUG /* add debugging code to gather stats */ +#define ZALLOCDEBUG + #include #include "stand.h" #include "zalloc_mem.h" -#define Library extern +#define Library extern /* * block extension for sbrk() */ -#define BLKEXTEND (4 * 1024) -#define BLKEXTENDMASK (BLKEXTEND - 1) +#define BLKEXTEND (4 * 1024) +#define BLKEXTENDMASK (BLKEXTEND - 1) /* * Required malloc alignment. * * Embedded platforms using the u-boot API drivers require that all I/O buffers * be on a cache line sized boundary. The worst case size for that is 64 bytes. * For other platforms, 16 bytes works fine. The alignment also must be at * least sizeof(struct MemNode); this is asserted in zalloc.c. */ #if defined(__arm__) || defined(__mips__) || defined(__powerpc__) #define MALLOCALIGN 64 #else #define MALLOCALIGN 16 #endif #define MALLOCALIGN_MASK (MALLOCALIGN - 1) typedef struct Guard { - size_t ga_Bytes; - size_t ga_Magic; /* must be at least 32 bits */ + size_t ga_Bytes; + size_t ga_Magic; /* must be at least 32 bits */ } Guard; -#define GAMAGIC 0x55FF44FD -#define GAFREE 0x5F54F4DF +#define GAMAGIC 0x55FF44FD +#define GAFREE 0x5F54F4DF #include "zalloc_protos.h" + +#endif /* _ZALLOC_DEFS_H */ Index: projects/clang900-import/stand/libsa/zalloc_malloc.c =================================================================== --- projects/clang900-import/stand/libsa/zalloc_malloc.c (revision 352536) +++ projects/clang900-import/stand/libsa/zalloc_malloc.c (revision 352537) @@ -1,203 +1,223 @@ /* - * This module derived from code donated to the FreeBSD Project by + * This module derived from code donated to the FreeBSD Project by * Matthew Dillon * * Copyright (c) 1998 The FreeBSD Project * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * MALLOC.C - malloc equivalent, runs on top of zalloc and uses sbrk */ #include "zalloc_defs.h" static MemPool MallocPool; #ifdef DMALLOCDEBUG static int MallocMax; static int MallocCount; void mallocstats(void); #endif #ifdef malloc #undef malloc #undef free #endif +static void *Malloc_align(size_t, size_t); + void * -Malloc(size_t bytes, const char *file, int line) +Malloc(size_t bytes, const char *file __unused, int line __unused) { - Guard *res; + return (Malloc_align(bytes, 1)); +} - if (bytes == 0) - return (NULL); +void * +Memalign(size_t alignment, size_t bytes, const char *file __unused, + int line __unused) +{ + if (alignment == 0) + alignment = 1; + return (Malloc_align(bytes, alignment)); +} + +static void * +Malloc_align(size_t bytes, size_t alignment) +{ + Guard *res; + #ifdef USEENDGUARD - bytes += MALLOCALIGN + 1; + bytes += MALLOCALIGN + 1; #else - bytes += MALLOCALIGN; + bytes += MALLOCALIGN; #endif - while ((res = znalloc(&MallocPool, bytes)) == NULL) { - int incr = (bytes + BLKEXTENDMASK) & ~BLKEXTENDMASK; - char *base; + while ((res = znalloc(&MallocPool, bytes, alignment)) == NULL) { + int incr = (bytes + BLKEXTENDMASK) & ~BLKEXTENDMASK; + char *base; - if ((base = sbrk(incr)) == (char *)-1) - return(NULL); - zextendPool(&MallocPool, base, incr); - zfree(&MallocPool, base, incr); - } + if ((base = sbrk(incr)) == (char *)-1) + return (NULL); + zextendPool(&MallocPool, base, incr); + zfree(&MallocPool, base, incr); + } #ifdef DMALLOCDEBUG - if (++MallocCount > MallocMax) - MallocMax = MallocCount; + if (++MallocCount > MallocMax) + MallocMax = MallocCount; #endif #ifdef USEGUARD - res->ga_Magic = GAMAGIC; + res->ga_Magic = GAMAGIC; #endif - res->ga_Bytes = bytes; + res->ga_Bytes = bytes; #ifdef USEENDGUARD - *((signed char *)res + bytes - 1) = -2; + *((signed char *)res + bytes - 1) = -2; #endif - return((char *)res + MALLOCALIGN); + return ((char *)res + MALLOCALIGN); } void Free(void *ptr, const char *file, int line) { - size_t bytes; + size_t bytes; - if (ptr != NULL) { - Guard *res = (void *)((char *)ptr - MALLOCALIGN); + if (ptr != NULL) { + Guard *res = (void *)((char *)ptr - MALLOCALIGN); - if (file == NULL) - file = "unknown"; + if (file == NULL) + file = "unknown"; #ifdef USEGUARD - if 
(res->ga_Magic == GAFREE) { - printf("free: duplicate free @ %p from %s:%d\n", ptr, file, line); - return; - } - if (res->ga_Magic != GAMAGIC) - panic("free: guard1 fail @ %p from %s:%d", ptr, file, line); - res->ga_Magic = GAFREE; + if (res->ga_Magic == GAFREE) { + printf("free: duplicate free @ %p from %s:%d\n", + ptr, file, line); + return; + } + if (res->ga_Magic != GAMAGIC) + panic("free: guard1 fail @ %p from %s:%d", + ptr, file, line); + res->ga_Magic = GAFREE; #endif #ifdef USEENDGUARD - if (*((signed char *)res + res->ga_Bytes - 1) == -1) { - printf("free: duplicate2 free @ %p from %s:%d\n", ptr, file, line); - return; - } - if (*((signed char *)res + res->ga_Bytes - 1) != -2) - panic("free: guard2 fail @ %p + %zu from %s:%d", ptr, res->ga_Bytes - MALLOCALIGN, file, line); - *((signed char *)res + res->ga_Bytes - 1) = -1; + if (*((signed char *)res + res->ga_Bytes - 1) == -1) { + printf("free: duplicate2 free @ %p from %s:%d\n", + ptr, file, line); + return; + } + if (*((signed char *)res + res->ga_Bytes - 1) != -2) + panic("free: guard2 fail @ %p + %zu from %s:%d", + ptr, res->ga_Bytes - MALLOCALIGN, file, line); + *((signed char *)res + res->ga_Bytes - 1) = -1; #endif - bytes = res->ga_Bytes; - zfree(&MallocPool, res, bytes); + bytes = res->ga_Bytes; + zfree(&MallocPool, res, bytes); #ifdef DMALLOCDEBUG - --MallocCount; + --MallocCount; #endif - } + } } void * Calloc(size_t n1, size_t n2, const char *file, int line) { - uintptr_t bytes = (uintptr_t)n1 * (uintptr_t)n2; - void *res; + uintptr_t bytes = (uintptr_t)n1 * (uintptr_t)n2; + void *res; - if ((res = Malloc(bytes, file, line)) != NULL) { - bzero(res, bytes); + if ((res = Malloc(bytes, file, line)) != NULL) { + bzero(res, bytes); #ifdef DMALLOCDEBUG - if (++MallocCount > MallocMax) - MallocMax = MallocCount; + if (++MallocCount > MallocMax) + MallocMax = MallocCount; #endif - } - return(res); + } + return (res); } /* * realloc() - I could be fancier here and free the old buffer before - * allocating the new one (saving potential fragmentation + * allocating the new one (saving potential fragmentation * and potential buffer copies). But I don't bother. 
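Reallocf() below follows the usual FreeBSD reallocf(3) contract: when the resize fails it frees the original pointer, so a caller may overwrite its only reference without leaking. A short usage sketch (the helper name is illustrative only):

#include "stand.h"

static int
example_grow(char **bufp, size_t newsize)
{
	char *p;

	p = reallocf(*bufp, newsize);	/* old buffer is already freed on failure */
	if (p == NULL)
		return (ENOMEM);
	*bufp = p;
	return (0);
}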
*/ void * Realloc(void *ptr, size_t size, const char *file, int line) { - void *res; - size_t old; + void *res; + size_t old; - if ((res = Malloc(size, file, line)) != NULL) { - if (ptr) { - old = *(size_t *)((char *)ptr - MALLOCALIGN) - MALLOCALIGN; - if (old < size) - bcopy(ptr, res, old); - else - bcopy(ptr, res, size); - Free(ptr, file, line); - } else { + if ((res = Malloc(size, file, line)) != NULL) { + if (ptr != NULL) { + Guard *g = (Guard *)((char *)ptr - MALLOCALIGN); + + old = g->ga_Bytes - MALLOCALIGN; + if (old < size) + bcopy(ptr, res, old); + else + bcopy(ptr, res, size); + Free(ptr, file, line); + } else { #ifdef DMALLOCDEBUG - if (++MallocCount > MallocMax) - MallocMax = MallocCount; + if (++MallocCount > MallocMax) + MallocMax = MallocCount; #ifdef EXITSTATS - if (DidAtExit == 0) { - DidAtExit = 1; - atexit(mallocstats); - } + if (DidAtExit == 0) { + DidAtExit = 1; + atexit(mallocstats); + } #endif #endif + } } - } - return(res); + return (res); } void * Reallocf(void *ptr, size_t size, const char *file, int line) { - void *res; + void *res; - if ((res = Realloc(ptr, size, file, line)) == NULL) - Free(ptr, file, line); - return(res); + if ((res = Realloc(ptr, size, file, line)) == NULL) + Free(ptr, file, line); + return (res); } #ifdef DMALLOCDEBUG void mallocstats(void) { - printf("Active Allocations: %d/%d\n", MallocCount, MallocMax); + printf("Active Allocations: %d/%d\n", MallocCount, MallocMax); #ifdef ZALLOCDEBUG - zallocstats(&MallocPool); + zallocstats(&MallocPool); #endif } #endif - Index: projects/clang900-import/stand/libsa/zalloc_mem.h =================================================================== --- projects/clang900-import/stand/libsa/zalloc_mem.h (revision 352536) +++ projects/clang900-import/stand/libsa/zalloc_mem.h (revision 352537) @@ -1,53 +1,56 @@ /* - * This module derived from code donated to the FreeBSD Project by + * This module derived from code donated to the FreeBSD Project by * Matthew Dillon * * Copyright (c) 1998 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * H/MEM.H * * Basic memory pool / memory node structures. 
*/ +#ifndef _ZALLOC_MEM_H +#define _ZALLOC_MEM_H typedef struct MemNode { - struct MemNode *mr_Next; - uintptr_t mr_Bytes; + struct MemNode *mr_Next; + uintptr_t mr_Bytes; } MemNode; typedef struct MemPool { - void *mp_Base; - void *mp_End; - MemNode *mp_First; - uintptr_t mp_Size; - uintptr_t mp_Used; + void *mp_Base; + void *mp_End; + MemNode *mp_First; + uintptr_t mp_Size; + uintptr_t mp_Used; } MemPool; -#define ZNOTE_FREE 0 -#define ZNOTE_REUSE 1 +#define ZNOTE_FREE 0 +#define ZNOTE_REUSE 1 +#endif /* _ZALLOC_MEM_H */ Index: projects/clang900-import/stand/libsa/zalloc_protos.h =================================================================== --- projects/clang900-import/stand/libsa/zalloc_protos.h (revision 352536) +++ projects/clang900-import/stand/libsa/zalloc_protos.h (revision 352537) @@ -1,35 +1,40 @@ /* - * This module derived from code donated to the FreeBSD Project by + * This module derived from code donated to the FreeBSD Project by * Matthew Dillon * * Copyright (c) 1998 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ -Library void *znalloc(struct MemPool *mpool, uintptr_t bytes); +#ifndef _ZALLOC_PROTOS_H +#define _ZALLOC_PROTOS_H + +Library void *znalloc(struct MemPool *mpool, uintptr_t bytes, size_t align); Library void zfree(struct MemPool *mpool, void *ptr, uintptr_t bytes); Library void zextendPool(MemPool *mp, void *base, uintptr_t bytes); Library void zallocstats(struct MemPool *mp); + +#endif /* _ZALLOC_PROTOS_H */ Index: projects/clang900-import/stand/mips/uboot/Makefile =================================================================== --- projects/clang900-import/stand/mips/uboot/Makefile (revision 352536) +++ projects/clang900-import/stand/mips/uboot/Makefile (revision 352537) @@ -1,56 +1,60 @@ # $FreeBSD$ LOADER_CD9660_SUPPORT?= no LOADER_EXT2FS_SUPPORT?= no LOADER_MSDOS_SUPPORT?= yes LOADER_UFS_SUPPORT?= yes LOADER_NET_SUPPORT?= yes LOADER_NFS_SUPPORT?= yes LOADER_TFTP_SUPPORT?= no LOADER_GZIP_SUPPORT?= no LOADER_BZIP2_SUPPORT?= no .include FILES+= ubldr NEWVERSWHAT= "U-Boot loader" ${MACHINE_ARCH} INSTALLFLAGS= -b WARNS?= 1 # Address at which ubldr will be loaded. # This varies for different boards and SOCs. 
+.if ${MACHINE_ARCH:Mmips64*} UBLDR_LOADADDR?= 0xffffffff80800000 +.else +UBLDR_LOADADDR?= 0x80800000 +.endif # Architecture-specific loader code SRCS= start.S conf.c vers.c HELP_FILES= ${.CURDIR}/help.uboot ${BOOTSRC}/fdt/help.fdt # Always add MI sources .include "${BOOTSRC}/loader.mk" CFLAGS+= -g LDFLAGS= -nostdlib -static -T ${.CURDIR}/ldscript.${MACHINE_CPUARCH} .include "${BOOTSRC}/uboot.mk" DPADD= ${LDR_INTERP} ${LIBUBOOT} ${LIBFDT} ${LIBUBOOT_FDT} ${LIBSA} LDADD= ${LDR_INTERP} ${LIBUBOOT} ${LIBFDT} ${LIBUBOOT_FDT} ${LIBSA} OBJS+= ${SRCS:N*.h:R:S/$/.o/g} ldscript.abs: echo "UBLDR_LOADADDR = ${UBLDR_LOADADDR};" >${.TARGET} ldscript.pie: echo "UBLDR_LOADADDR = 0;" >${.TARGET} ubldr: ${OBJS} ldscript.abs ${.CURDIR}/ldscript.${MACHINE_CPUARCH} ${DPADD} ${CC} ${CFLAGS} -T ldscript.abs ${LDFLAGS} \ -o ${.TARGET} ${OBJS} ${LDADD} ${OBJCOPY} -S -O binary ubldr ubldr.bin CLEANFILES+= ldscript.abs ldscript.pie ubldr ubldr.pie ubldr.bin .include Index: projects/clang900-import/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c =================================================================== --- projects/clang900-import/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 352536) +++ projects/clang900-import/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 352537) @@ -1,3939 +1,3977 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 Datto Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * Grand theory statement on scan queue sorting * * Scanning is implemented by recursively traversing all indirection levels * in an object and reading all blocks referenced from said objects. This * results in us approximately traversing the object from lowest logical * offset to the highest. For best performance, we would want the logical * blocks to be physically contiguous. However, this is frequently not the * case with pools given the allocation patterns of copy-on-write filesystems. * So instead, we put the I/Os into a reordering queue and issue them in a * way that will most benefit physical disks (LBA-order). * * Queue management: * * Ideally, we would want to scan all metadata and queue up all block I/O * prior to starting to issue it, because that allows us to do an optimal * sorting job. 
This can however consume large amounts of memory. Therefore * we continuously monitor the size of the queues and constrain them to 5% * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this * limit, we clear out a few of the largest extents at the head of the queues * to make room for more scanning. Hopefully, these extents will be fairly * large and contiguous, allowing us to approach sequential I/O throughput * even without a fully sorted tree. * * Metadata scanning takes place in dsl_scan_visit(), which is called from * dsl_scan_sync() every spa_sync(). If we have either fully scanned all * metadata on the pool, or we need to make room in memory because our * queues are too large, dsl_scan_visit() is postponed and * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies * that metadata scanning and queued I/O issuing are mutually exclusive. This * allows us to provide maximum sequential I/O throughput for the majority of * I/O's issued since sequential I/O performance is significantly negatively * impacted if it is interleaved with random I/O. * * Implementation Notes * * One side effect of the queued scanning algorithm is that the scanning code * needs to be notified whenever a block is freed. This is needed to allow * the scanning code to remove these I/Os from the issuing queue. Additionally, * we do not attempt to queue gang blocks to be issued sequentially since this * is very hard to do and would have an extremely limitted performance benefit. * Instead, we simply issue gang I/Os as soon as we find them using the legacy * algorithm. * * Backwards compatibility * * This new algorithm is backwards compatible with the legacy on-disk data * structures (and therefore does not require a new feature flag). * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan * will stop scanning metadata (in logical order) and wait for all outstanding * sorted I/O to complete. Once this is done, we write out a checkpoint * bookmark, indicating that we have scanned everything logically before it. * If the pool is imported on a machine without the new sorting algorithm, * the scan simply resumes from the last checkpoint using the legacy algorithm. */ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; static int scan_ds_queue_compare(const void *a, const void *b); static int scan_prefetch_queue_compare(const void *a, const void *b); static void scan_ds_queue_clear(dsl_scan_t *scn); static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg); static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); extern int zfs_vdev_async_write_active_min_dirty_percent; /* * By default zfs will check to ensure it is not over the hard memory * limit before each txg. If finer-grained control of this is needed * this value can be set to 1 to enable checking before scanning each * block. */ int zfs_scan_strict_mem_lim = B_FALSE; /* * Maximum number of parallelly executing I/Os per top-level vdev. * Tune with care. Very high settings (hundreds) are known to trigger * some firmware bugs and resets on certain SSDs. 
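 * dsl_scan_prefetch_thread() derives a per-pool cap from this tunable as
 * (number of top-level vdev children * zfs_top_maxinflight).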
*/ int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver -- 2 is a good number */ unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub -- 4 is a good number */ unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ /* * Maximum number of parallelly executed bytes per leaf vdev. We attempt * to strike a balance here between keeping the vdev queues full of I/Os * at all times and not overflowing the queues to cause long latency, * which would cause long txg sync times. No matter what, we will not * overload the drives with I/O, since that is protected by * zfs_vdev_scrub_max_active. */ unsigned long zfs_scan_vdev_limit = 4 << 20; int zfs_scan_issue_strategy = 0; int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */ #define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval) /* * fill_weight is non-tunable at runtime, so we copy it at module init from * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would * break queue sorting. */ uint64_t zfs_scan_fill_weight = 3; static uint64_t fill_weight; /* See dsl_scan_should_clear() for details on the memory limit tunables */ uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ SYSCTL_DECL(_vfs_zfs); SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, &zfs_scan_idle, 0, "Idle scan window in clock ticks"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN, &zfs_scrub_min_time_ms, 0, "Min millisecs to scrub per txg"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, &zfs_no_scrub_io, 0, "Disable scrub I/O"); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_legacy, CTLFLAG_RWTUN, &zfs_scan_legacy, 0, "Scrub using legacy non-sequential method"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_checkpoint_interval, CTLFLAG_RWTUN, &zfs_scan_checkpoint_intval, 0, "Scan progress on-disk checkpointing interval"); enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max 
number of blocks to free in a single TXG */ uint64_t zfs_async_block_max_blocks = UINT64_MAX; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG"); /* * We wait a few txgs after importing a pool to begin scanning so that * the import / mounting code isn't held up by scrub / resilver IO. * Unfortunately, it is a bit difficult to determine exactly how long * this will take since userspace will trigger fs mounts asynchronously * and the kernel will create zvol minors asynchronously. As a result, * the value provided here is a bit arbitrary, but represents a * reasonable estimate of how many txgs it will take to finish fully * importing a pool */ #define SCAN_IMPORT_WAIT_TXGS 5 #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) extern int zfs_txg_timeout; /* * Enable/disable the processing of the free_bpobj object. */ boolean_t zfs_free_bpobj_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN, &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing"); /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; /* In core node for the scn->scn_queue. Represents a dataset to be scanned */ typedef struct { uint64_t sds_dsobj; uint64_t sds_txg; avl_node_t sds_node; } scan_ds_t; /* * This controls what conditions are placed on dsl_scan_sync_state(): * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise * write out the scn_phys_cached version. * See dsl_scan_sync_state for details. */ typedef enum { SYNC_OPTIONAL, SYNC_MANDATORY, SYNC_CACHED } state_sync_type_t; /* * This struct represents the minimum information needed to reconstruct a * zio for sequential scanning. This is useful because many of these will * accumulate in the sequential IO queues before being issued, so saving * memory matters here. */ typedef struct scan_io { /* fields from blkptr_t */ uint64_t sio_offset; uint64_t sio_blk_prop; uint64_t sio_phys_birth; uint64_t sio_birth; zio_cksum_t sio_cksum; uint32_t sio_asize; /* fields from zio_t */ int sio_flags; zbookmark_phys_t sio_zb; /* members for queue sorting */ union { avl_node_t sio_addr_node; /* link into issueing queue */ list_node_t sio_list_node; /* link for issuing to disk */ } sio_nodes; } scan_io_t; struct dsl_scan_io_queue { dsl_scan_t *q_scn; /* associated dsl_scan_t */ vdev_t *q_vd; /* top-level vdev that this queue represents */ /* trees used for sorting I/Os and extents of I/Os */ range_tree_t *q_exts_by_addr; avl_tree_t q_exts_by_size; avl_tree_t q_sios_by_addr; /* members for zio rate limiting */ uint64_t q_maxinflight_bytes; uint64_t q_inflight_bytes; kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ /* per txg statistics */ uint64_t q_total_seg_size_this_txg; uint64_t q_segs_this_txg; uint64_t q_total_zio_size_this_txg; uint64_t q_zios_this_txg; }; /* private data for dsl_scan_prefetch_cb() */ typedef struct scan_prefetch_ctx { refcount_t spc_refcnt; /* refcount for memory management */ dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ boolean_t spc_root; /* is this prefetch for an objset? 
*/ uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ } scan_prefetch_ctx_t; /* private data for dsl_scan_prefetch() */ typedef struct scan_prefetch_issue_ctx { avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ blkptr_t spic_bp; /* bp to prefetch */ zbookmark_phys_t spic_zb; /* bookmark to prefetch */ } scan_prefetch_issue_ctx_t; static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio); static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); static void scan_io_queues_destroy(dsl_scan_t *scn); static kmem_cache_t *sio_cache; void scan_init(void) { /* * This is used in ext_size_compare() to weight segments * based on how sparse they are. This cannot be changed * mid-scan and the tree comparison functions don't currently * have a mechansim for passing additional context to the * compare functions. Thus we store this value globally and * we only allow it to be set at module intiailization time */ fill_weight = zfs_scan_fill_weight; sio_cache = kmem_cache_create("sio_cache", sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void scan_fini(void) { kmem_cache_destroy(sio_cache); } static inline boolean_t dsl_scan_is_running(const dsl_scan_t *scn) { return (scn->scn_phys.scn_state == DSS_SCANNING); } boolean_t dsl_scan_resilvering(dsl_pool_t *dp) { return (dsl_scan_is_running(dp->dp_scan) && dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); } static inline void sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) { bzero(bp, sizeof (*bp)); DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); bp->blk_prop = sio->sio_blk_prop; bp->blk_phys_birth = sio->sio_phys_birth; bp->blk_birth = sio->sio_birth; bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; } static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { /* we discard the vdev id, since we can deduce it from the queue */ sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); sio->sio_blk_prop = bp->blk_prop; sio->sio_phys_birth = bp->blk_phys_birth; sio->sio_birth = bp->blk_birth; sio->sio_cksum = bp->blk_cksum; } void dsl_scan_global_init(void) { /* * This is used in ext_size_compare() to weight segments * based on how sparse they are. This cannot be changed * mid-scan and the tree comparison functions don't currently * have a mechansim for passing additional context to the * compare functions. Thus we store this value globally and * we only allow it to be set at module intiailization time */ fill_weight = zfs_scan_fill_weight; } int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { int err; dsl_scan_t *scn; spa_t *spa = dp->dp_spa; uint64_t f; scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); scn->scn_dp = dp; /* * It's possible that we're resuming a scan after a reboot so * make sure that the scan_async_destroying flag is initialized * appropriately. 
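 * (It is set just below from whether SPA_FEATURE_ASYNC_DESTROY is currently
 * active on the pool.)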
*/ ASSERT(!scn->scn_async_destroying); scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, sizeof (scan_prefetch_issue_ctx_t), offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { /* * There was an old-style scrub in progress. Restart a * new-style scrub from the beginning. */ scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress; " "restarting new-style scrub in txg %llu", (longlong_t)scn->scn_restart_txg); /* * Load the queue obj from the old location so that it * can be freed by dsl_scan_done(). */ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_queue", sizeof (uint64_t), 1, &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); if (err == ENOENT) return (0); else if (err) return (err); /* * We might be restarting after a reboot, so jump the issued * counter to how far we've scanned. We know we're consistent * up to here. */ scn->scn_issued_before_pass = scn->scn_phys.scn_examined; if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old * pool, and the pool was accessed by old * software. Restart from the beginning, since * the old software may have changed the pool in * the meantime. */ scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", (longlong_t)scn->scn_restart_txg); } } /* reload the queue into the in-core state */ if (scn->scn_phys.scn_queue_obj != 0) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, dp->dp_meta_objset, scn->scn_phys.scn_queue_obj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { scan_ds_queue_insert(scn, zfs_strtonum(za.za_name, NULL), za.za_first_integer); } zap_cursor_fini(&zc); } spa_scan_stat_init(spa); return (0); } void dsl_scan_fini(dsl_pool_t *dp) { if (dp->dp_scan != NULL) { dsl_scan_t *scn = dp->dp_scan; if (scn->scn_taskq != NULL) taskq_destroy(scn->scn_taskq); scan_ds_queue_clear(scn); avl_destroy(&scn->scn_queue); avl_destroy(&scn->scn_prefetch_queue); kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } static boolean_t dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) { return (scn->scn_restart_txg != 0 && scn->scn_restart_txg <= tx->tx_txg); } boolean_t dsl_scan_scrubbing(const dsl_pool_t *dp) { dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; return (scn_phys->scn_state == DSS_SCANNING && scn_phys->scn_func == POOL_SCAN_SCRUB); } boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn) { return (dsl_scan_scrubbing(scn->scn_dp) && scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); } /* * Writes out a persistent dsl_scan_phys_t record to the pool directory. * Because we can be running in the block sorting algorithm, we do not always * want to write out the record, only when it is "safe" to do so. This safety * condition is achieved by making sure that the sorting queues are empty * (scn_bytes_pending == 0). 
When this condition is not true, the sync'd state * is inconsistent with how much actual scanning progress has been made. The * kind of sync to be performed is specified by the sync_type argument. If the * sync is optional, we only sync if the queues are empty. If the sync is * mandatory, we do a hard ASSERT to make sure that the queues are empty. The * third possible state is a "cached" sync. This is done in response to: * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been * destroyed, so we wouldn't be able to restart scanning from it. * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been * superseded by a newer snapshot. * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been * swapped with its clone. * In all cases, a cached sync simply rewrites the last record we've written, * just slightly modified. For the modifications that are performed to the * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. */ static void dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) { int i; spa_t *spa = scn->scn_dp->dp_spa; ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); if (scn->scn_bytes_pending == 0) { for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; if (q == NULL) continue; mutex_enter(&vd->vdev_scan_io_queue_lock); ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); mutex_exit(&vd->vdev_scan_io_queue_lock); } if (scn->scn_phys.scn_queue_obj != 0) scan_ds_queue_sync(scn, tx); VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); if (scn->scn_checkpointing) zfs_dbgmsg("finish scan checkpoint"); scn->scn_checkpointing = B_FALSE; scn->scn_last_checkpoint = ddi_get_lbolt(); } else if (sync_type == SYNC_CACHED) { VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys_cached, tx)); } } /* ARGSUSED */ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (dsl_scan_is_running(scn)) return (SET_ERROR(EBUSY)); return (0); } static void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; scn->scn_phys.scn_max_txg = tx->tx_txg; scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; scn->scn_issued_before_pass = 0; scn->scn_restart_txg = 0; scn->scn_done_txg = 0; scn->scn_last_checkpoint = 0; scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; /* rewrite all disk labels */ 
vdev_config_dirty(spa->spa_root_vdev); if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { spa_event_notify(spa, NULL, NULL, ESC_ZFS_RESILVER_START); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); } spa->spa_scrub_started = B_TRUE; /* * If this is an incremental scrub, limit the DDT scrub phase * to just the auto-ditto class (for correctness); the rest * of the scrub should go faster using top-down pruning. */ if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; } /* back to the generic stuff */ if (dp->dp_blkstats == NULL) { dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); mutex_init(&dp->dp_blkstats->zab_lock, NULL, MUTEX_DEFAULT, NULL); } bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); } /* * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. * Can also be called to resume a paused scrub. */ int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; /* * Purge all vdev caches and probe all devices. We do this here * rather than in sync context because this requires a writer lock * on the spa_config lock, which we can't do from sync context. The * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ spa_vdev_state_enter(spa, SCL_NONE); spa->spa_scrub_reopen = B_TRUE; vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { /* got scrub start cmd, resume paused scrub */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, POOL_SCRUB_NORMAL); if (err == 0) { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); return (ECANCELED); } return (SET_ERROR(err)); } return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { static const char *old_names[] = { "scrub_bookmark", "scrub_ddt_bookmark", "scrub_ddt_class_max", "scrub_queue", "scrub_min_txg", "scrub_max_txg", "scrub_func", "scrub_errors", NULL }; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; int i; /* Remove any remnants of an old-style scrub. */ for (i = 0; old_names[i]; i++) { (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); } if (scn->scn_phys.scn_queue_obj != 0) { VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } scan_ds_queue_clear(scn); scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; /* * If we were "restarted" from a stopped state, don't bother * with anything else. */ if (!dsl_scan_is_running(scn)) { ASSERT(!scn->scn_is_sorted); return; } if (scn->scn_is_sorted) { scan_io_queues_destroy(scn); scn->scn_is_sorted = B_FALSE; if (scn->scn_taskq != NULL) { taskq_destroy(scn->scn_taskq); scn->scn_taskq = NULL; } } scn->scn_phys.scn_state = complete ? 
DSS_FINISHED : DSS_CANCELED; if (dsl_scan_restarting(scn, tx)) spa_history_log_internal(spa, "scan aborted, restarting", tx, "errors=%llu", spa_get_errlog_size(spa)); else if (!complete) spa_history_log_internal(spa, "scan cancelled", tx, "errors=%llu", spa_get_errlog_size(spa)); else spa_history_log_internal(spa, "scan done", tx, "errors=%llu", spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; /* * If the scrub/resilver completed, update all DTLs to * reflect this. Whether it succeeded or not, vacate * all temporary scrub DTLs. * * As the scrub does not currently support traversing * data that have been freed but are part of a checkpoint, * we don't mark the scrub as done in the DTLs as faults * may still exist in those vdevs. */ if (complete && !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, scn->scn_phys.scn_max_txg, B_TRUE); spa_event_notify(spa, NULL, NULL, scn->scn_phys.scn_min_txg ? ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); } else { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 0, B_TRUE); } spa_errlog_rotate(spa); /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); } scn->scn_phys.scn_end_time = gethrestime_sec(); ASSERT(!dsl_scan_is_running(scn)); } /* ARGSUSED */ static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (!dsl_scan_is_running(scn)) return (SET_ERROR(ENOENT)); return (0); } /* ARGSUSED */ static void dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); } int dsl_scan_cancel(dsl_pool_t *dp) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } static int dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* can't pause a scrub when there is no in-progress scrub */ if (!dsl_scan_scrubbing(dp)) return (SET_ERROR(ENOENT)); /* can't pause a paused scrub */ if (dsl_scan_is_paused_scrub(scn)) return (SET_ERROR(EBUSY)); } else if (*cmd != POOL_SCRUB_NORMAL) { return (SET_ERROR(ENOTSUP)); } return (0); } static void dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* can't pause a scrub when there is no in-progress scrub */ spa->spa_scan_pass_scrub_pause = gethrestime_sec(); scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx, SYNC_CACHED); spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); if (dsl_scan_is_paused_scrub(scn)) { /* * We need to keep track of how much time we spend * paused per pass so that we can adjust the scrub rate * shown in the output of 'zpool status' */ spa->spa_scan_pass_scrub_spent_paused += gethrestime_sec() - spa->spa_scan_pass_scrub_pause; spa->spa_scan_pass_scrub_pause = 0; scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx, SYNC_CACHED); } } } /* * Set scrub pause/resume state if it makes sense to do so 
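 * (dsl_scrub_pause_resume_check() rejects pausing when no scrub is in
 * progress (ENOENT), pausing an already-paused scrub (EBUSY), and any
 * command other than pause or resume (ENOTSUP).)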
*/ int dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); } /* start a new scan, or restart an existing one. */ void dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; dmu_tx_commit(tx); } else { dp->dp_scan->scn_restart_txg = txg; } zfs_dbgmsg("restarting resilver txg=%llu", txg); } void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { zio_free(dp->dp_spa, txg, bp); } void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), pio->io_flags)); } static int scan_ds_queue_compare(const void *a, const void *b) { const scan_ds_t *sds_a = a, *sds_b = b; if (sds_a->sds_dsobj < sds_b->sds_dsobj) return (-1); if (sds_a->sds_dsobj == sds_b->sds_dsobj) return (0); return (1); } static void scan_ds_queue_clear(dsl_scan_t *scn) { void *cookie = NULL; scan_ds_t *sds; while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { kmem_free(sds, sizeof (*sds)); } } static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) { scan_ds_t srch, *sds; srch.sds_dsobj = dsobj; sds = avl_find(&scn->scn_queue, &srch, NULL); if (sds != NULL && txg != NULL) *txg = sds->sds_txg; return (sds != NULL); } static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) { scan_ds_t *sds; avl_index_t where; sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); sds->sds_dsobj = dsobj; sds->sds_txg = txg; VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); avl_insert(&scn->scn_queue, sds, where); } static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) { scan_ds_t srch, *sds; srch.sds_dsobj = dsobj; sds = avl_find(&scn->scn_queue, &srch, NULL); VERIFY(sds != NULL); avl_remove(&scn->scn_queue, sds); kmem_free(sds, sizeof (*sds)); } static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; ASSERT0(scn->scn_bytes_pending); ASSERT(scn->scn_phys.scn_queue_obj != 0); VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, DMU_OT_NONE, 0, tx); for (scan_ds_t *sds = avl_first(&scn->scn_queue); sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { VERIFY0(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, sds->sds_dsobj, sds->sds_txg, tx)); } } /* * Computes the memory limit state that we're currently in. A sorted scan * needs quite a bit of memory to hold the sorting queue, so we need to * reasonably constrain the size so it doesn't impact overall system * performance. We compute two limits: * 1) Hard memory limit: if the amount of memory used by the sorting * queues on a pool gets above this value, we stop the metadata * scanning portion and start issuing the queued up and sorted * I/Os to reduce memory usage. * This limit is calculated as a fraction of physmem (by default 5%). * We constrain the lower bound of the hard limit to an absolute * minimum of zfs_scan_mem_lim_min (default: 16 MiB). 
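 * (Purely illustrative: with 8 GiB of physmem and the default
 * zfs_scan_mem_lim_fact of 20, the hard limit works out to about
 * 8 GiB / 20 = ~410 MiB, well above the 16 MiB floor.)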
We also constrain * the upper bound to 5% of the total pool size - no chance we'll * ever need that much memory, but just to keep the value in check. * 2) Soft memory limit: once we hit the hard memory limit, we start * issuing I/O to reduce queue memory usage, but we don't want to * completely empty out the queues, since we might be able to find I/Os * that will fill in the gaps of our non-sequential IOs at some point * in the future. So we stop the issuing of I/Os once the amount of * memory used drops below the soft limit (at which point we stop issuing * I/O and start scanning metadata again). * * This limit is calculated by subtracting a fraction of the hard * limit from the hard limit. By default this fraction is 5%, so * the soft limit is 95% of the hard limit. We cap the size of the * difference between the hard and soft limits at an absolute * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is * sufficient to not cause too frequent switching between the * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's * worth of queues is about 1.2 GiB of on-pool data, so scanning * that should take at least a decent fraction of a second). */ static boolean_t dsl_scan_should_clear(dsl_scan_t *scn) { vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; uint64_t mlim_hard, mlim_soft, mused; uint64_t alloc = metaslab_class_get_alloc(spa_normal_class( scn->scn_dp->dp_spa)); mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, zfs_scan_mem_lim_min); mlim_hard = MIN(mlim_hard, alloc / 20); mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, zfs_scan_mem_lim_soft_max); mused = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *tvd = rvd->vdev_child[i]; dsl_scan_io_queue_t *queue; mutex_enter(&tvd->vdev_scan_io_queue_lock); queue = tvd->vdev_scan_io_queue; if (queue != NULL) { /* #extents in exts_by_size = # in exts_by_addr */ mused += avl_numnodes(&queue->q_exts_by_size) * sizeof (range_seg_t) + avl_numnodes(&queue->q_sios_by_addr) * sizeof (scan_io_t); } mutex_exit(&tvd->vdev_scan_io_queue_lock); } dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); if (mused == 0) ASSERT0(scn->scn_bytes_pending); /* * If we are above our hard limit, we need to clear out memory. * If we are below our soft limit, we need to accumulate sequential IOs. * Otherwise, we should keep doing whatever we are currently doing. */ if (mused >= mlim_hard) return (B_TRUE); else if (mused < mlim_soft) return (B_FALSE); else return (scn->scn_clearing); } static boolean_t dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) { /* we never skip user/group accounting objects */ if (zb && (int64_t)zb->zb_object < 0) return (B_FALSE); if (scn->scn_suspending) return (B_TRUE); /* we're already suspending */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ if (zb && zb->zb_level != 0) return (B_FALSE); /* * We suspend if: * - we have scanned for at least the minimum time (default 1 sec * for scrub, 3 sec for resilver), and either we have sufficient * dirty data that we are starting to write more quickly * (default 30%), or someone is explicitly waiting for this txg * to complete. * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. 
* or * - the scan queue has reached its memory use limit */ uint64_t elapsed_nanosecs = gethrtime(); uint64_t curr_time_ns = gethrtime(); uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; if ((NSEC2MSEC(scan_time_ns) > mintime && (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa) || (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { if (zb) { dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; } else { dsl_scan_phys_t *scnp = &scn->scn_phys; dprintf("suspending at at DDT bookmark " "%llx/%llx/%llx/%llx\n", (longlong_t)scnp->scn_ddt_bookmark.ddb_class, (longlong_t)scnp->scn_ddt_bookmark.ddb_type, (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); } scn->scn_suspending = B_TRUE; return (B_TRUE); } return (B_FALSE); } typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; } zil_scan_arg_t; /* ARGSUSED */ static int dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; zbookmark_phys_t zb; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * One block ("stubby") can be allocated a long time ago; we * want to visit that one because it has been allocated * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); return (0); } /* ARGSUSED */ static int dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * birth can be < claim_txg if this record's txg is * already txg sync'ed (but this log block contains * other records that are not synced) */ if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); } return (0); } static void dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; ASSERT(spa_writeable(dp->dp_spa)); /* * We only want to visit blocks that have been claimed * but not yet replayed. 
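 * (A claim_txg of zero means this ZIL was never claimed, so there is
 * nothing for the scan to visit and we return immediately.)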
*/ if (claim_txg == 0) return; zilog = zil_alloc(dp->dp_meta_objset, zh); (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, claim_txg); zil_free(zilog); } /* * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea * here is to sort the AVL tree by the order each block will be needed. */ static int scan_prefetch_queue_compare(const void *a, const void *b) { const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; return (zbookmark_compare(spc_a->spc_datablkszsec, spc_a->spc_indblkshift, spc_b->spc_datablkszsec, spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); } static void scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) { if (refcount_remove(&spc->spc_refcnt, tag) == 0) { refcount_destroy(&spc->spc_refcnt); kmem_free(spc, sizeof (scan_prefetch_ctx_t)); } } static scan_prefetch_ctx_t * scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) { scan_prefetch_ctx_t *spc; spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); refcount_create(&spc->spc_refcnt); refcount_add(&spc->spc_refcnt, tag); spc->spc_scn = scn; if (dnp != NULL) { spc->spc_datablkszsec = dnp->dn_datablkszsec; spc->spc_indblkshift = dnp->dn_indblkshift; spc->spc_root = B_FALSE; } else { spc->spc_datablkszsec = 0; spc->spc_indblkshift = 0; spc->spc_root = B_TRUE; } return (spc); } static void scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) { refcount_add(&spc->spc_refcnt, tag); } static boolean_t dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, const zbookmark_phys_t *zb) { zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; dnode_phys_t tmp_dnp; dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp; if (zb->zb_objset != last_zb->zb_objset) return (B_TRUE); if ((int64_t)zb->zb_object < 0) return (B_FALSE); tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; tmp_dnp.dn_indblkshift = spc->spc_indblkshift; if (zbookmark_subtree_completed(dnp, zb, last_zb)) return (B_TRUE); return (B_FALSE); } static void dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) { avl_index_t idx; dsl_scan_t *scn = spc->spc_scn; spa_t *spa = scn->scn_dp->dp_spa; scan_prefetch_issue_ctx_t *spic; if (zfs_no_scrub_prefetch) return; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; if (dsl_scan_check_prefetch_resume(spc, zb)) return; scan_prefetch_ctx_add_ref(spc, scn); spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); spic->spic_spc = spc; spic->spic_bp = *bp; spic->spic_zb = *zb; /* * Add the IO to the queue of blocks to prefetch. This allows us to * prioritize blocks that we will need first for the main traversal * thread. 
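 * The queue is an AVL tree ordered by bookmark (see
 * scan_prefetch_queue_compare()), so the prefetch thread always issues the
 * block the traversal will need soonest; a bookmark already present in the
 * tree is simply dropped below.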
*/ mutex_enter(&spa->spa_scrub_lock); if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { /* this block is already queued for prefetch */ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); scan_prefetch_ctx_rele(spc, scn); mutex_exit(&spa->spa_scrub_lock); return; } avl_insert(&scn->scn_prefetch_queue, spic, idx); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } static void dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int i; zbookmark_phys_t zb; scan_prefetch_ctx_t *spc; if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) return; SET_BOOKMARK(&zb, objset, object, 0, 0); spc = scan_prefetch_ctx_create(scn, dnp, FTAG); for (i = 0; i < dnp->dn_nblkptr; i++) { zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); zb.zb_blkid = i; dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zb.zb_level = 0; zb.zb_blkid = DMU_SPILL_BLKID; dsl_scan_prefetch(spc, &dnp->dn_spill, &zb); } scan_prefetch_ctx_rele(spc, FTAG); } void dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *private) { scan_prefetch_ctx_t *spc = private; dsl_scan_t *scn = spc->spc_scn; spa_t *spa = scn->scn_dp->dp_spa; /* broadcast that the IO has completed for rate limitting purposes */ mutex_enter(&spa->spa_scrub_lock); ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); /* if there was an error or we are done prefetching, just cleanup */ if (buf == NULL || scn->scn_suspending) goto out; if (BP_GET_LEVEL(bp) > 0) { int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; zbookmark_phys_t czb; for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_prefetch(spc, cbp, &czb); } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { dnode_phys_t *cdnp = buf->b_data; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; for (i = 0, cdnp = buf->b_data; i < epb; i += cdnp->dn_extra_slots + 1, cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_prefetch_dnode(scn, cdnp, zb->zb_objset, zb->zb_blkid * epb + i); } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { objset_phys_t *osp = buf->b_data; dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (OBJSET_BUF_HAS_USERUSED(buf)) { dsl_scan_prefetch_dnode(scn, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); dsl_scan_prefetch_dnode(scn, &osp->os_userused_dnode, zb->zb_objset, DMU_USERUSED_OBJECT); } } out: if (buf != NULL) arc_buf_destroy(buf, private); scan_prefetch_ctx_rele(spc, scn); } /* ARGSUSED */ static void dsl_scan_prefetch_thread(void *arg) { dsl_scan_t *scn = arg; spa_t *spa = scn->scn_dp->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; scan_prefetch_issue_ctx_t *spic; /* loop until we are told to stop */ while (!scn->scn_prefetch_stop) { arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; mutex_enter(&spa->spa_scrub_lock); /* * Wait until we have an IO to issue and are not above our * maximum in flight limit. 
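 * The limit is accounted in bytes: spa_scrub_inflight is charged with
 * BP_GET_PSIZE() when a prefetch is issued and credited back in
 * dsl_scan_prefetch_cb(), and issuing pauses once it reaches
 * scn_maxinflight_bytes.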
*/ while (!scn->scn_prefetch_stop && (avl_numnodes(&scn->scn_prefetch_queue) == 0 || spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } /* recheck if we should stop since we waited for the cv */ if (scn->scn_prefetch_stop) { mutex_exit(&spa->spa_scrub_lock); break; } /* remove the prefetch IO from the tree */ spic = avl_first(&scn->scn_prefetch_queue); spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); avl_remove(&scn->scn_prefetch_queue, spic); mutex_exit(&spa->spa_scrub_lock); /* issue the prefetch asynchronously */ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } ASSERT(scn->scn_prefetch_stop); /* free any prefetches we didn't get to complete */ mutex_enter(&spa->spa_scrub_lock); while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { avl_remove(&scn->scn_prefetch_queue, spic); scan_prefetch_ctx_rele(spic->spic_spc, scn); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); mutex_exit(&spa->spa_scrub_lock); } static boolean_t dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { /* * We never skip over user/group accounting objects (obj<0) */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ if (zbookmark_subtree_completed(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* * If we found the block we're trying to resume from, or * we went past it to a different object, zero it out to * indicate that it's OK to start checking for suspending * again. */ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { dprintf("resuming at %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); } } return (B_FALSE); } static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); static void dsl_scan_visitdnode( dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. 
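 * Indirect, dnode and objset blocks are read with ARC_FLAG_WAIT and each
 * child is handed to dsl_scan_visitbp()/dsl_scan_visitdnode(); a read error
 * bumps scn_errors and is returned to the caller.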
*/ static int dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cdnp = buf->b_data; i < epb; i += cdnp->dn_extra_slots + 1, cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } osp = buf->b_data; dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); if (OBJSET_BUF_HAS_USERUSED(buf)) { /* * We also always visit user/group accounting * objects, and never skip them, even if we are * suspending. This is necessary so that the space * deltas from this txg get integrated. */ dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_groupused_dnode, DMU_GROUPUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); } return (0); } static void dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx) { int j; for (j = 0; j < dnp->dn_nblkptr; j++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, dnp->dn_nlevels - 1, j); dsl_scan_visitbp(&dnp->dn_blkptr[j], &czb, dnp, ds, scn, ostype, tx); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp), &czb, dnp, ds, scn, ostype, tx); } } /* * The arguments are in this order because mdb can only print the * first 5; we want them to be useful. */ static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; blkptr_t *bp_toread = NULL; if (dsl_scan_check_suspend(scn, zb)) return; if (dsl_scan_check_resume(scn, dnp, zb)) return; scn->scn_visited_this_txg++; dprintf_bp(bp, "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", ds, ds ? 
ds->ds_object : 0, zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, bp); if (BP_IS_HOLE(bp)) { scn->scn_holes_this_txg++; return; } if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; } bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); *bp_toread = *bp; if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) return; /* * If dsl_scan_ddt() has already visited this block, it will have * already done any translations or scrubbing, so don't call the * callback again. */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { scn->scn_ddt_contained_this_txg++; goto out; } /* * If this block is from the future (after cur_max_txg), then we * are doing this on behalf of a deleted snapshot, and we will * revisit the future block on the next pass of this dataset. * Don't scan it now unless we need to because something * under it was modified. */ if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { scn->scn_gt_max_this_txg++; goto out; } scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); out: kmem_free(bp_toread, sizeof (blkptr_t)); } static void dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { zbookmark_phys_t zb; scan_prefetch_ctx_t *spc; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { SET_BOOKMARK(&scn->scn_prefetch_bookmark, zb.zb_objset, 0, 0, 0); } else { scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; } scn->scn_objsets_visited_this_txg++; spc = scan_prefetch_ctx_create(scn, NULL, FTAG); dsl_scan_prefetch(spc, bp, &zb); scan_prefetch_ctx_rele(spc, FTAG); dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); dprintf_ds(ds, "finished scan%s", ""); } static void ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) { if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { if (ds->ds_is_snapshot) { /* * Note: * - scn_cur_{min,max}_txg stays the same. * - Setting the flag is not really necessary if * scn_cur_max_txg == scn_max_txg, because there * is nothing after this snapshot that we care * about. However, we set it anyway and then * ignore it when we retraverse it in * dsl_scan_visitds(). */ scn_phys->scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; } else { SET_BOOKMARK(&scn_phys->scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object); } } } /* * Invoked when a dataset is destroyed. We need to make sure that: * * 1) If it is the dataset that was currently being scanned, we write * a new dsl_scan_phys_t and marking the objset reference in it * as destroyed. * 2) Remove it from the work queue, if it was present. * * If the dataset was actually a snapshot, instead of marking the dataset * as destroyed, we instead substitute the next snapshot in line. 
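 * Both copies of the work queue are updated: the in-memory scn_queue and,
 * if the dataset is present there, the on-disk ZAP queue object; the
 * resulting state is then written out via dsl_scan_sync_state(...,
 * SYNC_CACHED).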
*/ void dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (!dsl_scan_is_running(scn)) return; ds_destroyed_scn_phys(ds, &scn->scn_phys); ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { scan_ds_queue_remove(scn, ds->ds_object); if (ds->ds_is_snapshot) scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); if (ds->ds_is_snapshot) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was * deleted too. */ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("destroying ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); } else { zfs_dbgmsg("destroying ds %llu; in queue; removing", (u_longlong_t)ds->ds_object); } } /* * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here. */ dsl_scan_sync_state(scn, tx, SYNC_CACHED); } static void ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) { if (scn_bookmark->zb_objset == ds->ds_object) { scn_bookmark->zb_objset = dsl_dataset_phys(ds)->ds_prev_snap_obj; zfs_dbgmsg("snapshotting ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } } /* * Called when a dataset is snapshotted. If we were currently traversing * this snapshot, we reset our bookmark to point at the newly created * snapshot. We also modify our work queue to remove the old snapshot and * replace with the new one. 
*/ void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (!dsl_scan_is_running(scn)) return; ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { scan_ds_queue_remove(scn, ds->ds_object); scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("snapshotting ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } dsl_scan_sync_state(scn, tx, SYNC_CACHED); } static void ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, zbookmark_phys_t *scn_bookmark) { if (scn_bookmark->zb_objset == ds1->ds_object) { scn_bookmark->zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (scn_bookmark->zb_objset == ds2->ds_object) { scn_bookmark->zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } } /* - * Called when a parent dataset and its clone are swapped. If we were + * Called when an origin dataset and its clone are swapped. If we were * currently traversing the dataset, we need to switch to traversing the - * newly promoted parent. + * newly promoted clone. */ void dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) { dsl_pool_t *dp = ds1->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; + uint64_t mintxg1, mintxg2; + boolean_t ds1_queued, ds2_queued; if (!dsl_scan_is_running(scn)) return; ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); - if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) { - scan_ds_queue_remove(scn, ds1->ds_object); - scan_ds_queue_insert(scn, ds2->ds_object, mintxg); + /* + * Handle the in-memory scan queue. + */ + ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1); + ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2); + + /* Sanity checking. */ + if (ds1_queued) { + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } - if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) { + if (ds2_queued) { + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + + if (ds1_queued && ds2_queued) { + /* + * If both are queued, we don't need to do anything. + * The swapping code below would not handle this case correctly, + * since we can't insert ds2 if it is already there. That's + * because scan_ds_queue_insert() prohibits a duplicate insert + * and panics. 
+ */ + } else if (ds1_queued) { + scan_ds_queue_remove(scn, ds1->ds_object); + scan_ds_queue_insert(scn, ds2->ds_object, mintxg1); + } else if (ds2_queued) { scan_ds_queue_remove(scn, ds2->ds_object); - scan_ds_queue_insert(scn, ds1->ds_object, mintxg); + scan_ds_queue_insert(scn, ds1->ds_object, mintxg2); } - if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds1->ds_object, &mintxg) == 0) { - int err; - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + /* + * Handle the on-disk scan queue. + * The on-disk state is an out-of-date version of the in-memory state, + * so the in-memory and on-disk values for ds1_queued and ds2_queued may + * be different. Therefore we need to apply the swap logic to the + * on-disk state independently of the in-memory state. + */ + ds1_queued = zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0; + ds2_queued = zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0; + + /* Sanity checking. */ + if (ds1_queued) { + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + if (ds2_queued) { + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + + if (ds1_queued && ds2_queued) { + /* + * If both are queued, we don't need to do anything. + * Alternatively, we could check for EEXIST from + * zap_add_int_key() and back out to the original state, but + * that would be more work than checking for this case upfront. + */ + } else if (ds1_queued) { + VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); - err = zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); - VERIFY(err == 0 || err == EEXIST); - if (err == EEXIST) { - /* Both were there to begin with */ - VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, - ds1->ds_object, mintxg, tx)); - } + VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx)); zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); - } - if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds2->ds_object, &mintxg) == 0) { - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + } else if (ds2_queued) { + VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); - VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); + VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx)); zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } dsl_scan_sync_state(scn, tx, SYNC_CACHED); } /* ARGSUSED */ static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { uint64_t originobj = *(uint64_t *)arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) 
return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); dsl_dataset_rele(ds, FTAG); if (err) return (err); ds = prev; } scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (scn->scn_phys.scn_cur_min_txg >= scn->scn_phys.scn_max_txg) { /* * This can happen if this snapshot was created after the * scan started, and we already completed a previous snapshot * that was created after the scan started. This snapshot * only references blocks with: * * birth < our ds_creation_txg * cur_min_txg is no less than ds_creation_txg. * We have already visited these blocks. * or * birth > scn_max_txg * The scan requested not to visit these blocks. * * Subsequent snapshots (and clones) can reference our * blocks, or blocks with even higher birth times. * Therefore we do not need to visit them either, * so we do not add them to the work queue. * * Note that checking for cur_min_txg >= cur_max_txg * is not sufficient, because in that case we may need to * visit subsequent snapshots. This happens when min_txg > 0, * which raises cur_min_txg. In this case we will visit * this dataset but skip all of its blocks, because the * rootbp's birth time is < cur_min_txg. Then we will * add the next snapshots/clones to the work queue. */ char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " "cur_min_txg (%llu) >= max_txg (%llu)", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_max_txg); kmem_free(dsname, MAXNAMELEN); goto out; } /* * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same * BP as in the head), they must be ignored. In addition, $ORIGIN * doesn't have a objset (i.e. its ds_bp is a hole) so we don't * need to look for a ZIL in it either. So we traverse the ZIL here, * rather than in scan_recurse(), because the regular snapshot * block-sharing rules don't apply to it. */ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) && (dp->dp_origin_snap == NULL || ds->ds_dir != dp->dp_origin_snap->ds_dir)) { objset_t *os; if (dmu_objset_from_ds(ds, &os) != 0) { goto out; } dsl_scan_zil(dp, &os->os_zil_header); } /* * Iterate over the bps in this ds. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " "suspending=%u", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_cur_max_txg, (int)scn->scn_suspending); kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); if (scn->scn_suspending) goto out; /* * We've finished this pass over this dataset. */ /* * If we did not completely visit this dataset, do another pass. 
*/ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass; visiting again"); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; scan_ds_queue_insert(scn, ds->ds_object, scn->scn_phys.scn_cur_max_txg); goto out; } /* * Add descendent datasets to work queue. */ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, dsl_dataset_phys(ds)->ds_creation_txg); } if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { uint64_t count; /* * A bug in a previous version of the code could * cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a * missing entry. Therefore we can only use the * next_clones_obj when its count is correct. */ int err = zap_count(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj, &count); if (err == 0 && count == dsl_dataset_phys(ds)->ds_num_children - 1) usenext = B_TRUE; } if (usenext) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { scan_ds_queue_insert(scn, zfs_strtonum(za.za_name, NULL), dsl_dataset_phys(ds)->ds_creation_txg); } zap_cursor_fini(&zc); } else { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_clones_cb, &ds->ds_object, DS_FIND_CHILDREN)); } } out: dsl_dataset_rele(ds, FTAG); } /* ARGSUSED */ static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } /* * If this is a clone, we don't need to worry about it for now. */ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(prev, FTAG); return (0); } dsl_dataset_rele(ds, FTAG); ds = prev; } scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); dsl_dataset_rele(ds, FTAG); return (0); } /* ARGSUSED */ void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx) { const ddt_key_t *ddk = &dde->dde_key; ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; zbookmark_phys_t zb = { 0 }; int p; if (scn->scn_phys.scn_state != DSS_SCANNING) return; for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) continue; ddt_bp_create(checksum, ddk, ddp, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); } } /* * Scrub/dedup interaction. * * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * * We leverage the fact that the dde's replication class (enum ddt_class) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. * * To prevent excess scrubbing, the scrub begins by walking the DDT * to find all blocks with refcnt > 1, and scrubs each of these once. * Since there are two replication classes which contain blocks with * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. 
* Finally the top-down scrub begins, only visiting blocks with refcnt == 1. * * There would be nothing more to say if a block's refcnt couldn't change * during a scrub, but of course it can so we must account for changes * in a block's replication class. * * Here's an example of what can occur: * * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 * when visited during the top-down scrub phase, it will be scrubbed twice. * This negates our scrub optimization, but is otherwise harmless. * * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 * on each visit during the top-down scrub phase, it will never be scrubbed. * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 * while a scrub is in progress, it scrubs the block right then. */ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; ddt_entry_t dde = { 0 }; int error; uint64_t n = 0; while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { ddt_t *ddt; if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) break; dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", (longlong_t)ddb->ddb_class, (longlong_t)ddb->ddb_type, (longlong_t)ddb->ddb_checksum, (longlong_t)ddb->ddb_cursor); /* There should be no pending changes to the dedup table */ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); n++; if (dsl_scan_check_suspend(scn, NULL)) break; } zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; " "suspending=%u", (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); ASSERT(error == 0 || error == ENOENT); ASSERT(error != ENOENT || ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); } static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; if (ds->ds_is_snapshot) return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); return (smt); } static void dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) { scan_ds_t *sds; dsl_pool_t *dp = scn->scn_dp; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_ddt(scn, tx); if (scn->scn_suspending) return; } if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { /* First do the MOS & ORIGIN */ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_suspending) return; if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_cb, NULL, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); } ASSERT(!scn->scn_suspending); } else if (scn->scn_phys.scn_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset; /* * If we were suspended, continue from here. Note if the * ds we were suspended on was deleted, the zb_objset may * be -1, so we will skip this and find a new objset * below. 
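 * (The -1 value is ZB_DESTROYED_OBJSET, which dsl_scan_ds_destroyed() stores
 * in the bookmark when a non-snapshot dataset that was being scanned is
 * destroyed.)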
*/ dsl_scan_visitds(scn, dsobj, tx); if (scn->scn_suspending) return; } /* * In case we suspended right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); /* * Keep pulling things out of the dataset avl queue. Updates to the * persistent zap-object-as-queue happen only at checkpoints. */ while ((sds = avl_first(&scn->scn_queue)) != NULL) { dsl_dataset_t *ds; uint64_t dsobj = sds->sds_dsobj; uint64_t txg = sds->sds_txg; /* dequeue and free the ds from the queue */ scan_ds_queue_remove(scn, dsobj); sds = NULL; /* must not be touched after removal */ /* Set up min / max txg */ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (txg != 0) { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, txg); } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, dsl_dataset_phys(ds)->ds_prev_snap_txg); } scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); dsl_dataset_rele(ds, FTAG); dsl_scan_visitds(scn, dsobj, tx); if (scn->scn_suspending) return; } /* No more objsets to fetch, we're done */ scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; ASSERT0(scn->scn_suspending); } static uint64_t dsl_scan_count_leaves(vdev_t *vd) { uint64_t i, leaves = 0; /* we only count leaves that belong to the main pool and are readable */ if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache || !vdev_readable(vd)) return (0); if (vd->vdev_ops->vdev_op_leaf) return (1); for (i = 0; i < vd->vdev_children; i++) { leaves += dsl_scan_count_leaves(vd->vdev_child[i]); } return (leaves); } static void scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) { int i; uint64_t cur_size = 0; for (i = 0; i < BP_GET_NDVAS(bp); i++) { cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); } q->q_total_zio_size_this_txg += cur_size; q->q_zios_this_txg++; } static void scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, uint64_t end) { q->q_total_seg_size_this_txg += end - start; q->q_segs_this_txg++; } static boolean_t scan_io_queue_check_suspend(dsl_scan_t *scn) { /* See comment in dsl_scan_check_suspend() */ uint64_t curr_time_ns = gethrtime(); uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; return ((NSEC2MSEC(scan_time_ns) > mintime && (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa)); } /* * Given a list of scan_io_t's in io_list, this issues the io's out to * disk. This consumes the io_list and frees the scan_io_t's. This is * called when emptying queues, either when we're up against the memory * limit or when we have finished scanning. Returns B_TRUE if we stopped * processing the list before we finished. Any zios that were not issued * will remain in the io_list. 
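 * Callers such as scan_io_queues_run_one() check the return value and, if we
 * were suspended, re-insert the remaining scan_io_t's into the queue before
 * exiting.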
*/ static boolean_t scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; int64_t bytes_issued = 0; boolean_t suspended = B_FALSE; while ((sio = list_head(io_list)) != NULL) { blkptr_t bp; if (scan_io_queue_check_suspend(scn)) { suspended = B_TRUE; break; } sio2bp(sio, &bp, queue->q_vd->vdev_id); bytes_issued += sio->sio_asize; scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, &sio->sio_zb, queue); (void) list_remove_head(io_list); scan_io_queues_update_zio_stats(queue, &bp); kmem_free(sio, sizeof (*sio)); } atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); return (suspended); } /* * Given a range_seg_t (extent) and a list, this function passes over a * scan queue and gathers up the appropriate ios which fit into that * scan seg (starting from lowest LBA). At the end, we remove the segment * from the q_exts_by_addr range tree. */ static boolean_t scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) { scan_io_t srch_sio, *sio, *next_sio; avl_index_t idx; uint_t num_sios = 0; int64_t bytes_issued = 0; ASSERT(rs != NULL); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); srch_sio.sio_offset = rs->rs_start; /* * The exact start of the extent might not contain any matching zios, * so if that's the case, examine the next one in the tree. */ sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { ASSERT3U(sio->sio_offset, >=, rs->rs_start); ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); bytes_issued += sio->sio_asize; num_sios++; list_insert_tail(list, sio); sio = next_sio; } /* * We limit the number of sios we process at once to 32 to avoid * biting off more than we can chew. If we didn't take everything * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. */ if (sio != NULL && sio->sio_offset < rs->rs_end) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, sio->sio_offset, rs->rs_end - sio->sio_offset); return (B_TRUE); } else { range_tree_remove(queue->q_exts_by_addr, rs->rs_start, rs->rs_end - rs->rs_start); return (B_FALSE); } } /* * This is called from the queue emptying thread and selects the next * extent from which we are to issue io's. The behavior of this function * depends on the state of the scan, the current memory consumption and * whether or not we are performing a scan shutdown. * 1) We select extents in an elevator algorithm (LBA-order) if the scan * needs to perform a checkpoint * 2) We select the largest available extent if we are up against the * memory limit. * 3) Otherwise we don't select any extents. 
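 * When NULL is returned, the queue-emptying loop in scan_io_queues_run_one()
 * stops issuing from this queue for the current txg.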
*/ static const range_seg_t * scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); /* handle tunable overrides */ if (scn->scn_checkpointing || scn->scn_clearing) { if (zfs_scan_issue_strategy == 1) { return (range_tree_first(queue->q_exts_by_addr)); } else if (zfs_scan_issue_strategy == 2) { return (avl_first(&queue->q_exts_by_size)); } } /* * During normal clearing, we want to issue our largest segments * first, keeping IO as sequential as possible, and leaving the * smaller extents for later with the hope that they might eventually * grow to larger sequential segments. However, when the scan is * checkpointing, no new extents will be added to the sorting queue, * so the way we are sorted now is as good as it will ever get. * In this case, we instead switch to issuing extents in LBA order. */ if (scn->scn_checkpointing) { return (range_tree_first(queue->q_exts_by_addr)); } else if (scn->scn_clearing) { return (avl_first(&queue->q_exts_by_size)); } else { return (NULL); } } static void scan_io_queues_run_one(void *arg) { dsl_scan_io_queue_t *queue = arg; kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; boolean_t suspended = B_FALSE; range_seg_t *rs = NULL; scan_io_t *sio = NULL; list_t sio_list; uint64_t bytes_per_leaf = zfs_scan_vdev_limit; uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); ASSERT(queue->q_scn->scn_is_sorted); list_create(&sio_list, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_list_node)); mutex_enter(q_lock); /* calculate maximum in-flight bytes for this txg (min 1MB) */ queue->q_maxinflight_bytes = MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); /* reset per-queue scan statistics for this txg */ queue->q_total_seg_size_this_txg = 0; queue->q_segs_this_txg = 0; queue->q_total_zio_size_this_txg = 0; queue->q_zios_this_txg = 0; /* loop until we have run out of time or sios */ while ((rs = (range_seg_t*)scan_io_queue_fetch_ext(queue)) != NULL) { uint64_t seg_start = 0, seg_end = 0; boolean_t more_left = B_TRUE; ASSERT(list_is_empty(&sio_list)); /* loop while we still have sios left to process in this rs */ while (more_left) { scan_io_t *first_sio, *last_sio; /* * We have selected which extent needs to be * processed next. Gather up the corresponding sios. */ more_left = scan_io_queue_gather(queue, rs, &sio_list); ASSERT(!list_is_empty(&sio_list)); first_sio = list_head(&sio_list); last_sio = list_tail(&sio_list); seg_end = last_sio->sio_offset + last_sio->sio_asize; if (seg_start == 0) seg_start = first_sio->sio_offset; /* * Issuing sios can take a long time so drop the * queue lock. The sio queue won't be updated by * other threads since we're in syncing context so * we can be sure that our trees will remain exactly * as we left them. */ mutex_exit(q_lock); suspended = scan_io_queue_issue(queue, &sio_list); mutex_enter(q_lock); if (suspended) break; } /* update statistics for debugging purposes */ scan_io_queues_update_seg_stats(queue, seg_start, seg_end); if (suspended) break; } /* If we were suspended in the middle of processing, * requeue any unfinished sios and exit. */ while ((sio = list_head(&sio_list)) != NULL) { list_remove(&sio_list, sio); scan_io_queue_insert_impl(queue, sio); } mutex_exit(q_lock); list_destroy(&sio_list); } /* * Performs an emptying run on all scan queues in the pool. This just * punches out one thread per top-level vdev, each of which processes * only that vdev's scan queue. 
We can parallelize the I/O here because * we know that each queue's io's only affect its own top-level vdev. * * This function waits for the queue runs to complete, and must be * called from dsl_scan_sync (or in general, syncing context). */ static void scan_io_queues_run(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; ASSERT(scn->scn_is_sorted); ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (scn->scn_bytes_pending == 0) return; if (scn->scn_taskq == NULL) { char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16, KM_SLEEP); int nthreads = spa->spa_root_vdev->vdev_children; /* * We need to make this taskq *always* execute as many * threads in parallel as we have top-level vdevs and no * less, otherwise strange serialization of the calls to * scan_io_queues_run_one can occur during spa_sync runs * and that significantly impacts performance. */ (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16, "dsl_scan_tq_%s", spa->spa_name); scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE); kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16); } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; mutex_enter(&vd->vdev_scan_io_queue_lock); if (vd->vdev_scan_io_queue != NULL) { VERIFY(taskq_dispatch(scn->scn_taskq, scan_io_queues_run_one, vd->vdev_scan_io_queue, TQ_SLEEP) != TASKQID_INVALID); } mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * Wait for the queues to finish issuing thir IOs for this run * before we return. There may still be IOs in flight at this * point. */ taskq_wait(scn->scn_taskq); } static boolean_t dsl_scan_async_block_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; if (zfs_recover) return (B_FALSE); if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks) return (B_TRUE); elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } static int dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_scan_t *scn = arg; if (!scn->scn_is_bptree || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { if (dsl_scan_async_block_should_pause(scn)) return (SET_ERROR(ERESTART)); } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); scn->scn_visited_this_txg++; return (0); } static void dsl_scan_update_stats(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t i; uint64_t seg_size_total = 0, zio_size_total = 0; uint64_t seg_count_total = 0, zio_count_total = 0; for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; if (queue == NULL) continue; seg_size_total += queue->q_total_seg_size_this_txg; zio_size_total += queue->q_total_zio_size_this_txg; seg_count_total += queue->q_segs_this_txg; zio_count_total += queue->q_zios_this_txg; } if (seg_count_total == 0 || zio_count_total == 0) { scn->scn_avg_seg_size_this_txg = 0; scn->scn_avg_zio_size_this_txg = 0; scn->scn_segs_this_txg = 0; scn->scn_zios_this_txg = 0; return; } scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; scn->scn_avg_zio_size_this_txg 
= zio_size_total / zio_count_total; scn->scn_segs_this_txg = seg_count_total; scn->scn_zios_this_txg = zio_count_total; } static int dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_scan_t *scn = arg; const dva_t *dva = &bp->blk_dva[0]; if (dsl_scan_async_block_should_pause(scn)) return (SET_ERROR(ERESTART)); spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), tx); scn->scn_visited_this_txg++; return (0); } boolean_t dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } return (used != 0); } static boolean_t dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_t *vd; vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops == &vdev_indirect_ops) { /* * The indirect vdev can point to multiple * vdevs. For simplicity, always create * the resilver zio_t. zio_vdev_io_start() * will bypass the child resilver i/o's if * they are on vdevs that don't have DTL's. */ return (B_TRUE); } if (DVA_GET_GANG(dva)) { /* * Gang members may be spread across multiple * vdevs, so the best estimate we have is the * scrub range, which has already been checked. * XXX -- it would be better to change our * allocation policy to ensure that all * gang members reside on the same vdev. */ return (B_TRUE); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) return (B_FALSE); /* * Check if the top-level vdev must resilver this offset. * When the offset does not intersect with a dirty leaf DTL * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. 
*/ if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) return (B_FALSE); return (B_TRUE); } static int dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) { int err = 0; dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; if (spa_suspend_async_destroy(spa)) return (0); if (zfs_free_bpobj_enabled && spa_version(spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); } if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { ASSERT(scn->scn_async_destroying); scn->scn_is_bptree = B_TRUE; scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; if (err == EIO || err == ECKSUM) { err = 0; } else if (err != 0 && err != ERESTART) { zfs_panic_recover("error %u from " "traverse_dataset_destroyed()", err); } if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { /* finished; deactivate async destroy feature */ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); ASSERT(!spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)); VERIFY0(zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, tx)); VERIFY0(bptree_free(dp->dp_meta_objset, dp->dp_bptree_obj, tx)); dp->dp_bptree_obj = 0; scn->scn_async_destroying = B_FALSE; scn->scn_async_stalled = B_FALSE; } else { /* * If we didn't make progress, mark the async * destroy as stalled, so that we will not initiate * a spa_sync() on its behalf. Note that we only * check this if we are not finished, because if the * bptree had no blocks for us to visit, we can * finish without "making progress". */ scn->scn_async_stalled = (scn->scn_visited_this_txg == 0); } } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " "free_bpobj/bptree txg %llu; err=%d", (longlong_t)scn->scn_visited_this_txg, (longlong_t) NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)tx->tx_txg, err); scn->scn_visited_this_txg = 0; /* * Write out changes to the DDT that may be required as a * result of the blocks freed. This ensures that the DDT * is clean when a scrub/resilver runs. */ ddt_sync(spa, tx->tx_txg); } if (err != 0) return (err); if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && zfs_free_leak_on_eio && (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { /* * We have finished background destroying, but there is still * some space left in the dp_free_dir. Transfer this leaked * space to the dp_leak_dir. 
*/ if (dp->dp_leak_dir == NULL) { rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, LEAK_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir)); rrw_exit(&dp->dp_config_rwlock, FTAG); } dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ)); if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)); scn->scn_is_bptree = B_FALSE; scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; err = bpobj_iterate(&dp->dp_obsolete_bpobj, dsl_scan_obsolete_block_cb, scn, tx); if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) dsl_pool_destroy_obsolete_bpobj(dp, tx); } return (0); } /* * This is the primary entry point for scans that is called from syncing * context. Scans must happen entirely during syncing context so that we * can guarantee that blocks we are currently scanning will not change out * from under us. While a scan is active, this function controls how quickly * transaction groups proceed, instead of the normal handling provided by * txg_sync_thread(). */ void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; int err = 0; state_sync_type_t sync_type = SYNC_OPTIONAL; /* * Check for scn_restart_txg before checking spa_load_state, so * that we can restart an old-style scan while the pool is being * imported (see dsl_scan_init). */ if (dsl_scan_restarting(scn, tx)) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_done(scn, B_FALSE, tx); if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", func, (longlong_t)tx->tx_txg); dsl_scan_setup_sync(&func, tx); } /* * Only process scans in sync pass 1. */ if (spa_sync_pass(dp->dp_spa) > 1) return; /* * If the spa is shutting down, then stop scanning. This will * ensure that the scan does not dirty any new data during the * shutdown phase. */ if (spa_shutting_down(spa)) return; /* * If the scan is inactive due to a stalled async destroy, try again. */ if (!scn->scn_async_stalled && !dsl_scan_active(scn)) return; /* reset scan statistics */ scn->scn_visited_this_txg = 0; scn->scn_holes_this_txg = 0; scn->scn_lt_min_this_txg = 0; scn->scn_gt_max_this_txg = 0; scn->scn_ddt_contained_this_txg = 0; scn->scn_objsets_visited_this_txg = 0; scn->scn_avg_seg_size_this_txg = 0; scn->scn_segs_this_txg = 0; scn->scn_avg_zio_size_this_txg = 0; scn->scn_zios_this_txg = 0; scn->scn_suspending = B_FALSE; scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; /* * First process the async destroys.
If we pause, don't do * any scrubbing or resilvering. This ensures that there are no * async destroys while we are scanning, so the scan code doesn't * have to worry about traversing it. It is also faster to free the * blocks than to scrub them. */ err = dsl_process_async_destroys(dp, tx); if (err != 0) return; if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; /* * Wait a few txgs after importing to begin scanning so that * we can get the pool imported quickly. */ if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS) return; /* * It is possible to switch from unsorted to sorted at any time, * but afterwards the scan will remain sorted unless reloaded from * a checkpoint after a reboot. */ if (!zfs_scan_legacy) { scn->scn_is_sorted = B_TRUE; if (scn->scn_last_checkpoint == 0) scn->scn_last_checkpoint = ddi_get_lbolt(); } /* * For sorted scans, determine what kind of work we will be doing * this txg based on our memory limitations and whether or not we * need to perform a checkpoint. */ if (scn->scn_is_sorted) { /* * If we are over our checkpoint interval, set scn_clearing * so that we can begin checkpointing immediately. The * checkpoint allows us to save a consistent bookmark * representing how much data we have scrubbed so far. * Otherwise, use the memory limit to determine if we should * scan for metadata or start issuing scrub IOs. We accumulate * metadata until we hit our hard memory limit at which point * we issue scrub IOs until we are at our soft memory limit. */ if (scn->scn_checkpointing || ddi_get_lbolt() - scn->scn_last_checkpoint > SEC_TO_TICK(zfs_scan_checkpoint_intval)) { if (!scn->scn_checkpointing) zfs_dbgmsg("begin scan checkpoint"); scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; } else { boolean_t should_clear = dsl_scan_should_clear(scn); if (should_clear && !scn->scn_clearing) { zfs_dbgmsg("begin scan clearing"); scn->scn_clearing = B_TRUE; } else if (!should_clear && scn->scn_clearing) { zfs_dbgmsg("finish scan clearing"); scn->scn_clearing = B_FALSE; } } } else { ASSERT0(scn->scn_checkpointing); ASSERT0(scn->scn_clearing); } if (!scn->scn_clearing && scn->scn_done_txg == 0) { /* Need to scan metadata for more blocks to scrub */ dsl_scan_phys_t *scnp = &scn->scn_phys; taskqid_t prefetch_tqid; uint64_t bytes_per_leaf = zfs_scan_vdev_limit; uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev); /* * Calculate the max number of in-flight bytes for pool-wide * scanning operations (minimum 1MB). Limits for the issuing * phase are done per top-level vdev and are handled separately.
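 * For example, with zfs_scan_vdev_limit at 4 MB per leaf and 12 readable
 * leaves, this allows roughly 48 MB of scan I/O in flight pool-wide.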
*/ scn->scn_maxinflight_bytes = MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); if (scnp->scn_ddt_bookmark.ddb_class <= scnp->scn_ddt_class_max) { ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); zfs_dbgmsg("doing scan sync txg %llu; " "ddt bm=%llu/%llu/%llu/%llx", (longlong_t)tx->tx_txg, (longlong_t)scnp->scn_ddt_bookmark.ddb_class, (longlong_t)scnp->scn_ddt_bookmark.ddb_type, (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); } else { zfs_dbgmsg("doing scan sync txg %llu; " "bm=%llu/%llu/%llu/%llu", (longlong_t)tx->tx_txg, (longlong_t)scnp->scn_bookmark.zb_objset, (longlong_t)scnp->scn_bookmark.zb_object, (longlong_t)scnp->scn_bookmark.zb_level, (longlong_t)scnp->scn_bookmark.zb_blkid); } scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); scn->scn_prefetch_stop = B_FALSE; prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, dsl_scan_prefetch_thread, scn, TQ_SLEEP); ASSERT(prefetch_tqid != TASKQID_INVALID); dsl_pool_config_enter(dp, FTAG); dsl_scan_visit(scn, tx); dsl_pool_config_exit(dp, FTAG); mutex_enter(&dp->dp_spa->spa_scrub_lock); scn->scn_prefetch_stop = B_TRUE; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&dp->dp_spa->spa_scrub_lock); taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; zfs_dbgmsg("scan visited %llu blocks in %llums " "(%llu os's, %llu holes, %llu < mintxg, " "%llu in ddt, %llu > maxtxg)", (longlong_t)scn->scn_visited_this_txg, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)scn->scn_objsets_visited_this_txg, (longlong_t)scn->scn_holes_this_txg, (longlong_t)scn->scn_lt_min_this_txg, (longlong_t)scn->scn_ddt_contained_this_txg, (longlong_t)scn->scn_gt_max_this_txg); if (!scn->scn_suspending) { ASSERT0(avl_numnodes(&scn->scn_queue)); scn->scn_done_txg = tx->tx_txg + 1; if (scn->scn_is_sorted) { scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; } zfs_dbgmsg("scan complete txg %llu", (longlong_t)tx->tx_txg); } } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { /* need to issue scrubbing IOs from per-vdev queues */ scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); scan_io_queues_run(scn); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; /* calculate and dprintf the current memory usage */ (void) dsl_scan_should_clear(scn); dsl_scan_update_stats(scn); zfs_dbgmsg("scrubbed %llu blocks (%llu segs) in %llums " "(avg_block_size = %llu, avg_seg_size = %llu)", (longlong_t)scn->scn_zios_this_txg, (longlong_t)scn->scn_segs_this_txg, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)scn->scn_avg_zio_size_this_txg, (longlong_t)scn->scn_avg_seg_size_this_txg); } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { /* Finished with everything. Mark the scrub as complete */ zfs_dbgmsg("scan issuing complete txg %llu", (longlong_t)tx->tx_txg); ASSERT3U(scn->scn_done_txg, !=, 0); ASSERT0(spa->spa_scrub_inflight); ASSERT0(scn->scn_bytes_pending); dsl_scan_done(scn, B_TRUE, tx); sync_type = SYNC_MANDATORY; } dsl_scan_sync_state(scn, tx, sync_type); } static void count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; /* update the spa's stats on how many bytes we have issued */ for (i = 0; i < BP_GET_NDVAS(bp); i++) { atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, DVA_GET_ASIZE(&bp->blk_dva[i])); } /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. 
*/ if (zab == NULL) return; mutex_enter(&zab->zab_lock); for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; if (t & DMU_OT_NEWTYPE) t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_2_of_2_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal == 1) zb->zb_ditto_2_of_3_samevdev++; else if (equal == 3) zb->zb_ditto_3_of_3_samevdev++; break; } } mutex_exit(&zab->zab_lock); } static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { avl_index_t idx; int64_t asize = sio->sio_asize; dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { /* block is already scheduled for reading */ atomic_add_64(&scn->scn_bytes_pending, -asize); kmem_free(sio, sizeof (*sio)); return; } avl_insert(&queue->q_sios_by_addr, sio, idx); range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); } /* * Given all the info we got from our metadata scanning process, we * construct a scan_io_t and insert it into the scan sorting queue. The * I/O must already be suitable for us to process. This is controlled * by dsl_scan_enqueue(). */ static void scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, int zio_flags, const zbookmark_phys_t *zb) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP); ASSERT0(BP_IS_GANG(bp)); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); bp2sio(bp, sio, dva_i); sio->sio_flags = zio_flags; sio->sio_zb = *zb; /* * Increment the bytes pending counter now so that we can't * get an integer underflow in case the worker processes the * zio before we get to incrementing this counter. */ atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); scan_io_queue_insert_impl(queue, sio); } /* * Given a set of I/O parameters as discovered by the metadata traversal * process, attempts to place the I/O into the sorted queues (if allowed), * or immediately executes the I/O. */ static void dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb) { spa_t *spa = dp->dp_spa; ASSERT(!BP_IS_EMBEDDED(bp)); /* * Gang blocks are hard to issue sequentially, so we just issue them * here immediately instead of queuing them. 
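 * For non-gang blocks in a sorted scan, each DVA of the block is queued
 * separately on the scan queue of its own top-level vdev below.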
*/ if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { scan_exec_io(dp, bp, zio_flags, zb, NULL); return; } for (int i = 0; i < BP_GET_NDVAS(bp); i++) { dva_t dva; vdev_t *vdev; dva = bp->blk_dva[i]; vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); ASSERT(vdev != NULL); mutex_enter(&vdev->vdev_scan_io_queue_lock); if (vdev->vdev_scan_io_queue == NULL) vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); ASSERT(dp->dp_scan != NULL); scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, i, zio_flags, zb); mutex_exit(&vdev->vdev_scan_io_queue_lock); } } static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int d; if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) { count_block(scn, dp->dp_blkstats, bp); return (0); } /* Embedded BP's have phys_birth==0, so we reject them above. */ ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; needs_io = B_TRUE; } else { ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; needs_io = B_FALSE; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; for (d = 0; d < BP_GET_NDVAS(bp); d++) { const dva_t *dva = &bp->blk_dva[d]; /* * Keep track of how much data we've examined so that * zpool(1M) status can make useful progress reports. */ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); /* if it's a resilver, this may not be in the target range */ if (!needs_io) needs_io = dsl_scan_need_resilver(spa, dva, psize, phys_birth); } if (needs_io && !zfs_no_scrub_io) { dsl_scan_enqueue(dp, bp, zio_flags, zb); } else { count_block(scn, dp->dp_blkstats, bp); } /* do not relocate this block */ return (0); } static void dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; dsl_scan_io_queue_t *queue = zio->io_private; abd_free(zio->io_abd); if (queue == NULL) { mutex_enter(&spa->spa_scrub_lock); ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } else { mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); queue->q_inflight_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&queue->q_zio_cv); mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); } if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); } } /* * Given a scanning zio's information, executes the zio. The zio need * not necessarily be only sortable, this function simply executes the * zio, no matter what it is. The optional queue argument allows the * caller to specify that they want per top level vdev IO rate limiting * instead of the legacy global limiting. 
*/ static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; size_t size = BP_GET_PSIZE(bp); abd_t *data = abd_alloc_for_io(size, B_FALSE); unsigned int scan_delay = 0; if (queue == NULL) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight += BP_GET_PSIZE(bp); mutex_exit(&spa->spa_scrub_lock); } else { kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; mutex_enter(q_lock); while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) cv_wait(&queue->q_zio_cv, q_lock); queue->q_inflight_bytes += BP_GET_PSIZE(bp); mutex_exit(q_lock); } if (zio_flags & ZIO_FLAG_RESILVER) scan_delay = zfs_resilver_delay; else { ASSERT(zio_flags & ZIO_FLAG_SCRUB); scan_delay = zfs_scrub_delay; } if (scan_delay && (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)) delay(MAX((int)scan_delay, 0)); count_block(dp->dp_scan, dp->dp_blkstats, bp); zio_nowait(zio_read(dp->dp_scan->scn_zio_root, spa, bp, data, size, dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* * This is the primary extent sorting algorithm. We balance two parameters: * 1) how many bytes of I/O are in an extent * 2) how well the extent is filled with I/O (as a fraction of its total size) * Since we allow extents to have gaps between their constituent I/Os, it's * possible to have a fairly large extent that contains the same amount of * I/O bytes than a much smaller extent, which just packs the I/O more tightly. * The algorithm sorts based on a score calculated from the extent's size, * the relative fill volume (in %) and a "fill weight" parameter that controls * the split between whether we prefer larger extents or more well populated * extents: * * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT) * * Example: * 1) assume extsz = 64 MiB * 2) assume fill = 32 MiB (extent is half full) * 3) assume fill_weight = 3 * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100 * SCORE = 32M + (50 * 3 * 32M) / 100 * SCORE = 32M + (4800M / 100) * SCORE = 32M + 48M * ^ ^ * | +--- final total relative fill-based score * +--------- final total fill-based score * SCORE = 80M * * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards * extents that are more completely filled (in a 3:2 ratio) vs just larger. * Note that as an optimization, we replace multiplication and division by * 100 with bitshifting by 7 (which effecitvely multiplies and divides by 128). */ static int ext_size_compare(const void *x, const void *y) { const range_seg_t *rsa = x, *rsb = y; uint64_t sa = rsa->rs_end - rsa->rs_start, sb = rsb->rs_end - rsb->rs_start; uint64_t score_a, score_b; score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * fill_weight * rsa->rs_fill) >> 7); score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) * fill_weight * rsb->rs_fill) >> 7); if (score_a > score_b) return (-1); if (score_a == score_b) { if (rsa->rs_start < rsb->rs_start) return (-1); if (rsa->rs_start == rsb->rs_start) return (0); return (1); } return (1); } /* * Comparator for the q_sios_by_addr tree. Sorting is simply performed * based on LBA-order (from lowest to highest). 
*/ static int io_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; if (a->sio_offset < b->sio_offset) return (-1); if (a->sio_offset == b->sio_offset) return (0); return (1); } /* IO queues are created on demand when they are needed. */ static dsl_scan_io_queue_t * scan_io_queue_create(vdev_t *vd) { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); q->q_scn = scn; q->q_vd = vd; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, io_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); return (q); } /* * Destroys a scan queue and all segments and scan_io_t's contained in it. * No further execution of I/O occurs, anything pending in the queue is * simply freed without being executed. */ void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; void *cookie = NULL; int64_t bytes_dequeued = 0; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { ASSERT(range_tree_contains(queue->q_exts_by_addr, sio->sio_offset, sio->sio_asize)); bytes_dequeued += sio->sio_asize; kmem_free(sio, sizeof (*sio)); } atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); range_tree_vacate(queue->q_exts_by_addr, NULL, queue); range_tree_destroy(queue->q_exts_by_addr); avl_destroy(&queue->q_sios_by_addr); cv_destroy(&queue->q_zio_cv); kmem_free(queue, sizeof (*queue)); } /* * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is * called on behalf of vdev_top_transfer when creating or destroying * a mirror vdev due to zpool attach/detach. */ void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) { mutex_enter(&svd->vdev_scan_io_queue_lock); mutex_enter(&tvd->vdev_scan_io_queue_lock); VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; svd->vdev_scan_io_queue = NULL; if (tvd->vdev_scan_io_queue != NULL) tvd->vdev_scan_io_queue->q_vd = tvd; mutex_exit(&tvd->vdev_scan_io_queue_lock); mutex_exit(&svd->vdev_scan_io_queue_lock); } static void scan_io_queues_destroy(dsl_scan_t *scn) { vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *tvd = rvd->vdev_child[i]; mutex_enter(&tvd->vdev_scan_io_queue_lock); if (tvd->vdev_scan_io_queue != NULL) dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue); tvd->vdev_scan_io_queue = NULL; mutex_exit(&tvd->vdev_scan_io_queue_lock); } } static void dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) { dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; vdev_t *vdev; kmutex_t *q_lock; dsl_scan_io_queue_t *queue; scan_io_t srch, *sio; avl_index_t idx; uint64_t start, size; vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i])); ASSERT(vdev != NULL); q_lock = &vdev->vdev_scan_io_queue_lock; queue = vdev->vdev_scan_io_queue; mutex_enter(q_lock); if (queue == NULL) { mutex_exit(q_lock); return; } bp2sio(bp, &srch, dva_i); start = srch.sio_offset; size = srch.sio_asize; /* * We can find the zio in two states: * 1) Cold, just sitting in the queue of zio's to be issued at * some point in the future. 
In this case, all we do is * remove the zio from the q_sios_by_addr tree, decrement * its data volume from the containing range_seg_t and * resort the q_exts_by_size tree to reflect that the * range_seg_t has lost some of its 'fill'. We don't shorten * the range_seg_t - this is usually rare enough not to be * worth the extra hassle of trying keep track of precise * extent boundaries. * 2) Hot, where the zio is currently in-flight in * dsl_scan_issue_ios. In this case, we can't simply * reach in and stop the in-flight zio's, so we instead * block the caller. Eventually, dsl_scan_issue_ios will * be done with issuing the zio's it gathered and will * signal us. */ sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); if (sio != NULL) { int64_t asize = sio->sio_asize; blkptr_t tmpbp; /* Got it while it was cold in the queue */ ASSERT3U(start, ==, sio->sio_offset); ASSERT3U(size, ==, asize); avl_remove(&queue->q_sios_by_addr, sio); ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); /* * We only update scn_bytes_pending in the cold path, * otherwise it will already have been accounted for as * part of the zio's execution. */ atomic_add_64(&scn->scn_bytes_pending, -asize); /* count the block as though we issued it */ sio2bp(sio, &tmpbp, dva_i); count_block(scn, dp->dp_blkstats, &tmpbp); kmem_free(sio, sizeof (*sio)); } mutex_exit(q_lock); } /* * Callback invoked when a zio_free() zio is executing. This needs to be * intercepted to prevent the zio from deallocating a particular portion * of disk space and it then getting reallocated and written to, while we * still have it queued up for processing. */ void dsl_scan_freed(spa_t *spa, const blkptr_t *bp) { dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(scn != NULL); if (!dsl_scan_is_running(scn)) return; for (int i = 0; i < BP_GET_NDVAS(bp); i++) dsl_scan_freed_dva(spa, bp, i); } Index: projects/clang900-import/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- projects/clang900-import/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 352536) +++ projects/clang900-import/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 352537) @@ -1,6021 +1,6021 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. 
*/ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. 
* During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 
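 *
 * Behaviour sketch (informal): on success *off is advanced to the start
 * of the next hole or data region at or after the original offset;
 * ENXIO is returned when the offset is at or beyond EOF, or when
 * dmu_offset_next() reports ESRCH (nothing of the requested kind left).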
*/ static int zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp, caller_context_t *ct) { offset_t off; offset_t ndata; dmu_object_info_t doi; int error; zfsvfs_t *zfsvfs; znode_t *zp; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ } case _FIOGDIO: case _FIOSDIO: { return (0); } case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: { #ifdef illumos if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else off = *(offset_t *)data; #endif zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); #ifdef illumos if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (SET_ERROR(EFAULT)); #else *(offset_t *)data = off; #endif return (0); } #ifdef illumos case _FIO_COUNT_FILLED: { /* * _FIO_COUNT_FILLED adds a new ioctl command which * exposes the number of filled blocks in a * ZFS object. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * Wait for all dirty blocks for this object * to get synced out to disk, and the DMU info * updated. */ error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); if (error) { ZFS_EXIT(zfsvfs); return (error); } /* * Retrieve fill count from DMU object. */ error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); if (error) { ZFS_EXIT(zfsvfs); return (error); } ndata = doi.doi_fill_count; ZFS_EXIT(zfsvfs); if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) return (SET_ERROR(EFAULT)); return (0); } #endif } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. As a result a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considred clean despite have some dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. 
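 *
 * Worked example (assuming DEV_BSIZE == 512): for off = 100 and
 * nbytes = 1000 the caller's range is [100, 1100); after shrinking,
 * off = roundup2(100, 512) = 512 and end = rounddown2(1100, 512) = 1024,
 * so only the fully covered [512, 1024) subrange (512 bytes) is cleared.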
*/ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); vm_page_grab_valid(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); vm_object_pip_wakeup(pp->object); } static vm_page_t page_wire(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t m; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); vm_page_grab_valid(&m, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY); return (m); } static void page_unwire(vm_page_t pp) { vm_page_unwire(pp, PQ_ACTIVE); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, int segflg, dmu_tx_t *tx) { vm_object_t obj; struct sf_buf *sf; caddr_t va; int off; ASSERT(segflg != UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); vm_object_pip_add(obj, 1); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, oid, start+off, nbytes, va+off, DMU_READ_PREFETCH);; zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unbusy(pp); } len -= nbytes; off = 0; } vm_object_pip_wakeup(obj); zfs_vmobject_wunlock(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. 
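 *
 * As written below, the simple path is taken instead: one vm_page_grab()
 * and, for pages that are not yet valid, one dmu_read() per PAGESIZE
 * chunk of the requested range.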
*/ static int mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(uio->uio_segflg == UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); zfs_vmobject_wlock(obj); for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp->valid == 0) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); vm_page_sunbusy(pp); if (error) { if (!vm_page_busied(pp) && !vm_page_wired(pp) && pp->valid == 0) vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_lock(pp); vm_page_activate(pp); vm_page_unlock(pp); } } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_sunbusy(pp); } if (error) break; uio->uio_resid -= bytes; uio->uio_offset += bytes; len -= bytes; } zfs_vmobject_wunlock(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); vm_object_t obj; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if (pp = page_wire(vp, start)) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); #ifdef illumos error = uiomove(va + off, bytes, UIO_READ, uio); #else error = vn_io_fault_uiomove(va + off, bytes, uio); #endif zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unwire(pp); } else { zfs_vmobject_wunlock(obj); error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); zfs_vmobject_wlock(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock(obj); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. 
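 *
 * Note: the read is processed in chunks of at most zfs_read_chunk_size
 * (1 MiB by default); each chunk is served from mapped pages via
 * mappedread()/mappedread_sf() when the vnode has cached data, and
 * straight from the DMU otherwise.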
* * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ssize_t n, nbytes; int error = 0; rl_t *rl; xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (zfsvfs->z_log && (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); #ifdef illumos if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; int blksz = zp->z_blksz; uint64_t offset = uio->uio_loffset; xuio = (xuio_t *)uio; if ((ISP2(blksz))) { nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz; } else { ASSERT(offset + n <= blksz); nblk = 1; } (void) dmu_xuio_init(xuio, nblk); if (vn_has_cached_data(vp)) { /* * For simplicity, we always allocate a full buffer * even if we only expect to read a portion of a block. */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz), 0, blksz); } } } #endif /* illumos */ while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); #ifdef __FreeBSD__ if (uio->uio_segflg == UIO_NOCOPY) error = mappedread_sf(vp, nbytes, uio); else #endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) { error = mappedread(vp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } out: zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * * RETURN: 0 on success, error code on failure. 
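 *
 * Note: the write is split into chunks of at most z_max_blksz bytes,
 * and each chunk is assigned to its own DMU transaction, which keeps
 * the intent log records small (see the chunk loop below).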
* * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our * callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } zilog = zfsvfs->z_log; /* * Validate file offset */ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Check for mandatory locks before calling zfs_range_lock() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); } #ifdef illumos /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else uio_prefaultpages(MIN(n, max_blksz), uio); #endif /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); woff = rl->r_off; if (rl->r_len == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (EFBIG); } if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? 
*/ write_eof = (woff + n > zp->z_size); end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { abuf = NULL; woff = uio->uio_loffset; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = SET_ERROR(EDQUOT); break; } if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); DTRACE_PROBE3(zfs_cp_write, int, i_iov, iovec_t *, aiov, arc_buf_t *, abuf); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (rl->r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_range_reduce(rl, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_size) vnode_pager_setsize(vp, woff + nbytes); if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). 
*/ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id, uio->uio_segflg, tx); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the excute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(vp, cr, (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); #ifdef illumos ASSERT(error == 0); #else ASSERT(error == 0 || error == EFAULT); #endif } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; if (error == 0) error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); else (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; #ifdef illumos if (!xuio && n > 0) uio_prefaultpages(MIN(n, max_blksz), uio); #endif } zfs_range_unlock(rl); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } #ifdef __FreeBSD__ /* * EFAULT means that at least one page of the source buffer was not * available. VFS will re-try remaining I/O upon this error. */ if (error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } #endif if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; objset_t *os = zp->z_zfsvfs->z_os; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_range_unlock(zgd->zgd_rl); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. 
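 *
 * (A synchronous VN_RELE() could drop the last reference and call
 * zfs_zinactive(), which may need a new transaction and would then
 * wait on the very txg this callback is holding open.)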
*/ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); kmem_free(zgd, sizeof (zgd_t)); } #ifdef DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. 
*/ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error; int ltype; ASSERT_VOP_LOCKED(dvp, __func__); #ifdef DIAGNOSTIC if ((zdp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); #endif if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (dvp->v_iflag & VI_DOOMED) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. * A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, int flags) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. 
This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *vpp = NULL; if (flags & LOOKUP_XATTR) { #ifdef TODO /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } #endif /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { vrele(*vpp); *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } /* * Check accessibility of directory. */ if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { struct componentname cn; vnode_t *zfsctl_vp; int ltype; ZFS_EXIT(zfsvfs); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, &zfsctl_vp); if (error == 0) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = cnp->cn_nameiop; cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; cn.cn_lkflags = cnp->cn_lkflags; error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); vput(zfsctl_vp); } vn_lock(dvp, ltype | LK_RETRY); return (error); } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { ZFS_EXIT(zfsvfs); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); return (error); } /* * The loop is retry the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; ZFS_ENTER(zfsvfs); if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { ZFS_EXIT(zfsvfs); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { ZFS_EXIT(zfsvfs); break; } vput(ZTOV(zp)); } out: if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed. 
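 * For CREATE and RENAME lookups of a missing last component, ENOENT is
 * turned into EJUSTRETURN so the caller can go on to create the entry;
 * SAVENAME asks the caller to keep the pathname buffer around for that
 * upcoming operation.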
*/ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; void *vsecp = NULL; int flag = 0; uint64_t txtype; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. 
*/ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve(1); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: if (error == 0) { *vpp = ZTOV(zp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked, toobig = FALSE; uint64_t txtype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; zp = VTOZ(vp); xattr_obj = 0; xzp = NULL; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. 
*/ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *vpp = NULL; if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. 
*/ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *vpp = ZTOV(zp); txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } cache_purge(dvp); error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); cache_purge(vp); out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. 
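 *
 * Illustrative layout of the offset space (see the cursor setup below):
 *
 *	offset 0	"."
 *	offset 1	".."
 *	offset 2	".zfs"	(only when zfs_show_ctldir() is true)
 *	offset > 3	serialized zap cursor positions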
*/ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; u_long *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. 
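 * The 64-bit ZAP value fetched here packs the entry's object number
 * in its low bits and its d_type in its uppermost bits (assumed
 * layout), which is why both ZFS_DIRENT_OBJ() and ZFS_DIRENT_TYPE()
 * below are derived from the same zap.za_first_integer word.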
*/ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { vrele(ZTOV(ezp)); goto skip_entry; } vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry. */ next = &eodp->ed_off; eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); /* NOTE: d_off is the offset for the *next* entry. */ next = &odp->d_off; (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; dirent_terminate(odp); odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } /* Fill the offset right after advancing the cursor. */ if (next != NULL) *next = offset; if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. 
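 * That is, if the copyout failed, report the offset the caller passed
 * in so that no directory entries appear to have been consumed.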
*/ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } ulong_t zfs_fsync_sync_cnt = 4; static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * ct - caller context * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; #ifdef illumos vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; #else vn_fsid(vp, vap); #endif vap->va_nodeid = zp->z_id; vap->va_nlink = zp->z_links; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && zp->z_links < ZFS_LINK_MAX) vap->va_nlink++; vap->va_size = zp->z_size; #ifdef illumos vap->va_rdev = vp->v_rdev; #else if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); #endif vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. 
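 *
 * Each optional attribute follows the same request/return handshake;
 * a minimal sketch, using a hypothetical XAT_FOO attribute backed by
 * a hypothetical ZFS_FOO flag bit (not real names):
 *
 *	if (XVA_ISSET_REQ(xvap, XAT_FOO)) {
 *		xoap->xoa_foo = ((zp->z_pflags & ZFS_FOO) != 0);
 *		XVA_SET_RTN(xvap, XAT_FOO);
 *	}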
*/ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. 
*/ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Note: ZFS_READONLY is handled in zfs_zaccess_common. */ /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? 
*/ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. 
*/ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). 
*/ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. 
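 *
 * The attribute values gathered below are only queued with
 * SA_ADD_BULK_ATTR() against the in-memory state; the bulk arrays are
 * written out by the sa_bulk_update() calls near the end of this
 * function, within the same assigned tx.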
*/ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. 
*/ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { zfsvfs_t *zfsvfs; struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; VOP_UNLOCK(tdvp, 0); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp, 0); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; sdzp = VTOZ(sdvp); error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK(tdvp, 0); goto relock; } tdzp = VTOZ(tdvp); /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); ZFS_ENTER(zfsvfs); /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); error = SET_ERROR(EIO); goto out; } /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. 
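 *
 * A minimal sketch of the try-lock and back-off pattern this function
 * applies to each vnode (pseudo-code, condensed from the surrounding
 * logic):
 *
 *	error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
 *	if (error == EBUSY) {
 *		drop the locks currently held;
 *		vn_lock(vp, LK_EXCLUSIVE);	wait for it once
 *		VOP_UNLOCK(vp, 0);		then release it
 *		goto relock;			and start over
 *	}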
*/ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } svp = ZTOV(szp); /* * Re-resolve tvp, if it disappeared we just carry on. */ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); if (error != 0) { ZFS_EXIT(zfsvfs); VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); vrele(svp); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } if (tzp != NULL) tvp = ZTOV(tzp); else tvp = NULL; /* * At present the vnode locks must be acquired before z_teardown_lock, * although it would be more logical to use the opposite order. */ ZFS_EXIT(zfsvfs); /* * Now try acquire locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK(nvp, 0); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(*svpp, 0); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. 
* * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { zfsvfs_t *zfsvfs; znode_t *sdzp, *tdzp, *szp, *tzp; zilog_t *zilog = NULL; dmu_tx_t *tx; char *snm = scnp->cn_nameptr; char *tnm = tcnp->cn_nameptr; int error = 0; /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; zilog = zfsvfs->z_log; /* * After we re-enter ZFS_ENTER() we will have to revalidate all * znodes involved. */ ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto unlockout; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto unlockout; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto unlockout; } /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); goto unlockout; } szp = VTOZ(*svpp); tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { error = SET_ERROR(EIO); goto unlockout; } /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto unlockout; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto unlockout; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_check(szp, sdzp, tdzp)) goto unlockout; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto unlockout; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto unlockout; } } } vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. 
*/ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto unlockout; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL), ==, 0); } } if (error == 0) { cache_purge(*svpp); if (*tvpp != NULL) cache_purge(*tvpp); cache_purge_negative(tdvp); } } dmu_tx_commit(tx); unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ ZFS_EXIT(zfsvfs); VOP_UNLOCK(*svpp, 0); VOP_UNLOCK(sdvp, 0); out: /* original two vnodes are locked */ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (*tvpp != NULL) VOP_UNLOCK(*tvpp, 0); if (tdvp != *tvpp) VOP_UNLOCK(tdvp, 0); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; int flags = 0; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. 
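 * The ZNEW flag in the lookup below asks for a "must not yet exist"
 * check, so the call is expected to fail if an entry of that name is
 * already present, while the ZEXISTS lookups in the remove and rename
 * paths require the entry to be present (assumed semantics of the two
 * flags).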
*/ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dzp, name, zp, tx, ZNEW); zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ static int zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; uint64_t parent; uid_t owner; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. 
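 * In practice this means any attempt to hard-link a directory fails
 * with EPERM, even for privileged callers, as the check below shows.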
*/ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } szp = VTOZ(svp); ZFS_VERIFY_ZP(szp); if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(svp, ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. 
*/ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } rw_exit(&zfsvfs->z_teardown_inactive_lock); } CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); /*ARGSUSED*/ static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; #ifdef illumos if (fidp->fid_len < size) { fidp->fid_len = size; ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOSPC)); } #else fidp->fid_len = size; #endif zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp, *xzp; zfsvfs_t *zfsvfs; int error; switch (cmd) { case _PC_LINK_MAX: *valp = MIN(LONG_MAX, ZFS_LINK_MAX); return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); #ifdef illumos case _PC_XATTR_EXISTS: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = 0; error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR | ZEXISTS | ZSHARED); if (error == 0) { if (!zfs_dirempty(xzp)) *valp = 1; vrele(ZTOV(xzp)); } else if (error == ENOENT) { /* * If there aren't extended attributes, it's the * same as having zero of them. 
*/ error = 0; } ZFS_EXIT(zfsvfs); return (error); case _PC_SATTR_ENABLED: case _PC_SATTR_EXISTS: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_type == VREG || vp->v_type == VDIR); return (0); case _PC_ACCESS_FILTERING: *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && vp->v_type == VDIR; return (0); case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); #endif /* illumos */ case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); #ifdef illumos case _PC_TIMESTAMP_RESOLUTION: /* nanosecond timestamp resolution */ *valp = 1L; return (0); #endif case _PC_ACL_EXTENDED: *valp = 0; return (0); case _PC_ACL_NFS4: *valp = 1; return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } /*ARGSUSED*/ static int zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zp->z_zfsvfs->z_os; rl_t *rl; vm_object_t object; off_t start, end, obj_size; uint_t blksz; int pgsin_b, pgsin_a; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); /* * Lock a range covering all required and optional pages. * Note that we need to handle the case of the block size growing. */ for (;;) { blksz = zp->z_blksz; rl = zfs_range_lock(zp, rounddown(start, blksz), roundup(end, blksz) - rounddown(start, blksz), RL_READER); if (blksz == zp->z_blksz) break; zfs_range_unlock(rl); } object = ma[0]->object; zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (zfs_vm_pagerret_bad); } pgsin_b = 0; if (rbehind != NULL) { pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); pgsin_b = MIN(*rbehind, pgsin_b); } pgsin_a = 0; if (rahead != NULL) { pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); if (end + IDX_TO_OFF(pgsin_a) >= obj_size) pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); pgsin_a = MIN(*rahead, pgsin_a); } /* * NB: we need to pass the exact byte size of the data that we expect * to read after accounting for the file size. This is required because * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. 
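 *
 * Worked example (illustrative numbers only): with PAGE_SIZE = 4096,
 * if the last requested page covers bytes [61440, 65536) (so
 * end == 65536) and the file is 63000 bytes long (obj_size == 63000),
 * the call below passes MIN(65536, 63000) - (65536 - 4096) = 1560,
 * i.e. only the bytes of the final page that actually exist in the
 * file.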
*/ error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); if (error != 0) return (zfs_vm_pagerret_error); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); if (rbehind != NULL) *rbehind = pgsin_b; if (rahead != NULL) *rahead = pgsin_a; return (zfs_vm_pagerret_ok); } static int zfs_freebsd_getpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; } */ *ap; { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; rl_t *rl; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); object = vp->v_object; pcount = btoc(len); ncount = pcount; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); if (ncount == 0) goto out; if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; va = zfs_map_page(ma[i], &sf); dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); zfs_unmap_page(sf); } } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(err); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { rtvals[i] = zfs_vm_pagerret_ok; vm_page_undirty(ma[i]); } zfs_vmobject_wunlock(object); VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, ncount); } dmu_tx_commit(tx); out: zfs_range_unlock(rl); if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (rtvals[0]); } int zfs_freebsd_putpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; } */ *ap; { return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals)); } static int zfs_freebsd_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } static int zfs_freebsd_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL); if (error == 0) vnode_create_vobject(vp, zp->z_size, ap->a_td); return (error); } static int zfs_freebsd_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL)); } static int zfs_freebsd_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; } */ *ap; { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL, NULL)); } static int ioflags(int ioflags) { int flags = 0; if (ioflags & IO_APPEND) flags |= FAPPEND; if (ioflags & IO_NDELAY) flags |= FNONBLOCK; if (ioflags & IO_SYNC) flags |= (FSYNC | FDSYNC | FRSYNC); return (flags); } static int zfs_freebsd_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); } static int zfs_freebsd_access(ap) struct vop_access_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); accmode_t accmode; int error = 0; /* * ZFS itself only knowns about VREAD, VWRITE, VEXEC and 
VAPPEND, */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred, NULL); } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. */ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } static int zfs_freebsd_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT(cnp->cn_namelen < sizeof(nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, cnp->cn_thread, 0)); } static int zfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; if (zfsvfs->z_use_namecache) return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap)); } static int zfs_freebsd_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; int error, mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode, ap->a_vpp, cnp->cn_cred, cnp->cn_thread); if (zfsvfs->z_use_namecache && error == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (error); } static int zfs_freebsd_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } static int zfs_freebsd_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { vattr_t *vap = ap->a_vap; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp, ap->a_cnp->cn_cred)); } static int zfs_freebsd_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } static int zfs_freebsd_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } static int zfs_freebsd_fsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { vop_stdfsync(ap); return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); } static int zfs_freebsd_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vattr_t *vap = 
ap->a_vap; xvattr_t xvap; u_long fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; return (0); } static int zfs_freebsd_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; u_long fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the PR_ALLOW_CHFLAGS permission bit is set; * otherwise, they behave like unprivileged processes. */ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on objects they * have VADMIN rights for. 
		 */
		if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
			return (error);
		if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY |
		    ZFS_NOUNLINK)) {
			return (EPERM);
		}
		if (fflags &
		    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
			return (EPERM);
		}
	}

#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
		XVA_SET_REQ(&xvap, (xflag));				\
		(xfield) = ((fflags & (fflag)) != 0);			\
	}								\
} while (0)
		/* Convert chflags into ZFS-type flags. */
		/* XXX: what about SF_SETTABLE?. */
		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
		    xvap.xva_xoptattrs.xoa_immutable);
		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
		    xvap.xva_xoptattrs.xoa_appendonly);
		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
		    xvap.xva_xoptattrs.xoa_nounlink);
		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
		    xvap.xva_xoptattrs.xoa_archive);
		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
		    xvap.xva_xoptattrs.xoa_nodump);
		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
		    xvap.xva_xoptattrs.xoa_readonly);
		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
		    xvap.xva_xoptattrs.xoa_system);
		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
		    xvap.xva_xoptattrs.xoa_hidden);
		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
-		    xvap.xva_xoptattrs.xoa_hidden);
+		    xvap.xva_xoptattrs.xoa_reparse);
		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
		    xvap.xva_xoptattrs.xoa_offline);
		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
		    xvap.xva_xoptattrs.xoa_sparse);
#undef	FLAG_CHANGE
	}
	if (vap->va_birthtime.tv_sec != VNOVAL) {
		xvap.xva_vattr.va_mask |= AT_XVATTR;
		XVA_SET_REQ(&xvap, XAT_CREATETIME);
	}
	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
}

static int
zfs_freebsd_rename(ap)
	struct vop_rename_args  /* {
		struct vnode *a_fdvp;
		struct vnode *a_fvp;
		struct componentname *a_fcnp;
		struct vnode *a_tdvp;
		struct vnode *a_tvp;
		struct componentname *a_tcnp;
	} */ *ap;
{
	vnode_t *fdvp = ap->a_fdvp;
	vnode_t *fvp = ap->a_fvp;
	vnode_t *tdvp = ap->a_tdvp;
	vnode_t *tvp = ap->a_tvp;
	int error;

	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));

	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
	    ap->a_tcnp, ap->a_fcnp->cn_cred);

	vrele(fdvp);
	vrele(fvp);
	vrele(tdvp);
	if (tvp != NULL)
		vrele(tvp);

	return (error);
}

static int
zfs_freebsd_symlink(ap)
	struct vop_symlink_args  /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
		char *a_target;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;

	ASSERT(cnp->cn_flags & SAVENAME);

	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode.
*/ vattr_init_mask(vap); return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap, __DECONST(char *, ap->a_target), cnp->cn_cred, cnp->cn_thread)); } static int zfs_freebsd_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); } static int zfs_freebsd_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0)); } static int zfs_freebsd_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; zfs_inactive(vp, ap->a_td->td_ucred, NULL); return (0); } static int zfs_freebsd_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp != NULL); /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); rw_exit(&zfsvfs->z_teardown_inactive_lock); vp->v_data = NULL; return (0); } static int zfs_freebsd_fid(ap) struct vop_fid_args /* { struct vnode *a_vp; struct fid *a_fid; } */ *ap; { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } static int zfs_freebsd_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; register_t *a_retval; } */ *ap; { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) { *ap->a_retval = val; return (error); } if (error != EOPNOTSUPP) return (error); switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { *ap->a_retval = PIPE_BUF; return (0); } return (EINVAL); default: return (vop_stdpathconf(ap)); } } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE PREFIX * system freebsd:system: * user (none, can be used to access ZFS fsattr(5) attributes * created on Solaris) */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size) { const char *namespace, *prefix, *suffix; /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (EINVAL); /* We don't allow attribute names that start with "freebsd:" string. */ if (strncmp(name, "freebsd:", 8) == 0) return (EINVAL); bzero(attrname, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: #if 0 prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_USER_STRING; suffix = ":"; #else /* * This is the default namespace by which we can access all * attributes created on Solaris. */ prefix = namespace = suffix = ""; #endif break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (EINVAL); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (ENAMETOOLONG); } return (0); } /* * Vnode operating to retrieve a named extended attribute. 
*/ static int zfs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); if (error == ENOENT) error = ENOATTR; return (error); } if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to remove a named attribute. */ int zfs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp, td); error = namei(&nd); vp = nd.ni_vp; if (error != 0) { ZFS_EXIT(zfsvfs); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ENOENT) error = ENOATTR; return (error); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to set a named attribute. 
*/ static int zfs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof(attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR | CREATE_XATTR_DIR); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FFLAGS(O_WRONLY | O_CREAT); NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp, 0); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrprefix[16]; u_char dirbuf[sizeof(struct dirent)]; struct dirent *dp; struct iovec aiov; struct uio auio, *uio = ap->a_uio; size_t *sizep = ap->a_size; size_t plen; vnode_t *xvp = NULL, *vp; int done, error, eof, pos; error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof(attrprefix)); if (error != 0) return (error); plen = strlen(attrprefix); ZFS_ENTER(zfsvfs); if (sizep != NULL) *sizep = 0; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR); if (error != 0) { ZFS_EXIT(zfsvfs); /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp, td); error = namei(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; do { u_char nlen; aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof(dirbuf); auio.uio_resid = sizeof(dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); done = sizeof(dirbuf) - auio.uio_resid; if (error != 0) break; for (pos = 0; pos < done;) { dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. 
*/ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; nlen = dp->d_namlen - plen; if (sizep != NULL) *sizep += 1 + nlen; else if (uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, uio->uio_rw, uio); if (error == 0) { error = uiomove(dp->d_name + plen, nlen, uio->uio_rw, uio); } if (error != 0) break; } } } while (!eof && error == 0); vput(vp); ZFS_EXIT(zfsvfs); return (error); } int zfs_freebsd_getacl(ap) struct vop_getacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } int zfs_freebsd_setacl(ap) struct vop_setacl_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp == NULL) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. */ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL); kmem_free(aaclp, aclbsize); return (error); } int zfs_freebsd_aclcheck(ap) struct vop_aclcheck_args /* { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; } */ *ap; { return (EOPNOTSUPP); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp;; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); enum vgetstate vs; int ltype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode. 
*/ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { char name[MAXNAMLEN + 1]; znode_t *dzp; size_t len; error = zfs_znode_parent_and_name(zp, &dzp, name); if (error == 0) { len = strlen(name); if (*ap->a_buflen < len) error = SET_ERROR(ENOMEM); } if (error == 0) { *ap->a_buflen -= len; bcopy(name, ap->a_buf + *ap->a_buflen, len); *ap->a_vpp = ZTOV(dzp); } ZFS_EXIT(zfsvfs); return (error); } ZFS_EXIT(zfsvfs); covered_vp = vp->v_mount->mnt_vnodecovered; vs = vget_prep(covered_vp); ltype = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); error = vget_finish(covered_vp, LK_SHARED, vs); if (error == 0) { error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, ap->a_buf, ap->a_buflen); vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if ((vp->v_iflag & VI_DOOMED) != 0) error = SET_ERROR(ENOENT); return (error); } #ifdef DIAGNOSTIC static int zfs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { vnode_t *vp; znode_t *zp; int err; err = vop_stdlock(ap); if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { vp = ap->a_vp; zp = vp->v_data; if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 && zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); } return (err); } #endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_access = zfs_freebsd_access, .vop_allocate = VOP_EINVAL, .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_lookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, #ifdef DIAGNOSTIC .vop_lock1 = zfs_lock, #endif }; struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, }; /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = 
zfs_freebsd_pathconf, }; Index: projects/clang900-import/sys/cddl/contrib/opensolaris =================================================================== --- projects/clang900-import/sys/cddl/contrib/opensolaris (revision 352536) +++ projects/clang900-import/sys/cddl/contrib/opensolaris (revision 352537) Property changes on: projects/clang900-import/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/cddl/contrib/opensolaris:r352308-352536 Index: projects/clang900-import/sys/conf/files.mips =================================================================== --- projects/clang900-import/sys/conf/files.mips (revision 352536) +++ projects/clang900-import/sys/conf/files.mips (revision 352537) @@ -1,115 +1,117 @@ # This file tells config what files go into building a kernel, # files marked standard are always included. # # $FreeBSD$ # # Arch dependent files mips/mips/autoconf.c standard mips/mips/bus_space_generic.c standard mips/mips/busdma_machdep.c standard mips/mips/cache.c standard mips/mips/cache_mipsNN.c standard mips/mips/cpu.c standard mips/mips/db_disasm.c optional ddb mips/mips/db_interface.c optional ddb mips/mips/db_trace.c optional ddb mips/mips/dump_machdep.c standard mips/mips/elf_machdep.c standard mips/mips/exception.S standard mips/mips/fp.S standard mips/mips/freebsd32_machdep.c optional compat_freebsd32 mips/mips/gdb_machdep.c standard mips/mips/in_cksum.c optional inet mips/mips/libkern_machdep.c standard mips/mips/locore.S standard no-obj mips/mips/machdep.c standard mips/mips/mem.c optional mem mips/mips/minidump_machdep.c standard mips/mips/mp_machdep.c optional smp mips/mips/mpboot.S optional smp mips/mips/nexus.c standard mips/mips/ofw_machdep.c optional fdt mips/mips/pm_machdep.c standard mips/mips/pmap.c standard mips/mips/ptrace_machdep.c standard mips/mips/sc_machdep.c standard mips/mips/stack_machdep.c optional ddb | stack mips/mips/stdatomic.c standard \ compile-with "${NORMAL_C:N-Wmissing-prototypes}" mips/mips/support.S standard mips/mips/bcopy.S standard mips/mips/swtch.S standard mips/mips/sys_machdep.c standard mips/mips/tlb.c standard mips/mips/trap.c standard mips/mips/uio_machdep.c standard mips/mips/uma_machdep.c standard mips/mips/vm_machdep.c standard # misc opt-in bits kern/kern_clocksource.c standard kern/link_elf_obj.c standard kern/subr_busdma_bufalloc.c standard kern/subr_dummy_vdso_tc.c standard kern/subr_sfbuf.c optional mips | mipsel | mipsn32 kern/subr_sfbuf.c optional mipshf | mipselhf # gcc/clang runtime libkern/ffsl.c standard libkern/ffsll.c standard libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard libkern/cmpdi2.c optional mips | mipshf | mipsel | mipselhf libkern/ucmpdi2.c optional mips | mipshf | mipsel | mipselhf libkern/ashldi3.c standard libkern/ashrdi3.c standard libkern/memcmp.c standard # cfe support dev/cfe/cfe_api.c optional cfe dev/cfe/cfe_console.c optional cfe_console dev/cfe/cfe_env.c optional cfe_env # syscons support dev/fb/fb.c optional sc dev/syscons/scgfbrndr.c optional sc mips/mips/sc_machdep.c optional sc # FDT support dev/uart/uart_cpu_fdt.c optional uart fdt # crypto support -- use generic crypto/blowfish/bf_enc.c optional crypto | ipsec | \ ipsec_support crypto/des/des_enc.c optional crypto | ipsec | \ ipsec_support | netsmb # AP common nvram interface MIPS specific, but maybe should be more generic dev/nvram2env/nvram2env_mips.c optional nvram2env dev/nvram2env/nvram2env.c optional 
nvram2env
# hwpmc support
-dev/hwpmc/hwpmc_mips.c		optional	hwpmc
+dev/hwpmc/hwpmc_beri.c		optional	hwpmc_beri
+dev/hwpmc/hwpmc_mips.c		optional	hwpmc_mips24k | \
+					hwpmc_mips74k
dev/hwpmc/hwpmc_mips24k.c	optional	hwpmc_mips24k
dev/hwpmc/hwpmc_mips74k.c	optional	hwpmc_mips74k
# ofw support
dev/ofw/ofwpci.c		optional	fdt pci
# INTRNG support code
kern/msi_if.m			optional	intrng
kern/pic_if.m			optional	intrng
kern/subr_intr.c		optional	intrng
# INTRNG compatible MIPS32 interrupt controller
mips/mips/mips_pic.c		optional	intrng
# DTrace
cddl/compat/opensolaris/kern/opensolaris_atomic.c	optional zfs | dtrace compile-with "${CDDL_C}"
cddl/dev/dtrace/mips/dtrace_asm.S	optional dtrace compile-with "${DTRACE_S}"
cddl/dev/dtrace/mips/dtrace_subr.c	optional dtrace compile-with "${DTRACE_C}"
cddl/dev/fbt/mips/fbt_isa.c	optional dtrace_fbt | dtraceall compile-with "${FBT_C}"
# Zstd
contrib/zstd/lib/freebsd/zstd_kfreebsd.c	optional zstdio compile-with ${ZSTD_C}
Index: projects/clang900-import/sys/dev/hwpmc/hwpmc_beri.c
===================================================================
--- projects/clang900-import/sys/dev/hwpmc/hwpmc_beri.c	(nonexistent)
+++ projects/clang900-import/sys/dev/hwpmc/hwpmc_beri.c	(revision 352537)
@@ -0,0 +1,540 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2019 Ruslan Bukin
+ *
+ * This software was developed by SRI International and the University of
+ * Cambridge Computer Laboratory (Department of Computer Science and
+ * Technology) under DARPA contract HR0011-18-C-0016 ("ECATS"), as part of the
+ * DARPA SSITH research programme.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_hwpmc_hooks.h" + +#include +#include + +#include + +#define BERI_NCOUNTERS 56 +#define BERI_PMC_CAPS (PMC_CAP_USER | PMC_CAP_SYSTEM | \ + PMC_CAP_READ | PMC_CAP_WRITE ) + +struct beri_event_code_map { + uint32_t pe_ev; /* enum value */ + uint64_t (*get_func)(void); +}; + +const struct beri_event_code_map beri_event_codes[BERI_NCOUNTERS] = { + { PMC_EV_BERI_CYCLE, + statcounters_get_cycle_count }, + { PMC_EV_BERI_INST, + statcounters_get_inst_count }, + { PMC_EV_BERI_INST_USER, + statcounters_get_inst_user_count }, + { PMC_EV_BERI_INST_KERNEL, + statcounters_get_inst_kernel_count }, + { PMC_EV_BERI_IMPRECISE_SETBOUNDS, + statcounters_get_imprecise_setbounds_count }, + { PMC_EV_BERI_UNREPRESENTABLE_CAPS, + statcounters_get_unrepresentable_caps_count }, + { PMC_EV_BERI_ITLB_MISS, + statcounters_get_itlb_miss_count }, + { PMC_EV_BERI_DTLB_MISS, + statcounters_get_dtlb_miss_count }, + { PMC_EV_BERI_ICACHE_WRITE_HIT, + statcounters_get_icache_write_hit_count }, + { PMC_EV_BERI_ICACHE_WRITE_MISS, + statcounters_get_icache_write_miss_count }, + { PMC_EV_BERI_ICACHE_READ_HIT, + statcounters_get_icache_read_hit_count }, + { PMC_EV_BERI_ICACHE_READ_MISS, + statcounters_get_icache_read_miss_count }, + { PMC_EV_BERI_ICACHE_EVICT, + statcounters_get_icache_evict_count }, + { PMC_EV_BERI_DCACHE_WRITE_HIT, + statcounters_get_dcache_write_hit_count }, + { PMC_EV_BERI_DCACHE_WRITE_MISS, + statcounters_get_dcache_write_miss_count }, + { PMC_EV_BERI_DCACHE_READ_HIT, + statcounters_get_dcache_read_hit_count }, + { PMC_EV_BERI_DCACHE_READ_MISS, + statcounters_get_dcache_read_miss_count }, + { PMC_EV_BERI_DCACHE_EVICT, + statcounters_get_dcache_evict_count }, + { PMC_EV_BERI_DCACHE_SET_TAG_WRITE, + statcounters_get_dcache_set_tag_write_count }, + { PMC_EV_BERI_DCACHE_SET_TAG_READ, + statcounters_get_dcache_set_tag_read_count }, + { PMC_EV_BERI_L2CACHE_WRITE_HIT, + statcounters_get_l2cache_write_hit_count }, + { PMC_EV_BERI_L2CACHE_WRITE_MISS, + statcounters_get_l2cache_write_miss_count }, + { PMC_EV_BERI_L2CACHE_READ_HIT, + statcounters_get_l2cache_read_hit_count }, + { PMC_EV_BERI_L2CACHE_READ_MISS, + statcounters_get_l2cache_read_miss_count }, + { PMC_EV_BERI_L2CACHE_EVICT, + statcounters_get_l2cache_evict_count }, + { PMC_EV_BERI_L2CACHE_SET_TAG_WRITE, + statcounters_get_l2cache_set_tag_write_count }, + { PMC_EV_BERI_L2CACHE_SET_TAG_READ, + statcounters_get_l2cache_set_tag_read_count }, + { PMC_EV_BERI_MEM_BYTE_READ, + statcounters_get_mem_byte_read_count }, + { PMC_EV_BERI_MEM_BYTE_WRITE, + statcounters_get_mem_byte_write_count }, + { PMC_EV_BERI_MEM_HWORD_READ, + statcounters_get_mem_hword_read_count }, + { PMC_EV_BERI_MEM_HWORD_WRITE, + statcounters_get_mem_hword_write_count }, + { PMC_EV_BERI_MEM_WORD_READ, + statcounters_get_mem_word_read_count }, + { PMC_EV_BERI_MEM_WORD_WRITE, + statcounters_get_mem_word_write_count }, + { PMC_EV_BERI_MEM_DWORD_READ, + statcounters_get_mem_dword_read_count }, + { PMC_EV_BERI_MEM_DWORD_WRITE, + statcounters_get_mem_dword_write_count }, + { PMC_EV_BERI_MEM_CAP_READ, + statcounters_get_mem_cap_read_count }, + { PMC_EV_BERI_MEM_CAP_WRITE, + statcounters_get_mem_cap_write_count }, + { PMC_EV_BERI_MEM_CAP_READ_TAG_SET, + statcounters_get_mem_cap_read_tag_set_count }, + { PMC_EV_BERI_MEM_CAP_WRITE_TAG_SET, + statcounters_get_mem_cap_write_tag_set_count }, + { PMC_EV_BERI_TAGCACHE_WRITE_HIT, + statcounters_get_tagcache_write_hit_count }, + { PMC_EV_BERI_TAGCACHE_WRITE_MISS, + statcounters_get_tagcache_write_miss_count 
}, + { PMC_EV_BERI_TAGCACHE_READ_HIT, + statcounters_get_tagcache_read_hit_count }, + { PMC_EV_BERI_TAGCACHE_READ_MISS, + statcounters_get_tagcache_read_miss_count }, + { PMC_EV_BERI_TAGCACHE_EVICT, + statcounters_get_tagcache_evict_count }, + { PMC_EV_BERI_L2CACHEMASTER_READ_REQ, + statcounters_get_l2cachemaster_read_req_count }, + { PMC_EV_BERI_L2CACHEMASTER_WRITE_REQ, + statcounters_get_l2cachemaster_write_req_count }, + { PMC_EV_BERI_L2CACHEMASTER_WRITE_REQ_FLIT, + statcounters_get_l2cachemaster_write_req_flit_count }, + { PMC_EV_BERI_L2CACHEMASTER_READ_RSP, + statcounters_get_l2cachemaster_read_rsp_count }, + { PMC_EV_BERI_L2CACHEMASTER_READ_RSP_FLIT, + statcounters_get_l2cachemaster_read_rsp_flit_count }, + { PMC_EV_BERI_L2CACHEMASTER_WRITE_RSP, + statcounters_get_l2cachemaster_write_rsp_count }, + { PMC_EV_BERI_TAGCACHEMASTER_READ_REQ, + statcounters_get_tagcachemaster_read_req_count }, + { PMC_EV_BERI_TAGCACHEMASTER_WRITE_REQ, + statcounters_get_tagcachemaster_write_req_count }, + { PMC_EV_BERI_TAGCACHEMASTER_WRITE_REQ_FLIT, + statcounters_get_tagcachemaster_write_req_flit_count }, + { PMC_EV_BERI_TAGCACHEMASTER_READ_RSP, + statcounters_get_tagcachemaster_read_rsp_count }, + { PMC_EV_BERI_TAGCACHEMASTER_READ_RSP_FLIT, + statcounters_get_tagcachemaster_read_rsp_flit_count }, + { PMC_EV_BERI_TAGCACHEMASTER_WRITE_RSP, + statcounters_get_tagcachemaster_write_rsp_count }, +}; + +struct mips_pmc_spec beri_pmc_spec = { + .ps_cpuclass = PMC_CLASS_BERI, + .ps_cputype = PMC_CPU_MIPS_BERI, + .ps_capabilities = BERI_PMC_CAPS, + .ps_counter_width = 64 +}; + +/* + * Per-processor information. + */ +struct beri_cpu { + struct pmc_hw *pc_beripmcs; + uint64_t start_values[BERI_NCOUNTERS]; + uint64_t stop_values[BERI_NCOUNTERS]; + uint64_t saved_values[BERI_NCOUNTERS]; +}; + +int beri_npmcs; +static struct beri_cpu **beri_pcpu; + +static int +beri_allocate_pmc(int cpu, int ri, struct pmc *pm, + const struct pmc_op_pmcallocate *a) +{ + uint32_t config; + int i; + + KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), + ("[beri,%d] illegal CPU value %d", __LINE__, cpu)); + KASSERT(ri >= 0 && ri < beri_npmcs, + ("[beri,%d] illegal row index %d", __LINE__, ri)); + + if (a->pm_class != beri_pmc_spec.ps_cpuclass) + return (EINVAL); + + for (i = 0; i < BERI_NCOUNTERS; i++) { + if (beri_event_codes[i].pe_ev == a->pm_ev) { + config = i; + break; + } + } + + if (i == BERI_NCOUNTERS) + return (EINVAL); + + pm->pm_md.pm_mips_evsel = config; + + PMCDBG2(MDP,ALL,2,"beri-allocate ri=%d -> config=0x%x", ri, config); + + return (0); +} + +static int +beri_read_pmc(int cpu, int ri, pmc_value_t *v) +{ + uint32_t config; + struct pmc *pm; + pmc_value_t new; + pmc_value_t start_val; + pmc_value_t stop_val; + pmc_value_t saved_val; + pmc_value_t result; + + KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), + ("[beri,%d] illegal CPU value %d", __LINE__, cpu)); + KASSERT(ri >= 0 && ri < beri_npmcs, + ("[beri,%d] illegal row index %d", __LINE__, ri)); + + pm = beri_pcpu[cpu]->pc_beripmcs[ri].phw_pmc; + config = pm->pm_md.pm_mips_evsel; + + start_val = beri_pcpu[cpu]->start_values[config]; + if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { + stop_val = beri_event_codes[config].get_func(); + } else + stop_val = beri_pcpu[cpu]->stop_values[config]; + + if (start_val <= stop_val) + result = stop_val - start_val; + else { + if (config == 0) /* CYCLE counter is 48 bit */ + result = 0x00ffffffffffffffUL; + else + result = 0xffffffffffffffffUL; + result -= start_val; + result += stop_val; + } + + saved_val = beri_pcpu[cpu]->saved_values[config]; + + *v = 
result + saved_val; + + return (0); +} + +static int +beri_write_pmc(int cpu, int ri, pmc_value_t v) +{ + struct pmc *pm; + uint32_t config; + + KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), + ("[beri,%d] illegal CPU value %d", __LINE__, cpu)); + KASSERT(ri >= 0 && ri < beri_npmcs, + ("[beri,%d] illegal row-index %d", __LINE__, ri)); + + pm = beri_pcpu[cpu]->pc_beripmcs[ri].phw_pmc; + config = pm->pm_md.pm_mips_evsel; + + if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) + v = (1UL << (beri_pmc_spec.ps_counter_width - 1)) - v; + + PMCDBG3(MDP,WRI,1,"beri-write cpu=%d ri=%d v=%jx", cpu, ri, v); + + if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) + beri_pcpu[cpu]->saved_values[config] = 0; + else + beri_pcpu[cpu]->saved_values[config] = v; + + return (0); +} + +static int +beri_config_pmc(int cpu, int ri, struct pmc *pm) +{ + struct pmc_hw *phw; + + PMCDBG3(MDP,CFG,1, "cpu=%d ri=%d pm=%p", cpu, ri, pm); + + KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), + ("[beri,%d] illegal CPU value %d", __LINE__, cpu)); + KASSERT(ri >= 0 && ri < beri_npmcs, + ("[beri,%d] illegal row-index %d", __LINE__, ri)); + + phw = &beri_pcpu[cpu]->pc_beripmcs[ri]; + + KASSERT(pm == NULL || phw->phw_pmc == NULL, + ("[beri,%d] pm=%p phw->pm=%p hwpmc not unconfigured", + __LINE__, pm, phw->phw_pmc)); + + phw->phw_pmc = pm; + + return (0); +} + +static int +beri_start_pmc(int cpu, int ri) +{ + uint32_t config; + struct pmc *pm; + struct pmc_hw *phw; + pmc_value_t v; + + phw = &beri_pcpu[cpu]->pc_beripmcs[ri]; + pm = phw->phw_pmc; + config = pm->pm_md.pm_mips_evsel; + + v = beri_event_codes[config].get_func(); + beri_pcpu[cpu]->start_values[config] = v; + + return (0); +} + +static int +beri_stop_pmc(int cpu, int ri) +{ + uint32_t config; + struct pmc *pm; + struct pmc_hw *phw; + pmc_value_t v; + + phw = &beri_pcpu[cpu]->pc_beripmcs[ri]; + pm = phw->phw_pmc; + config = pm->pm_md.pm_mips_evsel; + + v = beri_event_codes[config].get_func(); + beri_pcpu[cpu]->stop_values[config] = v; + + return (0); +} + +static int +beri_release_pmc(int cpu, int ri, struct pmc *pmc) +{ + struct pmc_hw *phw; + + KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), + ("[beri,%d] illegal CPU value %d", __LINE__, cpu)); + KASSERT(ri >= 0 && ri < beri_npmcs, + ("[beri,%d] illegal row-index %d", __LINE__, ri)); + + phw = &beri_pcpu[cpu]->pc_beripmcs[ri]; + KASSERT(phw->phw_pmc == NULL, + ("[beri,%d] PHW pmc %p non-NULL", __LINE__, phw->phw_pmc)); + + return (0); +} + +static int +beri_describe(int cpu, int ri, struct pmc_info *pi, struct pmc **ppmc) +{ + struct pmc_hw *phw; + char beri_name[PMC_NAME_MAX]; + int error; + + KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), + ("[beri,%d], illegal CPU %d", __LINE__, cpu)); + KASSERT(ri >= 0 && ri < beri_npmcs, + ("[beri,%d] row-index %d out of range", __LINE__, ri)); + + phw = &beri_pcpu[cpu]->pc_beripmcs[ri]; + snprintf(beri_name, sizeof(beri_name), "MIPS-%d", ri); + if ((error = copystr(beri_name, pi->pm_name, PMC_NAME_MAX, + NULL)) != 0) + return error; + pi->pm_class = beri_pmc_spec.ps_cpuclass; + if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) { + pi->pm_enabled = TRUE; + *ppmc = phw->phw_pmc; + } else { + pi->pm_enabled = FALSE; + *ppmc = NULL; + } + + return (0); +} + +static int +beri_get_config(int cpu, int ri, struct pmc **ppm) +{ + + *ppm = beri_pcpu[cpu]->pc_beripmcs[ri].phw_pmc; + + return (0); +} + +static int +beri_pmc_switch_in(struct pmc_cpu *pc, struct pmc_process *pp) +{ + + return (0); +} + +static int +beri_pmc_switch_out(struct pmc_cpu *pc, struct pmc_process *pp) +{ + + return (0); +} + +static int +beri_pcpu_init(struct 
pmc_mdep *md, int cpu) +{ + int first_ri, i; + struct pmc_cpu *pc; + struct beri_cpu *pac; + struct pmc_hw *phw; + + KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), + ("[beri,%d] wrong cpu number %d", __LINE__, cpu)); + PMCDBG1(MDP,INI,1,"beri-init cpu=%d", cpu); + + beri_pcpu[cpu] = pac = malloc(sizeof(struct beri_cpu), M_PMC, + M_WAITOK|M_ZERO); + pac->pc_beripmcs = malloc(sizeof(struct pmc_hw) * beri_npmcs, + M_PMC, M_WAITOK|M_ZERO); + pc = pmc_pcpu[cpu]; + first_ri = md->pmd_classdep[PMC_MDEP_CLASS_INDEX_MIPS].pcd_ri; + KASSERT(pc != NULL, ("[beri,%d] NULL per-cpu pointer", __LINE__)); + + for (i = 0, phw = pac->pc_beripmcs; i < beri_npmcs; i++, phw++) { + phw->phw_state = PMC_PHW_FLAG_IS_ENABLED | + PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(i); + phw->phw_pmc = NULL; + pc->pc_hwpmcs[i + first_ri] = phw; + } + + return (0); +} + +static int +beri_pcpu_fini(struct pmc_mdep *md, int cpu) +{ + + return (0); +} + +struct pmc_mdep * +pmc_beri_initialize() +{ + struct pmc_mdep *pmc_mdep; + struct pmc_classdep *pcd; + + snprintf(pmc_cpuid, sizeof(pmc_cpuid), "beri"); + + beri_npmcs = 2; + + PMCDBG1(MDP,INI,1,"beri-init npmcs=%d", beri_npmcs); + + /* + * Allocate space for pointers to PMC HW descriptors and for + * the MDEP structure used by MI code. + */ + beri_pcpu = malloc(sizeof(struct beri_cpu *) * pmc_cpu_max(), M_PMC, + M_WAITOK|M_ZERO); + + /* Just one class */ + pmc_mdep = pmc_mdep_alloc(1); + + pmc_mdep->pmd_cputype = beri_pmc_spec.ps_cputype; + + pcd = &pmc_mdep->pmd_classdep[PMC_MDEP_CLASS_INDEX_MIPS]; + pcd->pcd_caps = beri_pmc_spec.ps_capabilities; + pcd->pcd_class = beri_pmc_spec.ps_cpuclass; + pcd->pcd_num = beri_npmcs; + pcd->pcd_ri = pmc_mdep->pmd_npmc; + pcd->pcd_width = beri_pmc_spec.ps_counter_width; + + pcd->pcd_allocate_pmc = beri_allocate_pmc; + pcd->pcd_config_pmc = beri_config_pmc; + pcd->pcd_pcpu_fini = beri_pcpu_fini; + pcd->pcd_pcpu_init = beri_pcpu_init; + pcd->pcd_describe = beri_describe; + pcd->pcd_get_config = beri_get_config; + pcd->pcd_read_pmc = beri_read_pmc; + pcd->pcd_release_pmc = beri_release_pmc; + pcd->pcd_start_pmc = beri_start_pmc; + pcd->pcd_stop_pmc = beri_stop_pmc; + pcd->pcd_write_pmc = beri_write_pmc; + + pmc_mdep->pmd_intr = NULL; + pmc_mdep->pmd_switch_in = beri_pmc_switch_in; + pmc_mdep->pmd_switch_out = beri_pmc_switch_out; + + pmc_mdep->pmd_npmc += beri_npmcs; + + return (pmc_mdep); +} + +void +pmc_beri_finalize(struct pmc_mdep *md) +{ + +} + +struct pmc_mdep * +pmc_md_initialize() +{ + + return (pmc_beri_initialize()); +} + +void +pmc_md_finalize(struct pmc_mdep *md) +{ + + return (pmc_beri_finalize(md)); +} + +int +pmc_save_kernel_callchain(uintptr_t *cc, int nframes, + struct trapframe *tf) +{ + + return (0); +} + +int +pmc_save_user_callchain(uintptr_t *cc, int nframes, + struct trapframe *tf) +{ + + return (0); +} Property changes on: projects/clang900-import/sys/dev/hwpmc/hwpmc_beri.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang900-import/sys/dev/hwpmc/hwpmc_beri.h =================================================================== --- projects/clang900-import/sys/dev/hwpmc/hwpmc_beri.h (nonexistent) +++ projects/clang900-import/sys/dev/hwpmc/hwpmc_beri.h (revision 352537) @@ -0,0 +1,107 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 
2019 Alex Richardson + * + * This software was developed by SRI International and the University of + * Cambridge Computer Laboratory (Department of Computer Science and + * Technology) under DARPA contract HR0011-18-C-0016 ("ECATS"), as part of the + * DARPA SSITH research programme. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _DEV_HWPMC_HWPMC_BERI_H_ +#define _DEV_HWPMC_HWPMC_BERI_H_ + +#define STATCOUNTER_ITEM(name, X, Y) \ +static inline uint64_t statcounters_get_##name##_count(void) \ +{ \ + uint64_t ret; \ + __asm __volatile( \ + ".word (0x1f << 26) | (0x0 << 21) | \ + (12 << 16) | ("#X" << 11) | \ + ( "#Y" << 6) | 0x3b\n\t" \ + "move %0,$12" : "=r" (ret) :: "$12"); \ + return (ret); \ +} + +STATCOUNTER_ITEM(cycle,2,0) +STATCOUNTER_ITEM(inst,4,0) +STATCOUNTER_ITEM(inst_user,4,1) +STATCOUNTER_ITEM(inst_kernel,4,2) +STATCOUNTER_ITEM(imprecise_setbounds,4,3) +STATCOUNTER_ITEM(unrepresentable_caps,4,4) +STATCOUNTER_ITEM(itlb_miss,5,0) +STATCOUNTER_ITEM(dtlb_miss,6,0) +STATCOUNTER_ITEM(icache_write_hit,8,0) +STATCOUNTER_ITEM(icache_write_miss,8,1) +STATCOUNTER_ITEM(icache_read_hit,8,2) +STATCOUNTER_ITEM(icache_read_miss,8,3) +STATCOUNTER_ITEM(icache_evict,8,6) +STATCOUNTER_ITEM(dcache_write_hit,9,0) +STATCOUNTER_ITEM(dcache_write_miss,9,1) +STATCOUNTER_ITEM(dcache_read_hit,9,2) +STATCOUNTER_ITEM(dcache_read_miss,9,3) +STATCOUNTER_ITEM(dcache_evict,9,6) +STATCOUNTER_ITEM(dcache_set_tag_write,9,8) +STATCOUNTER_ITEM(dcache_set_tag_read,9,9) +STATCOUNTER_ITEM(l2cache_write_hit,10,0) +STATCOUNTER_ITEM(l2cache_write_miss,10,1) +STATCOUNTER_ITEM(l2cache_read_hit,10,2) +STATCOUNTER_ITEM(l2cache_read_miss,10,3) +STATCOUNTER_ITEM(l2cache_evict,10,6) +STATCOUNTER_ITEM(l2cache_set_tag_write,10,8) +STATCOUNTER_ITEM(l2cache_set_tag_read,10,9) +STATCOUNTER_ITEM(mem_byte_read,11,0) +STATCOUNTER_ITEM(mem_byte_write,11,1) +STATCOUNTER_ITEM(mem_hword_read,11,2) +STATCOUNTER_ITEM(mem_hword_write,11,3) +STATCOUNTER_ITEM(mem_word_read,11,4) +STATCOUNTER_ITEM(mem_word_write,11,5) +STATCOUNTER_ITEM(mem_dword_read,11,6) +STATCOUNTER_ITEM(mem_dword_write,11,7) +STATCOUNTER_ITEM(mem_cap_read,11,8) +STATCOUNTER_ITEM(mem_cap_write,11,9) +STATCOUNTER_ITEM(mem_cap_read_tag_set,11,10) +STATCOUNTER_ITEM(mem_cap_write_tag_set,11,11) 
+STATCOUNTER_ITEM(tagcache_write_hit,12,0) +STATCOUNTER_ITEM(tagcache_write_miss,12,1) +STATCOUNTER_ITEM(tagcache_read_hit,12,2) +STATCOUNTER_ITEM(tagcache_read_miss,12,3) +STATCOUNTER_ITEM(tagcache_evict,12,6) +STATCOUNTER_ITEM(l2cachemaster_read_req,13,0) +STATCOUNTER_ITEM(l2cachemaster_write_req,13,1) +STATCOUNTER_ITEM(l2cachemaster_write_req_flit,13,2) +STATCOUNTER_ITEM(l2cachemaster_read_rsp,13,3) +STATCOUNTER_ITEM(l2cachemaster_read_rsp_flit,13,4) +STATCOUNTER_ITEM(l2cachemaster_write_rsp,13,5) +STATCOUNTER_ITEM(tagcachemaster_read_req,14,0) +STATCOUNTER_ITEM(tagcachemaster_write_req,14,1) +STATCOUNTER_ITEM(tagcachemaster_write_req_flit,14,2) +STATCOUNTER_ITEM(tagcachemaster_read_rsp,14,3) +STATCOUNTER_ITEM(tagcachemaster_read_rsp_flit,14,4) +STATCOUNTER_ITEM(tagcachemaster_write_rsp,14,5) + +#endif /* !_DEV_HWPMC_HWPMC_BERI_H_ */ Property changes on: projects/clang900-import/sys/dev/hwpmc/hwpmc_beri.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang900-import/sys/dev/hwpmc/pmc_events.h =================================================================== --- projects/clang900-import/sys/dev/hwpmc/pmc_events.h (revision 352536) +++ projects/clang900-import/sys/dev/hwpmc/pmc_events.h (revision 352537) @@ -1,1817 +1,1881 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 Joseph Koshy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _DEV_HWPMC_PMC_EVENTS_H_ #define _DEV_HWPMC_PMC_EVENTS_H_ /* * Note: Documentation on adding events can be found both in * the source tree at src/share/doc/papers/hwpmc/hwpmc.ms * as well as on-line at: * * https://wiki.freebsd.org/PmcTools/PmcHardwareHowTo * * Please refer to those resources before you attempt to modify * this file or the hwpmc driver/subsystem. */ /* * PMC event codes. * * __PMC_EV(CLASS, SYMBOLIC-NAME) * */ /* timestamp counters. */ #define __PMC_EV_TSC() \ __PMC_EV(TSC, TSC) #define PMC_EV_TSC_FIRST PMC_EV_TSC_TSC #define PMC_EV_TSC_LAST PMC_EV_TSC_TSC /* * Software events are dynamically defined. 
*/ #define PMC_EV_DYN_COUNT 0x1000 #define PMC_EV_SOFT_FIRST 0x20000 #define PMC_EV_SOFT_LAST (PMC_EV_SOFT_FIRST + PMC_EV_DYN_COUNT - 1) /* * AMD K7 Events, from "The AMD Athlon(tm) Processor x86 Code * Optimization Guide" [Doc#22007K, Feb 2002] */ #define __PMC_EV_K7() \ __PMC_EV(K7, DC_ACCESSES) \ __PMC_EV(K7, DC_MISSES) \ __PMC_EV(K7, DC_REFILLS_FROM_L2) \ __PMC_EV(K7, DC_REFILLS_FROM_SYSTEM) \ __PMC_EV(K7, DC_WRITEBACKS) \ __PMC_EV(K7, L1_DTLB_MISS_AND_L2_DTLB_HITS) \ __PMC_EV(K7, L1_AND_L2_DTLB_MISSES) \ __PMC_EV(K7, MISALIGNED_REFERENCES) \ __PMC_EV(K7, IC_FETCHES) \ __PMC_EV(K7, IC_MISSES) \ __PMC_EV(K7, L1_ITLB_MISSES) \ __PMC_EV(K7, L1_L2_ITLB_MISSES) \ __PMC_EV(K7, RETIRED_INSTRUCTIONS) \ __PMC_EV(K7, RETIRED_OPS) \ __PMC_EV(K7, RETIRED_BRANCHES) \ __PMC_EV(K7, RETIRED_BRANCHES_MISPREDICTED) \ __PMC_EV(K7, RETIRED_TAKEN_BRANCHES) \ __PMC_EV(K7, RETIRED_TAKEN_BRANCHES_MISPREDICTED) \ __PMC_EV(K7, RETIRED_FAR_CONTROL_TRANSFERS) \ __PMC_EV(K7, RETIRED_RESYNC_BRANCHES) \ __PMC_EV(K7, INTERRUPTS_MASKED_CYCLES) \ __PMC_EV(K7, INTERRUPTS_MASKED_WHILE_PENDING_CYCLES) \ __PMC_EV(K7, HARDWARE_INTERRUPTS) #define PMC_EV_K7_FIRST PMC_EV_K7_DC_ACCESSES #define PMC_EV_K7_LAST PMC_EV_K7_HARDWARE_INTERRUPTS /* AMD K8 PMCs */ #define __PMC_EV_K8() \ __PMC_EV(K8, FP_DISPATCHED_FPU_OPS) \ __PMC_EV(K8, FP_CYCLES_WITH_NO_FPU_OPS_RETIRED) \ __PMC_EV(K8, FP_DISPATCHED_FPU_FAST_FLAG_OPS) \ __PMC_EV(K8, LS_SEGMENT_REGISTER_LOAD) \ __PMC_EV(K8, LS_MICROARCHITECTURAL_RESYNC_BY_SELF_MODIFYING_CODE) \ __PMC_EV(K8, LS_MICROARCHITECTURAL_RESYNC_BY_SNOOP) \ __PMC_EV(K8, LS_BUFFER2_FULL) \ __PMC_EV(K8, LS_LOCKED_OPERATION) \ __PMC_EV(K8, LS_MICROARCHITECTURAL_LATE_CANCEL) \ __PMC_EV(K8, LS_RETIRED_CFLUSH_INSTRUCTIONS) \ __PMC_EV(K8, LS_RETIRED_CPUID_INSTRUCTIONS) \ __PMC_EV(K8, DC_ACCESS) \ __PMC_EV(K8, DC_MISS) \ __PMC_EV(K8, DC_REFILL_FROM_L2) \ __PMC_EV(K8, DC_REFILL_FROM_SYSTEM) \ __PMC_EV(K8, DC_COPYBACK) \ __PMC_EV(K8, DC_L1_DTLB_MISS_AND_L2_DTLB_HIT) \ __PMC_EV(K8, DC_L1_DTLB_MISS_AND_L2_DTLB_MISS) \ __PMC_EV(K8, DC_MISALIGNED_DATA_REFERENCE) \ __PMC_EV(K8, DC_MICROARCHITECTURAL_LATE_CANCEL) \ __PMC_EV(K8, DC_MICROARCHITECTURAL_EARLY_CANCEL) \ __PMC_EV(K8, DC_ONE_BIT_ECC_ERROR) \ __PMC_EV(K8, DC_DISPATCHED_PREFETCH_INSTRUCTIONS) \ __PMC_EV(K8, DC_DCACHE_ACCESSES_BY_LOCKS) \ __PMC_EV(K8, BU_CPU_CLK_UNHALTED) \ __PMC_EV(K8, BU_INTERNAL_L2_REQUEST) \ __PMC_EV(K8, BU_FILL_REQUEST_L2_MISS) \ __PMC_EV(K8, BU_FILL_INTO_L2) \ __PMC_EV(K8, IC_FETCH) \ __PMC_EV(K8, IC_MISS) \ __PMC_EV(K8, IC_REFILL_FROM_L2) \ __PMC_EV(K8, IC_REFILL_FROM_SYSTEM) \ __PMC_EV(K8, IC_L1_ITLB_MISS_AND_L2_ITLB_HIT) \ __PMC_EV(K8, IC_L1_ITLB_MISS_AND_L2_ITLB_MISS) \ __PMC_EV(K8, IC_MICROARCHITECTURAL_RESYNC_BY_SNOOP) \ __PMC_EV(K8, IC_INSTRUCTION_FETCH_STALL) \ __PMC_EV(K8, IC_RETURN_STACK_HIT) \ __PMC_EV(K8, IC_RETURN_STACK_OVERFLOW) \ __PMC_EV(K8, FR_RETIRED_X86_INSTRUCTIONS) \ __PMC_EV(K8, FR_RETIRED_UOPS) \ __PMC_EV(K8, FR_RETIRED_BRANCHES) \ __PMC_EV(K8, FR_RETIRED_BRANCHES_MISPREDICTED) \ __PMC_EV(K8, FR_RETIRED_TAKEN_BRANCHES) \ __PMC_EV(K8, FR_RETIRED_TAKEN_BRANCHES_MISPREDICTED) \ __PMC_EV(K8, FR_RETIRED_FAR_CONTROL_TRANSFERS) \ __PMC_EV(K8, FR_RETIRED_RESYNCS) \ __PMC_EV(K8, FR_RETIRED_NEAR_RETURNS) \ __PMC_EV(K8, FR_RETIRED_NEAR_RETURNS_MISPREDICTED) \ __PMC_EV(K8, FR_RETIRED_TAKEN_BRANCHES_MISPREDICTED_BY_ADDR_MISCOMPARE) \ __PMC_EV(K8, FR_RETIRED_FPU_INSTRUCTIONS) \ __PMC_EV(K8, FR_RETIRED_FASTPATH_DOUBLE_OP_INSTRUCTIONS) \ __PMC_EV(K8, FR_INTERRUPTS_MASKED_CYCLES) \ __PMC_EV(K8, FR_INTERRUPTS_MASKED_WHILE_PENDING_CYCLES) \ 
__PMC_EV(K8, FR_TAKEN_HARDWARE_INTERRUPTS) \ __PMC_EV(K8, FR_DECODER_EMPTY) \ __PMC_EV(K8, FR_DISPATCH_STALLS) \ __PMC_EV(K8, FR_DISPATCH_STALL_FROM_BRANCH_ABORT_TO_RETIRE) \ __PMC_EV(K8, FR_DISPATCH_STALL_FOR_SERIALIZATION) \ __PMC_EV(K8, FR_DISPATCH_STALL_FOR_SEGMENT_LOAD) \ __PMC_EV(K8, FR_DISPATCH_STALL_WHEN_REORDER_BUFFER_IS_FULL) \ __PMC_EV(K8, FR_DISPATCH_STALL_WHEN_RESERVATION_STATIONS_ARE_FULL) \ __PMC_EV(K8, FR_DISPATCH_STALL_WHEN_FPU_IS_FULL) \ __PMC_EV(K8, FR_DISPATCH_STALL_WHEN_LS_IS_FULL) \ __PMC_EV(K8, FR_DISPATCH_STALL_WHEN_WAITING_FOR_ALL_TO_BE_QUIET) \ __PMC_EV(K8, FR_DISPATCH_STALL_WHEN_FAR_XFER_OR_RESYNC_BRANCH_PENDING) \ __PMC_EV(K8, FR_FPU_EXCEPTIONS) \ __PMC_EV(K8, FR_NUMBER_OF_BREAKPOINTS_FOR_DR0) \ __PMC_EV(K8, FR_NUMBER_OF_BREAKPOINTS_FOR_DR1) \ __PMC_EV(K8, FR_NUMBER_OF_BREAKPOINTS_FOR_DR2) \ __PMC_EV(K8, FR_NUMBER_OF_BREAKPOINTS_FOR_DR3) \ __PMC_EV(K8, NB_MEMORY_CONTROLLER_PAGE_ACCESS_EVENT) \ __PMC_EV(K8, NB_MEMORY_CONTROLLER_PAGE_TABLE_OVERFLOW) \ __PMC_EV(K8, NB_MEMORY_CONTROLLER_DRAM_COMMAND_SLOTS_MISSED) \ __PMC_EV(K8, NB_MEMORY_CONTROLLER_TURNAROUND) \ __PMC_EV(K8, NB_MEMORY_CONTROLLER_BYPASS_SATURATION) \ __PMC_EV(K8, NB_SIZED_COMMANDS) \ __PMC_EV(K8, NB_PROBE_RESULT) \ __PMC_EV(K8, NB_HT_BUS0_BANDWIDTH) \ __PMC_EV(K8, NB_HT_BUS1_BANDWIDTH) \ __PMC_EV(K8, NB_HT_BUS2_BANDWIDTH) #define PMC_EV_K8_FIRST PMC_EV_K8_FP_DISPATCHED_FPU_OPS #define PMC_EV_K8_LAST PMC_EV_K8_NB_HT_BUS2_BANDWIDTH /* * Events supported by Intel architectural fixed function counters, * from the "Intel 64 and IA-32 Architectures Software Developer's * Manual Volume 3B: System Programming Guide, Part 2", July 2008. */ #define __PMC_EV_IAF() \ __PMC_EV(IAF, INSTR_RETIRED_ANY) \ __PMC_EV(IAF, CPU_CLK_UNHALTED_CORE) \ __PMC_EV(IAF, CPU_CLK_UNHALTED_REF) #define PMC_EV_IAF_FIRST PMC_EV_IAF_INSTR_RETIRED_ANY #define PMC_EV_IAF_LAST PMC_EV_IAF_CPU_CLK_UNHALTED_REF #define __PMC_EV_ALIAS_IAF() \ __PMC_EV_ALIAS("instruction-retired", IAF_INSTR_RETIRED_ANY) \ __PMC_EV_ALIAS("unhalted-core-cycles", IAF_CPU_CLK_UNHALTED_CORE) \ __PMC_EV_ALIAS("unhalted-reference-cycles", IAF_CPU_CLK_UNHALTED_REF) #define PMC_EV_IAP_FIRST PMC_EV_IAP_ARCH_BR_INS_RET #define PMC_EV_IAP_LAST PMC_EV_IAP_EVENT_FDH_40H /* * Map "architectural" event names to event ids. */ #define __PMC_EV_ALIAS_INTEL_ARCHITECTURAL() \ __PMC_EV_ALIAS("branch-instruction-retired", IAP_ARCH_BR_INS_RET) \ __PMC_EV_ALIAS("branch-misses-retired", IAP_ARCH_BR_MIS_RET) \ __PMC_EV_ALIAS("instruction-retired", IAP_ARCH_INS_RET) \ __PMC_EV_ALIAS("llc-misses", IAP_ARCH_LLC_MIS) \ __PMC_EV_ALIAS("llc-reference", IAP_ARCH_LLC_REF) \ __PMC_EV_ALIAS("unhalted-reference-cycles", IAP_ARCH_UNH_REF_CYC) \ __PMC_EV_ALIAS("unhalted-core-cycles", IAP_ARCH_UNH_COR_CYC) #define __PMC_EV_UCP() \ __PMC_EV(UCP, EVENT_0CH_04H_E) \ __PMC_EV(UCP, EVENT_0CH_04H_F) \ __PMC_EV(UCP, EVENT_0CH_04H_M) \ __PMC_EV(UCP, EVENT_0CH_04H_S) \ __PMC_EV(UCP, EVENT_0CH_08H_E) \ __PMC_EV(UCP, EVENT_0CH_08H_F) \ __PMC_EV(UCP, EVENT_0CH_08H_M) \ __PMC_EV(UCP, EVENT_0CH_08H_S) \ /* * Intel XScale events from: * * Intel XScale Core Developer's Manual * January, 2004, #27347302 * * 3rd Generation Intel XScale Microarchitecture * Developer's Manual * May 2007, #31628302 * * First 14 events are for 1st and 2nd Generation Intel XScale cores. The * remaining are available only on 3rd Generation Intel XScale cores. 
*/ #define __PMC_EV_XSCALE() \ __PMC_EV(XSCALE, IC_FETCH) \ __PMC_EV(XSCALE, IC_MISS) \ __PMC_EV(XSCALE, DATA_DEPENDENCY_STALLED) \ __PMC_EV(XSCALE, ITLB_MISS) \ __PMC_EV(XSCALE, DTLB_MISS) \ __PMC_EV(XSCALE, BRANCH_RETIRED) \ __PMC_EV(XSCALE, BRANCH_MISPRED) \ __PMC_EV(XSCALE, INSTR_RETIRED) \ __PMC_EV(XSCALE, DC_FULL_CYCLE) \ __PMC_EV(XSCALE, DC_FULL_CONTIG) \ __PMC_EV(XSCALE, DC_ACCESS) \ __PMC_EV(XSCALE, DC_MISS) \ __PMC_EV(XSCALE, DC_WRITEBACK) \ __PMC_EV(XSCALE, PC_CHANGE) \ __PMC_EV(XSCALE, BRANCH_RETIRED_ALL) \ __PMC_EV(XSCALE, INSTR_CYCLE) \ __PMC_EV(XSCALE, CP_STALL) \ __PMC_EV(XSCALE, PC_CHANGE_ALL) \ __PMC_EV(XSCALE, PIPELINE_FLUSH) \ __PMC_EV(XSCALE, BACKEND_STALL) \ __PMC_EV(XSCALE, MULTIPLIER_USE) \ __PMC_EV(XSCALE, MULTIPLIER_STALLED) \ __PMC_EV(XSCALE, DATA_CACHE_STALLED) \ __PMC_EV(XSCALE, L2_CACHE_REQ) \ __PMC_EV(XSCALE, L2_CACHE_MISS) \ __PMC_EV(XSCALE, ADDRESS_BUS_TRANS) \ __PMC_EV(XSCALE, SELF_ADDRESS_BUS_TRANS) \ __PMC_EV(XSCALE, DATA_BUS_TRANS) #define PMC_EV_XSCALE_FIRST PMC_EV_XSCALE_IC_FETCH #define PMC_EV_XSCALE_LAST PMC_EV_XSCALE_DATA_BUS_TRANS /* * ARMv7 Events */ #define __PMC_EV_ARMV7() \ __PMC_EV(ARMV7, EVENT_00H) \ __PMC_EV(ARMV7, EVENT_01H) \ __PMC_EV(ARMV7, EVENT_02H) \ __PMC_EV(ARMV7, EVENT_03H) \ __PMC_EV(ARMV7, EVENT_04H) \ __PMC_EV(ARMV7, EVENT_05H) \ __PMC_EV(ARMV7, EVENT_06H) \ __PMC_EV(ARMV7, EVENT_07H) \ __PMC_EV(ARMV7, EVENT_08H) \ __PMC_EV(ARMV7, EVENT_09H) \ __PMC_EV(ARMV7, EVENT_0AH) \ __PMC_EV(ARMV7, EVENT_0BH) \ __PMC_EV(ARMV7, EVENT_0CH) \ __PMC_EV(ARMV7, EVENT_0DH) \ __PMC_EV(ARMV7, EVENT_0EH) \ __PMC_EV(ARMV7, EVENT_0FH) \ __PMC_EV(ARMV7, EVENT_10H) \ __PMC_EV(ARMV7, EVENT_11H) \ __PMC_EV(ARMV7, EVENT_12H) \ __PMC_EV(ARMV7, EVENT_13H) \ __PMC_EV(ARMV7, EVENT_14H) \ __PMC_EV(ARMV7, EVENT_15H) \ __PMC_EV(ARMV7, EVENT_16H) \ __PMC_EV(ARMV7, EVENT_17H) \ __PMC_EV(ARMV7, EVENT_18H) \ __PMC_EV(ARMV7, EVENT_19H) \ __PMC_EV(ARMV7, EVENT_1AH) \ __PMC_EV(ARMV7, EVENT_1BH) \ __PMC_EV(ARMV7, EVENT_1CH) \ __PMC_EV(ARMV7, EVENT_1DH) \ __PMC_EV(ARMV7, EVENT_1EH) \ __PMC_EV(ARMV7, EVENT_1FH) \ __PMC_EV(ARMV7, EVENT_20H) \ __PMC_EV(ARMV7, EVENT_21H) \ __PMC_EV(ARMV7, EVENT_22H) \ __PMC_EV(ARMV7, EVENT_23H) \ __PMC_EV(ARMV7, EVENT_24H) \ __PMC_EV(ARMV7, EVENT_25H) \ __PMC_EV(ARMV7, EVENT_26H) \ __PMC_EV(ARMV7, EVENT_27H) \ __PMC_EV(ARMV7, EVENT_28H) \ __PMC_EV(ARMV7, EVENT_29H) \ __PMC_EV(ARMV7, EVENT_2AH) \ __PMC_EV(ARMV7, EVENT_2BH) \ __PMC_EV(ARMV7, EVENT_2CH) \ __PMC_EV(ARMV7, EVENT_2DH) \ __PMC_EV(ARMV7, EVENT_2EH) \ __PMC_EV(ARMV7, EVENT_2FH) \ __PMC_EV(ARMV7, EVENT_30H) \ __PMC_EV(ARMV7, EVENT_31H) \ __PMC_EV(ARMV7, EVENT_32H) \ __PMC_EV(ARMV7, EVENT_33H) \ __PMC_EV(ARMV7, EVENT_34H) \ __PMC_EV(ARMV7, EVENT_35H) \ __PMC_EV(ARMV7, EVENT_36H) \ __PMC_EV(ARMV7, EVENT_37H) \ __PMC_EV(ARMV7, EVENT_38H) \ __PMC_EV(ARMV7, EVENT_39H) \ __PMC_EV(ARMV7, EVENT_3AH) \ __PMC_EV(ARMV7, EVENT_3BH) \ __PMC_EV(ARMV7, EVENT_3CH) \ __PMC_EV(ARMV7, EVENT_3DH) \ __PMC_EV(ARMV7, EVENT_3EH) \ __PMC_EV(ARMV7, EVENT_3FH) \ __PMC_EV(ARMV7, EVENT_40H) \ __PMC_EV(ARMV7, EVENT_41H) \ __PMC_EV(ARMV7, EVENT_42H) \ __PMC_EV(ARMV7, EVENT_43H) \ __PMC_EV(ARMV7, EVENT_44H) \ __PMC_EV(ARMV7, EVENT_45H) \ __PMC_EV(ARMV7, EVENT_46H) \ __PMC_EV(ARMV7, EVENT_47H) \ __PMC_EV(ARMV7, EVENT_48H) \ __PMC_EV(ARMV7, EVENT_49H) \ __PMC_EV(ARMV7, EVENT_4AH) \ __PMC_EV(ARMV7, EVENT_4BH) \ __PMC_EV(ARMV7, EVENT_4CH) \ __PMC_EV(ARMV7, EVENT_4DH) \ __PMC_EV(ARMV7, EVENT_4EH) \ __PMC_EV(ARMV7, EVENT_4FH) \ __PMC_EV(ARMV7, EVENT_50H) \ __PMC_EV(ARMV7, EVENT_51H) \ __PMC_EV(ARMV7, EVENT_52H) \ 
__PMC_EV(ARMV7, EVENT_53H) \ __PMC_EV(ARMV7, EVENT_54H) \ __PMC_EV(ARMV7, EVENT_55H) \ __PMC_EV(ARMV7, EVENT_56H) \ __PMC_EV(ARMV7, EVENT_57H) \ __PMC_EV(ARMV7, EVENT_58H) \ __PMC_EV(ARMV7, EVENT_59H) \ __PMC_EV(ARMV7, EVENT_5AH) \ __PMC_EV(ARMV7, EVENT_5BH) \ __PMC_EV(ARMV7, EVENT_5CH) \ __PMC_EV(ARMV7, EVENT_5DH) \ __PMC_EV(ARMV7, EVENT_5EH) \ __PMC_EV(ARMV7, EVENT_5FH) \ __PMC_EV(ARMV7, EVENT_60H) \ __PMC_EV(ARMV7, EVENT_61H) \ __PMC_EV(ARMV7, EVENT_62H) \ __PMC_EV(ARMV7, EVENT_63H) \ __PMC_EV(ARMV7, EVENT_64H) \ __PMC_EV(ARMV7, EVENT_65H) \ __PMC_EV(ARMV7, EVENT_66H) \ __PMC_EV(ARMV7, EVENT_67H) \ __PMC_EV(ARMV7, EVENT_68H) \ __PMC_EV(ARMV7, EVENT_69H) \ __PMC_EV(ARMV7, EVENT_6AH) \ __PMC_EV(ARMV7, EVENT_6BH) \ __PMC_EV(ARMV7, EVENT_6CH) \ __PMC_EV(ARMV7, EVENT_6DH) \ __PMC_EV(ARMV7, EVENT_6EH) \ __PMC_EV(ARMV7, EVENT_6FH) \ __PMC_EV(ARMV7, EVENT_70H) \ __PMC_EV(ARMV7, EVENT_71H) \ __PMC_EV(ARMV7, EVENT_72H) \ __PMC_EV(ARMV7, EVENT_73H) \ __PMC_EV(ARMV7, EVENT_74H) \ __PMC_EV(ARMV7, EVENT_75H) \ __PMC_EV(ARMV7, EVENT_76H) \ __PMC_EV(ARMV7, EVENT_77H) \ __PMC_EV(ARMV7, EVENT_78H) \ __PMC_EV(ARMV7, EVENT_79H) \ __PMC_EV(ARMV7, EVENT_7AH) \ __PMC_EV(ARMV7, EVENT_7BH) \ __PMC_EV(ARMV7, EVENT_7CH) \ __PMC_EV(ARMV7, EVENT_7DH) \ __PMC_EV(ARMV7, EVENT_7EH) \ __PMC_EV(ARMV7, EVENT_7FH) \ __PMC_EV(ARMV7, EVENT_80H) \ __PMC_EV(ARMV7, EVENT_81H) \ __PMC_EV(ARMV7, EVENT_82H) \ __PMC_EV(ARMV7, EVENT_83H) \ __PMC_EV(ARMV7, EVENT_84H) \ __PMC_EV(ARMV7, EVENT_85H) \ __PMC_EV(ARMV7, EVENT_86H) \ __PMC_EV(ARMV7, EVENT_87H) \ __PMC_EV(ARMV7, EVENT_88H) \ __PMC_EV(ARMV7, EVENT_89H) \ __PMC_EV(ARMV7, EVENT_8AH) \ __PMC_EV(ARMV7, EVENT_8BH) \ __PMC_EV(ARMV7, EVENT_8CH) \ __PMC_EV(ARMV7, EVENT_8DH) \ __PMC_EV(ARMV7, EVENT_8EH) \ __PMC_EV(ARMV7, EVENT_8FH) \ __PMC_EV(ARMV7, EVENT_90H) \ __PMC_EV(ARMV7, EVENT_91H) \ __PMC_EV(ARMV7, EVENT_92H) \ __PMC_EV(ARMV7, EVENT_93H) \ __PMC_EV(ARMV7, EVENT_94H) \ __PMC_EV(ARMV7, EVENT_95H) \ __PMC_EV(ARMV7, EVENT_96H) \ __PMC_EV(ARMV7, EVENT_97H) \ __PMC_EV(ARMV7, EVENT_98H) \ __PMC_EV(ARMV7, EVENT_99H) \ __PMC_EV(ARMV7, EVENT_9AH) \ __PMC_EV(ARMV7, EVENT_9BH) \ __PMC_EV(ARMV7, EVENT_9CH) \ __PMC_EV(ARMV7, EVENT_9DH) \ __PMC_EV(ARMV7, EVENT_9EH) \ __PMC_EV(ARMV7, EVENT_9FH) \ __PMC_EV(ARMV7, EVENT_A0H) \ __PMC_EV(ARMV7, EVENT_A1H) \ __PMC_EV(ARMV7, EVENT_A2H) \ __PMC_EV(ARMV7, EVENT_A3H) \ __PMC_EV(ARMV7, EVENT_A4H) \ __PMC_EV(ARMV7, EVENT_A5H) \ __PMC_EV(ARMV7, EVENT_A6H) \ __PMC_EV(ARMV7, EVENT_A7H) \ __PMC_EV(ARMV7, EVENT_A8H) \ __PMC_EV(ARMV7, EVENT_A9H) \ __PMC_EV(ARMV7, EVENT_AAH) \ __PMC_EV(ARMV7, EVENT_ABH) \ __PMC_EV(ARMV7, EVENT_ACH) \ __PMC_EV(ARMV7, EVENT_ADH) \ __PMC_EV(ARMV7, EVENT_AEH) \ __PMC_EV(ARMV7, EVENT_AFH) \ __PMC_EV(ARMV7, EVENT_B0H) \ __PMC_EV(ARMV7, EVENT_B1H) \ __PMC_EV(ARMV7, EVENT_B2H) \ __PMC_EV(ARMV7, EVENT_B3H) \ __PMC_EV(ARMV7, EVENT_B4H) \ __PMC_EV(ARMV7, EVENT_B5H) \ __PMC_EV(ARMV7, EVENT_B6H) \ __PMC_EV(ARMV7, EVENT_B7H) \ __PMC_EV(ARMV7, EVENT_B8H) \ __PMC_EV(ARMV7, EVENT_B9H) \ __PMC_EV(ARMV7, EVENT_BAH) \ __PMC_EV(ARMV7, EVENT_BBH) \ __PMC_EV(ARMV7, EVENT_BCH) \ __PMC_EV(ARMV7, EVENT_BDH) \ __PMC_EV(ARMV7, EVENT_BEH) \ __PMC_EV(ARMV7, EVENT_BFH) \ __PMC_EV(ARMV7, EVENT_C0H) \ __PMC_EV(ARMV7, EVENT_C1H) \ __PMC_EV(ARMV7, EVENT_C2H) \ __PMC_EV(ARMV7, EVENT_C3H) \ __PMC_EV(ARMV7, EVENT_C4H) \ __PMC_EV(ARMV7, EVENT_C5H) \ __PMC_EV(ARMV7, EVENT_C6H) \ __PMC_EV(ARMV7, EVENT_C7H) \ __PMC_EV(ARMV7, EVENT_C8H) \ __PMC_EV(ARMV7, EVENT_C9H) \ __PMC_EV(ARMV7, EVENT_CAH) \ __PMC_EV(ARMV7, EVENT_CBH) \ __PMC_EV(ARMV7, EVENT_CCH) \ __PMC_EV(ARMV7, 
EVENT_CDH) \ __PMC_EV(ARMV7, EVENT_CEH) \ __PMC_EV(ARMV7, EVENT_CFH) \ __PMC_EV(ARMV7, EVENT_D0H) \ __PMC_EV(ARMV7, EVENT_D1H) \ __PMC_EV(ARMV7, EVENT_D2H) \ __PMC_EV(ARMV7, EVENT_D3H) \ __PMC_EV(ARMV7, EVENT_D4H) \ __PMC_EV(ARMV7, EVENT_D5H) \ __PMC_EV(ARMV7, EVENT_D6H) \ __PMC_EV(ARMV7, EVENT_D7H) \ __PMC_EV(ARMV7, EVENT_D8H) \ __PMC_EV(ARMV7, EVENT_D9H) \ __PMC_EV(ARMV7, EVENT_DAH) \ __PMC_EV(ARMV7, EVENT_DBH) \ __PMC_EV(ARMV7, EVENT_DCH) \ __PMC_EV(ARMV7, EVENT_DDH) \ __PMC_EV(ARMV7, EVENT_DEH) \ __PMC_EV(ARMV7, EVENT_DFH) \ __PMC_EV(ARMV7, EVENT_E0H) \ __PMC_EV(ARMV7, EVENT_E1H) \ __PMC_EV(ARMV7, EVENT_E2H) \ __PMC_EV(ARMV7, EVENT_E3H) \ __PMC_EV(ARMV7, EVENT_E4H) \ __PMC_EV(ARMV7, EVENT_E5H) \ __PMC_EV(ARMV7, EVENT_E6H) \ __PMC_EV(ARMV7, EVENT_E7H) \ __PMC_EV(ARMV7, EVENT_E8H) \ __PMC_EV(ARMV7, EVENT_E9H) \ __PMC_EV(ARMV7, EVENT_EAH) \ __PMC_EV(ARMV7, EVENT_EBH) \ __PMC_EV(ARMV7, EVENT_ECH) \ __PMC_EV(ARMV7, EVENT_EDH) \ __PMC_EV(ARMV7, EVENT_EEH) \ __PMC_EV(ARMV7, EVENT_EFH) \ __PMC_EV(ARMV7, EVENT_F0H) \ __PMC_EV(ARMV7, EVENT_F1H) \ __PMC_EV(ARMV7, EVENT_F2H) \ __PMC_EV(ARMV7, EVENT_F3H) \ __PMC_EV(ARMV7, EVENT_F4H) \ __PMC_EV(ARMV7, EVENT_F5H) \ __PMC_EV(ARMV7, EVENT_F6H) \ __PMC_EV(ARMV7, EVENT_F7H) \ __PMC_EV(ARMV7, EVENT_F8H) \ __PMC_EV(ARMV7, EVENT_F9H) \ __PMC_EV(ARMV7, EVENT_FAH) \ __PMC_EV(ARMV7, EVENT_FBH) \ __PMC_EV(ARMV7, EVENT_FCH) \ __PMC_EV(ARMV7, EVENT_FDH) \ __PMC_EV(ARMV7, EVENT_FEH) \ __PMC_EV(ARMV7, EVENT_FFH) #define PMC_EV_ARMV7_FIRST PMC_EV_ARMV7_EVENT_00H #define PMC_EV_ARMV7_LAST PMC_EV_ARMV7_EVENT_FFH #define __PMC_EV_ALIAS_ARMV7_COMMON() \ __PMC_EV_ALIAS("PMNC_SW_INCR", ARMV7_EVENT_00H) \ __PMC_EV_ALIAS("L1_ICACHE_REFILL", ARMV7_EVENT_01H) \ __PMC_EV_ALIAS("ITLB_REFILL", ARMV7_EVENT_02H) \ __PMC_EV_ALIAS("L1_DCACHE_REFILL", ARMV7_EVENT_03H) \ __PMC_EV_ALIAS("L1_DCACHE_ACCESS", ARMV7_EVENT_04H) \ __PMC_EV_ALIAS("DTLB_REFILL", ARMV7_EVENT_05H) \ __PMC_EV_ALIAS("MEM_READ", ARMV7_EVENT_06H) \ __PMC_EV_ALIAS("MEM_WRITE", ARMV7_EVENT_07H) \ __PMC_EV_ALIAS("EXC_TAKEN", ARMV7_EVENT_09H) \ __PMC_EV_ALIAS("EXC_EXECUTED", ARMV7_EVENT_0AH) \ __PMC_EV_ALIAS("CID_WRITE", ARMV7_EVENT_0BH) \ __PMC_EV_ALIAS("PC_WRITE", ARMV7_EVENT_0CH) \ __PMC_EV_ALIAS("PC_IMM_BRANCH", ARMV7_EVENT_0DH) \ __PMC_EV_ALIAS("MEM_UNALIGNED_ACCESS", ARMV7_EVENT_0FH) \ __PMC_EV_ALIAS("PC_BRANCH_MIS_PRED", ARMV7_EVENT_10H) \ __PMC_EV_ALIAS("CLOCK_CYCLES", ARMV7_EVENT_11H) \ __PMC_EV_ALIAS("PC_BRANCH_PRED", ARMV7_EVENT_12H) #define __PMC_EV_ALIAS_ARMV7_COMMON_A8() \ __PMC_EV_ALIAS_ARMV7_COMMON() \ __PMC_EV_ALIAS("INSTR_EXECUTED", ARMV7_EVENT_08H) \ __PMC_EV_ALIAS("PC_PROC_RETURN", ARMV7_EVENT_0EH) \ __PMC_EV_ALIAS("MEM_ACCESS", ARMV7_EVENT_13H) \ __PMC_EV_ALIAS("L1_ICACHE_ACCESS", ARMV7_EVENT_14H) \ __PMC_EV_ALIAS("L1_DCACHE_WB", ARMV7_EVENT_15H) \ __PMC_EV_ALIAS("L2_CACHE_ACCESS", ARMV7_EVENT_16H) \ __PMC_EV_ALIAS("L2_CACHE_REFILL", ARMV7_EVENT_17H) \ __PMC_EV_ALIAS("L2_CACHE_WB", ARMV7_EVENT_18H) \ __PMC_EV_ALIAS("BUS_ACCESS", ARMV7_EVENT_19H) \ __PMC_EV_ALIAS("MEM_ERROR", ARMV7_EVENT_1AH) \ __PMC_EV_ALIAS("INSTR_SPEC", ARMV7_EVENT_1BH) \ __PMC_EV_ALIAS("TTBR_WRITE", ARMV7_EVENT_1CH) \ __PMC_EV_ALIAS("BUS_CYCLES", ARMV7_EVENT_1DH) \ __PMC_EV_ALIAS("CPU_CYCLES", ARMV7_EVENT_FFH) #define __PMC_EV_ALIAS_ARMV7_CORTEX_A8() \ __PMC_EV_ALIAS_ARMV7_COMMON_A8() \ __PMC_EV_ALIAS("WRITE_BUF_FULL", ARMV7_EVENT_40H) \ __PMC_EV_ALIAS("L2_STORE_MERGED", ARMV7_EVENT_41H) \ __PMC_EV_ALIAS("L2_STORE_BUFFERABLE", ARMV7_EVENT_42H) \ __PMC_EV_ALIAS("L2_ACCESS", ARMV7_EVENT_43H) \ __PMC_EV_ALIAS("L2_CACHE_MISS", 
ARMV7_EVENT_44H) \ __PMC_EV_ALIAS("AXI_READ", ARMV7_EVENT_45H) \ __PMC_EV_ALIAS("AXI_WRITE", ARMV7_EVENT_46H) \ __PMC_EV_ALIAS("MEM_REPLAY_EVT", ARMV7_EVENT_47H) \ __PMC_EV_ALIAS("MEM_UNALIGNED_ACCESS_REPLAY", ARMV7_EVENT_48H) \ __PMC_EV_ALIAS("L1_DCACHE_HASH_MISS", ARMV7_EVENT_49H) \ __PMC_EV_ALIAS("L1_ICACHE_HASH_MISS", ARMV7_EVENT_4AH) \ __PMC_EV_ALIAS("L1_CACHE_PAGECOL_ALIAS", ARMV7_EVENT_4BH) \ __PMC_EV_ALIAS("L1_DCACHE_NEON_ACCESS", ARMV7_EVENT_4CH) \ __PMC_EV_ALIAS("L1_DCACHE_NEON_CACHEABLE", ARMV7_EVENT_4DH) \ __PMC_EV_ALIAS("L2_CACHE_NEON_MEM_ACCESS", ARMV7_EVENT_4EH) \ __PMC_EV_ALIAS("L2_CACHE_NEON_HIT", ARMV7_EVENT_4FH) \ __PMC_EV_ALIAS("L1_CACHE_ACCESS_NOCP15", ARMV7_EVENT_50H) \ __PMC_EV_ALIAS("RET_STACK_MISPREDICT", ARMV7_EVENT_51H) \ __PMC_EV_ALIAS("BRANCH_DIR_MISPREDICT", ARMV7_EVENT_52H) \ __PMC_EV_ALIAS("PRED_BRANCH_PRED_TAKEN", ARMV7_EVENT_53H) \ __PMC_EV_ALIAS("PRED_BRANCH_EXEC_TAKEN", ARMV7_EVENT_54H) \ __PMC_EV_ALIAS("OPS_ISSUED", ARMV7_EVENT_55H) \ __PMC_EV_ALIAS("CYCLES_NO_INSTRUCTION", ARMV7_EVENT_56H) \ __PMC_EV_ALIAS("INSTRUCTIONS_ISSUED_CYCLE", ARMV7_EVENT_57H) \ __PMC_EV_ALIAS("CYCLES_STALLED_NEON_MRC", ARMV7_EVENT_58H) \ __PMC_EV_ALIAS("CYCLES_STALLED_NEON_FULLQ", ARMV7_EVENT_59H) \ __PMC_EV_ALIAS("CYCLES_NONIDLE_NEON_INT", ARMV7_EVENT_5AH) \ __PMC_EV_ALIAS("PMUEXTIN0_EVT", ARMV7_EVENT_70H) \ __PMC_EV_ALIAS("PMUEXTIN1_EVT", ARMV7_EVENT_71H) \ __PMC_EV_ALIAS("PMUEXTIN_EVT", ARMV7_EVENT_72H) #define PMC_EV_ARMV7_CORTEX_A8_FIRST PMC_EV_ARMV7_PMNC_SW_INCR #define PMC_EV_ARMV7_CORTEX_A8_LAST PMC_EV_ARMV7_PMUEXTIN_EVT #define __PMC_EV_ALIAS_ARMV7_CORTEX_A9() \ __PMC_EV_ALIAS_ARMV7_COMMON() \ __PMC_EV_ALIAS("JAVA_BYTECODE", ARMV7_EVENT_40H) \ __PMC_EV_ALIAS("SOFTWARE_JAVA_BYTECODE", ARMV7_EVENT_41H) \ __PMC_EV_ALIAS("JAZELLE_BACKWARD_BRANCH", ARMV7_EVENT_42H) \ __PMC_EV_ALIAS("COHERENT_LINEFILL_MISSC", ARMV7_EVENT_50H) \ __PMC_EV_ALIAS("COHERENT_LINEFILL_HITC", ARMV7_EVENT_51H) \ __PMC_EV_ALIAS("INSTR_CACHE_DEPENDENT_STALL", ARMV7_EVENT_60H) \ __PMC_EV_ALIAS("DATA_CACHE_DEPENDENT_STALL", ARMV7_EVENT_61H) \ __PMC_EV_ALIAS("MAIN_TLB_MISS_STALL", ARMV7_EVENT_62H) \ __PMC_EV_ALIAS("STREX_PASSED", ARMV7_EVENT_63H) \ __PMC_EV_ALIAS("STREX_FAILED", ARMV7_EVENT_64H) \ __PMC_EV_ALIAS("DATA_EVICTION", ARMV7_EVENT_65H) \ __PMC_EV_ALIAS("ISSUE_DNOT_DISPATCH_ANY_INSTR", ARMV7_EVENT_66H) \ __PMC_EV_ALIAS("ISSUE_IS_EMPTY", ARMV7_EVENT_67H) \ __PMC_EV_ALIAS("INSTR_RENAMED", ARMV7_EVENT_68H) \ __PMC_EV_ALIAS("PREDICTABLE_FUNCTION_RETURN", ARMV7_EVENT_6EH) \ __PMC_EV_ALIAS("MAIN_EXECUTION_UNIT_PIPE", ARMV7_EVENT_70H) \ __PMC_EV_ALIAS("SECOND_EXECUTION_UNIT_PIPE", ARMV7_EVENT_71H) \ __PMC_EV_ALIAS("LOAD_STORE_PIPE", ARMV7_EVENT_72H) \ __PMC_EV_ALIAS("FLOATING_POINT_INSTR_RENAMED", ARMV7_EVENT_73H) \ __PMC_EV_ALIAS("NEON_INSTRS_RENAMED", ARMV7_EVENT_74H) \ __PMC_EV_ALIAS("PLD_STALL", ARMV7_EVENT_80H) \ __PMC_EV_ALIAS("WRITE_STALL", ARMV7_EVENT_81H) \ __PMC_EV_ALIAS("INSTR_MAIN_TLB_MISS_STALL", ARMV7_EVENT_82H) \ __PMC_EV_ALIAS("DATA_MAIN_TLB_MISS_STALL", ARMV7_EVENT_83H) \ __PMC_EV_ALIAS("INSTR_MICRO_TLB_MISS_STALL", ARMV7_EVENT_84H) \ __PMC_EV_ALIAS("DATA_MICRO_TLB_MISS_STALL", ARMV7_EVENT_85H) \ __PMC_EV_ALIAS("DMB_STALL", ARMV7_EVENT_86H) \ __PMC_EV_ALIAS("INTEGER_CORE_CLOCK_ENABLED", ARMV7_EVENT_8AH) \ __PMC_EV_ALIAS("DATA_ENGINE_CLOCK_ENABLED", ARMV7_EVENT_8BH) \ __PMC_EV_ALIAS("ISB", ARMV7_EVENT_90H) \ __PMC_EV_ALIAS("DSB", ARMV7_EVENT_91H) \ __PMC_EV_ALIAS("DMB", ARMV7_EVENT_92H) \ __PMC_EV_ALIAS("EXTERNAL_INTERRUPT", ARMV7_EVENT_93H) \ 
__PMC_EV_ALIAS("PLE_CACHE_LINE_REQ_COMPLETED", ARMV7_EVENT_A0H) \ __PMC_EV_ALIAS("PLE_CACHE_LINE_REQ_SKIPPED", ARMV7_EVENT_A1H) \ __PMC_EV_ALIAS("PLE_FIFO_FLUSH", ARMV7_EVENT_A2H) \ __PMC_EV_ALIAS("PLE_REQUEST_COMPLETED", ARMV7_EVENT_A3H) \ __PMC_EV_ALIAS("PLE_FIFO_OVERFLOW", ARMV7_EVENT_A4H) \ __PMC_EV_ALIAS("PLE_REQUEST_PROGRAMMED", ARMV7_EVENT_A5H) /* * ARMv8 Events */ #define __PMC_EV_ARMV8() \ __PMC_EV(ARMV8, EVENT_00H) \ __PMC_EV(ARMV8, EVENT_01H) \ __PMC_EV(ARMV8, EVENT_02H) \ __PMC_EV(ARMV8, EVENT_03H) \ __PMC_EV(ARMV8, EVENT_04H) \ __PMC_EV(ARMV8, EVENT_05H) \ __PMC_EV(ARMV8, EVENT_06H) \ __PMC_EV(ARMV8, EVENT_07H) \ __PMC_EV(ARMV8, EVENT_08H) \ __PMC_EV(ARMV8, EVENT_09H) \ __PMC_EV(ARMV8, EVENT_0AH) \ __PMC_EV(ARMV8, EVENT_0BH) \ __PMC_EV(ARMV8, EVENT_0CH) \ __PMC_EV(ARMV8, EVENT_0DH) \ __PMC_EV(ARMV8, EVENT_0EH) \ __PMC_EV(ARMV8, EVENT_0FH) \ __PMC_EV(ARMV8, EVENT_10H) \ __PMC_EV(ARMV8, EVENT_11H) \ __PMC_EV(ARMV8, EVENT_12H) \ __PMC_EV(ARMV8, EVENT_13H) \ __PMC_EV(ARMV8, EVENT_14H) \ __PMC_EV(ARMV8, EVENT_15H) \ __PMC_EV(ARMV8, EVENT_16H) \ __PMC_EV(ARMV8, EVENT_17H) \ __PMC_EV(ARMV8, EVENT_18H) \ __PMC_EV(ARMV8, EVENT_19H) \ __PMC_EV(ARMV8, EVENT_1AH) \ __PMC_EV(ARMV8, EVENT_1BH) \ __PMC_EV(ARMV8, EVENT_1CH) \ __PMC_EV(ARMV8, EVENT_1DH) \ __PMC_EV(ARMV8, EVENT_1EH) \ __PMC_EV(ARMV8, EVENT_1FH) \ __PMC_EV(ARMV8, EVENT_20H) \ __PMC_EV(ARMV8, EVENT_21H) \ __PMC_EV(ARMV8, EVENT_22H) \ __PMC_EV(ARMV8, EVENT_23H) \ __PMC_EV(ARMV8, EVENT_24H) \ __PMC_EV(ARMV8, EVENT_25H) \ __PMC_EV(ARMV8, EVENT_26H) \ __PMC_EV(ARMV8, EVENT_27H) \ __PMC_EV(ARMV8, EVENT_28H) \ __PMC_EV(ARMV8, EVENT_29H) \ __PMC_EV(ARMV8, EVENT_2AH) \ __PMC_EV(ARMV8, EVENT_2BH) \ __PMC_EV(ARMV8, EVENT_2CH) \ __PMC_EV(ARMV8, EVENT_2DH) \ __PMC_EV(ARMV8, EVENT_2EH) \ __PMC_EV(ARMV8, EVENT_2FH) \ __PMC_EV(ARMV8, EVENT_30H) \ __PMC_EV(ARMV8, EVENT_31H) \ __PMC_EV(ARMV8, EVENT_32H) \ __PMC_EV(ARMV8, EVENT_33H) \ __PMC_EV(ARMV8, EVENT_34H) \ __PMC_EV(ARMV8, EVENT_35H) \ __PMC_EV(ARMV8, EVENT_36H) \ __PMC_EV(ARMV8, EVENT_37H) \ __PMC_EV(ARMV8, EVENT_38H) \ __PMC_EV(ARMV8, EVENT_39H) \ __PMC_EV(ARMV8, EVENT_3AH) \ __PMC_EV(ARMV8, EVENT_3BH) \ __PMC_EV(ARMV8, EVENT_3CH) \ __PMC_EV(ARMV8, EVENT_3DH) \ __PMC_EV(ARMV8, EVENT_3EH) \ __PMC_EV(ARMV8, EVENT_3FH) \ __PMC_EV(ARMV8, EVENT_40H) \ __PMC_EV(ARMV8, EVENT_41H) \ __PMC_EV(ARMV8, EVENT_42H) \ __PMC_EV(ARMV8, EVENT_43H) \ __PMC_EV(ARMV8, EVENT_44H) \ __PMC_EV(ARMV8, EVENT_45H) \ __PMC_EV(ARMV8, EVENT_46H) \ __PMC_EV(ARMV8, EVENT_47H) \ __PMC_EV(ARMV8, EVENT_48H) \ __PMC_EV(ARMV8, EVENT_49H) \ __PMC_EV(ARMV8, EVENT_4AH) \ __PMC_EV(ARMV8, EVENT_4BH) \ __PMC_EV(ARMV8, EVENT_4CH) \ __PMC_EV(ARMV8, EVENT_4DH) \ __PMC_EV(ARMV8, EVENT_4EH) \ __PMC_EV(ARMV8, EVENT_4FH) \ __PMC_EV(ARMV8, EVENT_50H) \ __PMC_EV(ARMV8, EVENT_51H) \ __PMC_EV(ARMV8, EVENT_52H) \ __PMC_EV(ARMV8, EVENT_53H) \ __PMC_EV(ARMV8, EVENT_54H) \ __PMC_EV(ARMV8, EVENT_55H) \ __PMC_EV(ARMV8, EVENT_56H) \ __PMC_EV(ARMV8, EVENT_57H) \ __PMC_EV(ARMV8, EVENT_58H) \ __PMC_EV(ARMV8, EVENT_59H) \ __PMC_EV(ARMV8, EVENT_5AH) \ __PMC_EV(ARMV8, EVENT_5BH) \ __PMC_EV(ARMV8, EVENT_5CH) \ __PMC_EV(ARMV8, EVENT_5DH) \ __PMC_EV(ARMV8, EVENT_5EH) \ __PMC_EV(ARMV8, EVENT_5FH) \ __PMC_EV(ARMV8, EVENT_60H) \ __PMC_EV(ARMV8, EVENT_61H) \ __PMC_EV(ARMV8, EVENT_62H) \ __PMC_EV(ARMV8, EVENT_63H) \ __PMC_EV(ARMV8, EVENT_64H) \ __PMC_EV(ARMV8, EVENT_65H) \ __PMC_EV(ARMV8, EVENT_66H) \ __PMC_EV(ARMV8, EVENT_67H) \ __PMC_EV(ARMV8, EVENT_68H) \ __PMC_EV(ARMV8, EVENT_69H) \ __PMC_EV(ARMV8, EVENT_6AH) \ __PMC_EV(ARMV8, EVENT_6BH) \ __PMC_EV(ARMV8, 
EVENT_6CH) \ __PMC_EV(ARMV8, EVENT_6DH) \ __PMC_EV(ARMV8, EVENT_6EH) \ __PMC_EV(ARMV8, EVENT_6FH) \ __PMC_EV(ARMV8, EVENT_70H) \ __PMC_EV(ARMV8, EVENT_71H) \ __PMC_EV(ARMV8, EVENT_72H) \ __PMC_EV(ARMV8, EVENT_73H) \ __PMC_EV(ARMV8, EVENT_74H) \ __PMC_EV(ARMV8, EVENT_75H) \ __PMC_EV(ARMV8, EVENT_76H) \ __PMC_EV(ARMV8, EVENT_77H) \ __PMC_EV(ARMV8, EVENT_78H) \ __PMC_EV(ARMV8, EVENT_79H) \ __PMC_EV(ARMV8, EVENT_7AH) \ __PMC_EV(ARMV8, EVENT_7BH) \ __PMC_EV(ARMV8, EVENT_7CH) \ __PMC_EV(ARMV8, EVENT_7DH) \ __PMC_EV(ARMV8, EVENT_7EH) \ __PMC_EV(ARMV8, EVENT_7FH) \ __PMC_EV(ARMV8, EVENT_80H) \ __PMC_EV(ARMV8, EVENT_81H) \ __PMC_EV(ARMV8, EVENT_82H) \ __PMC_EV(ARMV8, EVENT_83H) \ __PMC_EV(ARMV8, EVENT_84H) \ __PMC_EV(ARMV8, EVENT_85H) \ __PMC_EV(ARMV8, EVENT_86H) \ __PMC_EV(ARMV8, EVENT_87H) \ __PMC_EV(ARMV8, EVENT_88H) \ __PMC_EV(ARMV8, EVENT_89H) \ __PMC_EV(ARMV8, EVENT_8AH) \ __PMC_EV(ARMV8, EVENT_8BH) \ __PMC_EV(ARMV8, EVENT_8CH) \ __PMC_EV(ARMV8, EVENT_8DH) \ __PMC_EV(ARMV8, EVENT_8EH) \ __PMC_EV(ARMV8, EVENT_8FH) \ __PMC_EV(ARMV8, EVENT_90H) \ __PMC_EV(ARMV8, EVENT_91H) \ __PMC_EV(ARMV8, EVENT_92H) \ __PMC_EV(ARMV8, EVENT_93H) \ __PMC_EV(ARMV8, EVENT_94H) \ __PMC_EV(ARMV8, EVENT_95H) \ __PMC_EV(ARMV8, EVENT_96H) \ __PMC_EV(ARMV8, EVENT_97H) \ __PMC_EV(ARMV8, EVENT_98H) \ __PMC_EV(ARMV8, EVENT_99H) \ __PMC_EV(ARMV8, EVENT_9AH) \ __PMC_EV(ARMV8, EVENT_9BH) \ __PMC_EV(ARMV8, EVENT_9CH) \ __PMC_EV(ARMV8, EVENT_9DH) \ __PMC_EV(ARMV8, EVENT_9EH) \ __PMC_EV(ARMV8, EVENT_9FH) \ __PMC_EV(ARMV8, EVENT_A0H) \ __PMC_EV(ARMV8, EVENT_A1H) \ __PMC_EV(ARMV8, EVENT_A2H) \ __PMC_EV(ARMV8, EVENT_A3H) \ __PMC_EV(ARMV8, EVENT_A4H) \ __PMC_EV(ARMV8, EVENT_A5H) \ __PMC_EV(ARMV8, EVENT_A6H) \ __PMC_EV(ARMV8, EVENT_A7H) \ __PMC_EV(ARMV8, EVENT_A8H) \ __PMC_EV(ARMV8, EVENT_A9H) \ __PMC_EV(ARMV8, EVENT_AAH) \ __PMC_EV(ARMV8, EVENT_ABH) \ __PMC_EV(ARMV8, EVENT_ACH) \ __PMC_EV(ARMV8, EVENT_ADH) \ __PMC_EV(ARMV8, EVENT_AEH) \ __PMC_EV(ARMV8, EVENT_AFH) \ __PMC_EV(ARMV8, EVENT_B0H) \ __PMC_EV(ARMV8, EVENT_B1H) \ __PMC_EV(ARMV8, EVENT_B2H) \ __PMC_EV(ARMV8, EVENT_B3H) \ __PMC_EV(ARMV8, EVENT_B4H) \ __PMC_EV(ARMV8, EVENT_B5H) \ __PMC_EV(ARMV8, EVENT_B6H) \ __PMC_EV(ARMV8, EVENT_B7H) \ __PMC_EV(ARMV8, EVENT_B8H) \ __PMC_EV(ARMV8, EVENT_B9H) \ __PMC_EV(ARMV8, EVENT_BAH) \ __PMC_EV(ARMV8, EVENT_BBH) \ __PMC_EV(ARMV8, EVENT_BCH) \ __PMC_EV(ARMV8, EVENT_BDH) \ __PMC_EV(ARMV8, EVENT_BEH) \ __PMC_EV(ARMV8, EVENT_BFH) \ __PMC_EV(ARMV8, EVENT_C0H) \ __PMC_EV(ARMV8, EVENT_C1H) \ __PMC_EV(ARMV8, EVENT_C2H) \ __PMC_EV(ARMV8, EVENT_C3H) \ __PMC_EV(ARMV8, EVENT_C4H) \ __PMC_EV(ARMV8, EVENT_C5H) \ __PMC_EV(ARMV8, EVENT_C6H) \ __PMC_EV(ARMV8, EVENT_C7H) \ __PMC_EV(ARMV8, EVENT_C8H) \ __PMC_EV(ARMV8, EVENT_C9H) \ __PMC_EV(ARMV8, EVENT_CAH) \ __PMC_EV(ARMV8, EVENT_CBH) \ __PMC_EV(ARMV8, EVENT_CCH) \ __PMC_EV(ARMV8, EVENT_CDH) \ __PMC_EV(ARMV8, EVENT_CEH) \ __PMC_EV(ARMV8, EVENT_CFH) \ __PMC_EV(ARMV8, EVENT_D0H) \ __PMC_EV(ARMV8, EVENT_D1H) \ __PMC_EV(ARMV8, EVENT_D2H) \ __PMC_EV(ARMV8, EVENT_D3H) \ __PMC_EV(ARMV8, EVENT_D4H) \ __PMC_EV(ARMV8, EVENT_D5H) \ __PMC_EV(ARMV8, EVENT_D6H) \ __PMC_EV(ARMV8, EVENT_D7H) \ __PMC_EV(ARMV8, EVENT_D8H) \ __PMC_EV(ARMV8, EVENT_D9H) \ __PMC_EV(ARMV8, EVENT_DAH) \ __PMC_EV(ARMV8, EVENT_DBH) \ __PMC_EV(ARMV8, EVENT_DCH) \ __PMC_EV(ARMV8, EVENT_DDH) \ __PMC_EV(ARMV8, EVENT_DEH) \ __PMC_EV(ARMV8, EVENT_DFH) \ __PMC_EV(ARMV8, EVENT_E0H) \ __PMC_EV(ARMV8, EVENT_E1H) \ __PMC_EV(ARMV8, EVENT_E2H) \ __PMC_EV(ARMV8, EVENT_E3H) \ __PMC_EV(ARMV8, EVENT_E4H) \ __PMC_EV(ARMV8, EVENT_E5H) \ __PMC_EV(ARMV8, EVENT_E6H) \ 
__PMC_EV(ARMV8, EVENT_E7H) \ __PMC_EV(ARMV8, EVENT_E8H) \ __PMC_EV(ARMV8, EVENT_E9H) \ __PMC_EV(ARMV8, EVENT_EAH) \ __PMC_EV(ARMV8, EVENT_EBH) \ __PMC_EV(ARMV8, EVENT_ECH) \ __PMC_EV(ARMV8, EVENT_EDH) \ __PMC_EV(ARMV8, EVENT_EEH) \ __PMC_EV(ARMV8, EVENT_EFH) \ __PMC_EV(ARMV8, EVENT_F0H) \ __PMC_EV(ARMV8, EVENT_F1H) \ __PMC_EV(ARMV8, EVENT_F2H) \ __PMC_EV(ARMV8, EVENT_F3H) \ __PMC_EV(ARMV8, EVENT_F4H) \ __PMC_EV(ARMV8, EVENT_F5H) \ __PMC_EV(ARMV8, EVENT_F6H) \ __PMC_EV(ARMV8, EVENT_F7H) \ __PMC_EV(ARMV8, EVENT_F8H) \ __PMC_EV(ARMV8, EVENT_F9H) \ __PMC_EV(ARMV8, EVENT_FAH) \ __PMC_EV(ARMV8, EVENT_FBH) \ __PMC_EV(ARMV8, EVENT_FCH) \ __PMC_EV(ARMV8, EVENT_FDH) \ __PMC_EV(ARMV8, EVENT_FEH) \ __PMC_EV(ARMV8, EVENT_FFH) #define PMC_EV_ARMV8_FIRST PMC_EV_ARMV8_EVENT_00H #define PMC_EV_ARMV8_LAST PMC_EV_ARMV8_EVENT_FFH #define __PMC_EV_ALIAS_ARMV8_COMMON() \ __PMC_EV_ALIAS("SW_INCR", ARMV8_EVENT_00H) \ __PMC_EV_ALIAS("L1I_CACHE_REFILL", ARMV8_EVENT_01H) \ __PMC_EV_ALIAS("L1I_TLB_REFILL", ARMV8_EVENT_02H) \ __PMC_EV_ALIAS("L1D_CACHE_REFILL", ARMV8_EVENT_03H) \ __PMC_EV_ALIAS("L1D_CACHE", ARMV8_EVENT_04H) \ __PMC_EV_ALIAS("L1D_TLB_REFILL", ARMV8_EVENT_05H) \ __PMC_EV_ALIAS("INST_RETIRED", ARMV8_EVENT_08H) \ __PMC_EV_ALIAS("EXC_TAKEN", ARMV8_EVENT_09H) \ __PMC_EV_ALIAS("EXC_RETURN", ARMV8_EVENT_0AH) \ __PMC_EV_ALIAS("CID_WRITE_RETIRED", ARMV8_EVENT_0BH) \ __PMC_EV_ALIAS("BR_MIS_PRED", ARMV8_EVENT_10H) \ __PMC_EV_ALIAS("CPU_CYCLES", ARMV8_EVENT_11H) \ __PMC_EV_ALIAS("BR_PRED", ARMV8_EVENT_12H) \ __PMC_EV_ALIAS("MEM_ACCESS", ARMV8_EVENT_13H) \ __PMC_EV_ALIAS("L1I_CACHE", ARMV8_EVENT_14H) \ __PMC_EV_ALIAS("L1D_CACHE_WB", ARMV8_EVENT_15H) \ __PMC_EV_ALIAS("L2D_CACHE", ARMV8_EVENT_16H) \ __PMC_EV_ALIAS("L2D_CACHE_REFILL", ARMV8_EVENT_17H) \ __PMC_EV_ALIAS("L2D_CACHE_WB", ARMV8_EVENT_18H) \ __PMC_EV_ALIAS("BUS_ACCESS", ARMV8_EVENT_19H) \ __PMC_EV_ALIAS("MEMORY_ERROR", ARMV8_EVENT_1AH) \ __PMC_EV_ALIAS("BUS_CYCLES", ARMV8_EVENT_1DH) \ __PMC_EV_ALIAS("CHAIN", ARMV8_EVENT_1EH) \ __PMC_EV_ALIAS("BUS_ACCESS_LD", ARMV8_EVENT_60H) \ __PMC_EV_ALIAS("BUS_ACCESS_ST", ARMV8_EVENT_61H) \ __PMC_EV_ALIAS("BR_INDIRECT_SPEC", ARMV8_EVENT_7AH) \ __PMC_EV_ALIAS("EXC_IRQ", ARMV8_EVENT_86H) \ __PMC_EV_ALIAS("EXC_FIQ", ARMV8_EVENT_87H) #define __PMC_EV_ALIAS_ARMV8_CORTEX_A53() \ __PMC_EV_ALIAS_ARMV8_COMMON() \ __PMC_EV_ALIAS("LD_RETIRED", ARMV8_EVENT_06H) \ __PMC_EV_ALIAS("ST_RETIRED", ARMV8_EVENT_07H) \ __PMC_EV_ALIAS("PC_WRITE_RETIRED", ARMV8_EVENT_0CH) \ __PMC_EV_ALIAS("BR_IMMED_RETIRED", ARMV8_EVENT_0DH) \ __PMC_EV_ALIAS("BR_RETURN_RETIRED", ARMV8_EVENT_0EH) \ __PMC_EV_ALIAS("UNALIGNED_LDST_RETIRED",ARMV8_EVENT_0FH) #define __PMC_EV_ALIAS_ARMV8_CORTEX_A57() \ __PMC_EV_ALIAS_ARMV8_COMMON() \ __PMC_EV_ALIAS("INST_SPEC", ARMV8_EVENT_1BH) \ __PMC_EV_ALIAS("TTBR_WRITE_RETIRED", ARMV8_EVENT_1CH) \ __PMC_EV_ALIAS("L1D_CACHE_LD", ARMV8_EVENT_40H) \ __PMC_EV_ALIAS("L1D_CACHE_ST", ARMV8_EVENT_41H) \ __PMC_EV_ALIAS("L1D_CACHE_REFILL_LD", ARMV8_EVENT_42H) \ __PMC_EV_ALIAS("L1D_CACHE_REFILL_ST", ARMV8_EVENT_43H) \ __PMC_EV_ALIAS("L1D_CACHE_WB_VICTIM", ARMV8_EVENT_46H) \ __PMC_EV_ALIAS("L1D_CACHE_WB_CLEAN", ARMV8_EVENT_47H) \ __PMC_EV_ALIAS("L1D_CACHE_INVAL", ARMV8_EVENT_48H) \ __PMC_EV_ALIAS("L1D_TLB_REFILL_LD", ARMV8_EVENT_4CH) \ __PMC_EV_ALIAS("L1D_TLB_REFILL_ST", ARMV8_EVENT_4DH) \ __PMC_EV_ALIAS("L2D_CACHE_LD", ARMV8_EVENT_50H) \ __PMC_EV_ALIAS("L2D_CACHE_ST", ARMV8_EVENT_51H) \ __PMC_EV_ALIAS("L2D_CACHE_REFILL_LD", ARMV8_EVENT_52H) \ __PMC_EV_ALIAS("L2D_CACHE_REFILL_ST", ARMV8_EVENT_53H) \ __PMC_EV_ALIAS("L2D_CACHE_WB_VICTIM", 
ARMV8_EVENT_56H) \ __PMC_EV_ALIAS("L2D_CACHE_WB_CLEAN", ARMV8_EVENT_57H) \ __PMC_EV_ALIAS("L2D_CACHE_INVAL", ARMV8_EVENT_58H) \ __PMC_EV_ALIAS("BUS_ACCESS_SHARED", ARMV8_EVENT_62H) \ __PMC_EV_ALIAS("BUS_ACCESS_NOT_SHARED", ARMV8_EVENT_63H) \ __PMC_EV_ALIAS("BUS_ACCESS_NORMAL", ARMV8_EVENT_64H) \ __PMC_EV_ALIAS("BUS_ACCESS_PERIPH", ARMV8_EVENT_65H) \ __PMC_EV_ALIAS("MEM_ACCESS_LD", ARMV8_EVENT_66H) \ __PMC_EV_ALIAS("MEM_ACCESS_ST", ARMV8_EVENT_67H) \ __PMC_EV_ALIAS("UNALIGNED_LD_SPEC", ARMV8_EVENT_68H) \ __PMC_EV_ALIAS("UNALIGNED_ST_SPEC", ARMV8_EVENT_69H) \ __PMC_EV_ALIAS("UNALIGNED_LDST_SPEC", ARMV8_EVENT_6AH) \ __PMC_EV_ALIAS("LDREX_SPEC", ARMV8_EVENT_6CH) \ __PMC_EV_ALIAS("STREX_PASS_SPEC", ARMV8_EVENT_6DH) \ __PMC_EV_ALIAS("STREX_FAIL_SPEC", ARMV8_EVENT_6EH) \ __PMC_EV_ALIAS("LD_SPEC", ARMV8_EVENT_70H) \ __PMC_EV_ALIAS("ST_SPEC", ARMV8_EVENT_71H) \ __PMC_EV_ALIAS("LDST_SPEC", ARMV8_EVENT_72H) \ __PMC_EV_ALIAS("DP_SPEC", ARMV8_EVENT_73H) \ __PMC_EV_ALIAS("ASE_SPEC", ARMV8_EVENT_74H) \ __PMC_EV_ALIAS("VFP_SPEC", ARMV8_EVENT_75H) \ __PMC_EV_ALIAS("PC_WRITE_SPEC", ARMV8_EVENT_76H) \ __PMC_EV_ALIAS("CRYPTO_SPEC", ARMV8_EVENT_77H) \ __PMC_EV_ALIAS("BR_IMMED_SPEC", ARMV8_EVENT_78H) \ __PMC_EV_ALIAS("BR_RETURN_SPEC", ARMV8_EVENT_79H) \ __PMC_EV_ALIAS("ISB_SPEC", ARMV8_EVENT_7CH) \ __PMC_EV_ALIAS("DSB_SPEC", ARMV8_EVENT_7DH) \ __PMC_EV_ALIAS("DMB_SPEC", ARMV8_EVENT_7EH) \ __PMC_EV_ALIAS("EXC_UNDEF", ARMV8_EVENT_81H) \ __PMC_EV_ALIAS("EXC_SVC", ARMV8_EVENT_82H) \ __PMC_EV_ALIAS("EXC_PABORT", ARMV8_EVENT_83H) \ __PMC_EV_ALIAS("EXC_DABORT", ARMV8_EVENT_84H) \ __PMC_EV_ALIAS("EXC_SMC", ARMV8_EVENT_88H) \ __PMC_EV_ALIAS("EXC_HVC", ARMV8_EVENT_8AH) \ __PMC_EV_ALIAS("EXC_TRAP_PABORT", ARMV8_EVENT_8BH) \ __PMC_EV_ALIAS("EXC_TRAP_DABORT", ARMV8_EVENT_8CH) \ __PMC_EV_ALIAS("EXC_TRAP_OTHER", ARMV8_EVENT_8DH) \ __PMC_EV_ALIAS("EXC_TRAP_IRQ", ARMV8_EVENT_8EH) \ __PMC_EV_ALIAS("EXC_TRAP_FIQ", ARMV8_EVENT_8FH) \ __PMC_EV_ALIAS("RC_LD_SPEC", ARMV8_EVENT_90H) \ __PMC_EV_ALIAS("RC_ST_SPEC", ARMV8_EVENT_91H) /* * MIPS Events from "Programming the MIPS32 24K Core Family", * Document Number: MD00355 Revision 04.63 December 19, 2008 * These events are kept in the order found in Table 7.4. * For counters which are different between the left hand * column (0/2) and the right hand column (1/3) the left * hand is given first, e.g. BRANCH_COMPLETED and BRANCH_MISPRED * in the definition below. 
*/ #define __PMC_EV_MIPS24K() \ __PMC_EV(MIPS24K, CYCLE) \ __PMC_EV(MIPS24K, INSTR_EXECUTED) \ __PMC_EV(MIPS24K, BRANCH_COMPLETED) \ __PMC_EV(MIPS24K, BRANCH_MISPRED) \ __PMC_EV(MIPS24K, RETURN) \ __PMC_EV(MIPS24K, RETURN_MISPRED) \ __PMC_EV(MIPS24K, RETURN_NOT_31) \ __PMC_EV(MIPS24K, RETURN_NOTPRED) \ __PMC_EV(MIPS24K, ITLB_ACCESS) \ __PMC_EV(MIPS24K, ITLB_MISS) \ __PMC_EV(MIPS24K, DTLB_ACCESS) \ __PMC_EV(MIPS24K, DTLB_MISS) \ __PMC_EV(MIPS24K, JTLB_IACCESS) \ __PMC_EV(MIPS24K, JTLB_IMISS) \ __PMC_EV(MIPS24K, JTLB_DACCESS) \ __PMC_EV(MIPS24K, JTLB_DMISS) \ __PMC_EV(MIPS24K, IC_FETCH) \ __PMC_EV(MIPS24K, IC_MISS) \ __PMC_EV(MIPS24K, DC_LOADSTORE) \ __PMC_EV(MIPS24K, DC_WRITEBACK) \ __PMC_EV(MIPS24K, DC_MISS) \ __PMC_EV(MIPS24K, STORE_MISS) \ __PMC_EV(MIPS24K, LOAD_MISS) \ __PMC_EV(MIPS24K, INTEGER_COMPLETED) \ __PMC_EV(MIPS24K, FP_COMPLETED) \ __PMC_EV(MIPS24K, LOAD_COMPLETED) \ __PMC_EV(MIPS24K, STORE_COMPLETED) \ __PMC_EV(MIPS24K, BARRIER_COMPLETED) \ __PMC_EV(MIPS24K, MIPS16_COMPLETED) \ __PMC_EV(MIPS24K, NOP_COMPLETED) \ __PMC_EV(MIPS24K, INTEGER_MULDIV_COMPLETED)\ __PMC_EV(MIPS24K, RF_STALL) \ __PMC_EV(MIPS24K, INSTR_REFETCH) \ __PMC_EV(MIPS24K, STORE_COND_COMPLETED) \ __PMC_EV(MIPS24K, STORE_COND_FAILED) \ __PMC_EV(MIPS24K, ICACHE_REQUESTS) \ __PMC_EV(MIPS24K, ICACHE_HIT) \ __PMC_EV(MIPS24K, L2_WRITEBACK) \ __PMC_EV(MIPS24K, L2_ACCESS) \ __PMC_EV(MIPS24K, L2_MISS) \ __PMC_EV(MIPS24K, L2_ERR_CORRECTED) \ __PMC_EV(MIPS24K, EXCEPTIONS) \ __PMC_EV(MIPS24K, RF_CYCLES_STALLED) \ __PMC_EV(MIPS24K, IFU_CYCLES_STALLED) \ __PMC_EV(MIPS24K, ALU_CYCLES_STALLED) \ __PMC_EV(MIPS24K, UNCACHED_LOAD) \ __PMC_EV(MIPS24K, UNCACHED_STORE) \ __PMC_EV(MIPS24K, CP2_REG_TO_REG_COMPLETED)\ __PMC_EV(MIPS24K, MFTC_COMPLETED) \ __PMC_EV(MIPS24K, IC_BLOCKED_CYCLES) \ __PMC_EV(MIPS24K, DC_BLOCKED_CYCLES) \ __PMC_EV(MIPS24K, L2_IMISS_STALL_CYCLES) \ __PMC_EV(MIPS24K, L2_DMISS_STALL_CYCLES) \ __PMC_EV(MIPS24K, DMISS_CYCLES) \ __PMC_EV(MIPS24K, L2_MISS_CYCLES) \ __PMC_EV(MIPS24K, UNCACHED_BLOCK_CYCLES) \ __PMC_EV(MIPS24K, MDU_STALL_CYCLES) \ __PMC_EV(MIPS24K, FPU_STALL_CYCLES) \ __PMC_EV(MIPS24K, CP2_STALL_CYCLES) \ __PMC_EV(MIPS24K, COREXTEND_STALL_CYCLES) \ __PMC_EV(MIPS24K, ISPRAM_STALL_CYCLES) \ __PMC_EV(MIPS24K, DSPRAM_STALL_CYCLES) \ __PMC_EV(MIPS24K, CACHE_STALL_CYCLES) \ __PMC_EV(MIPS24K, LOAD_TO_USE_STALLS) \ __PMC_EV(MIPS24K, BASE_MISPRED_STALLS) \ __PMC_EV(MIPS24K, CPO_READ_STALLS) \ __PMC_EV(MIPS24K, BRANCH_MISPRED_CYCLES) \ __PMC_EV(MIPS24K, IFETCH_BUFFER_FULL) \ __PMC_EV(MIPS24K, FETCH_BUFFER_ALLOCATED) \ __PMC_EV(MIPS24K, EJTAG_ITRIGGER) \ __PMC_EV(MIPS24K, EJTAG_DTRIGGER) \ __PMC_EV(MIPS24K, FSB_LT_QUARTER) \ __PMC_EV(MIPS24K, FSB_QUARTER_TO_HALF) \ __PMC_EV(MIPS24K, FSB_GT_HALF) \ __PMC_EV(MIPS24K, FSB_FULL_PIPELINE_STALLS)\ __PMC_EV(MIPS24K, LDQ_LT_QUARTER) \ __PMC_EV(MIPS24K, LDQ_QUARTER_TO_HALF) \ __PMC_EV(MIPS24K, LDQ_GT_HALF) \ __PMC_EV(MIPS24K, LDQ_FULL_PIPELINE_STALLS)\ __PMC_EV(MIPS24K, WBB_LT_QUARTER) \ __PMC_EV(MIPS24K, WBB_QUARTER_TO_HALF) \ __PMC_EV(MIPS24K, WBB_GT_HALF) \ __PMC_EV(MIPS24K, WBB_FULL_PIPELINE_STALLS) \ __PMC_EV(MIPS24K, REQUEST_LATENCY) \ __PMC_EV(MIPS24K, REQUEST_COUNT) #define PMC_EV_MIPS24K_FIRST PMC_EV_MIPS24K_CYCLE #define PMC_EV_MIPS24K_LAST PMC_EV_MIPS24K_WBB_FULL_PIPELINE_STALLS /* * MIPS74k events. Similar to MIPS24k, the arrangement * is (0,2) then (1,3) events. 
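 * That is, as with the 24K list above, an event counted on counters 0 and 2
 * is listed immediately before its counterpart counted on counters 1 and 3.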
*/ #define __PMC_EV_MIPS74K() \ __PMC_EV(MIPS74K, CYCLES) \ __PMC_EV(MIPS74K, INSTR_EXECUTED) \ __PMC_EV(MIPS74K, PREDICTED_JR_31) \ __PMC_EV(MIPS74K, JR_31_MISPREDICTIONS) \ __PMC_EV(MIPS74K, REDIRECT_STALLS) \ __PMC_EV(MIPS74K, JR_31_NO_PREDICTIONS) \ __PMC_EV(MIPS74K, ITLB_ACCESSES) \ __PMC_EV(MIPS74K, ITLB_MISSES) \ __PMC_EV(MIPS74K, JTLB_INSN_MISSES) \ __PMC_EV(MIPS74K, ICACHE_ACCESSES) \ __PMC_EV(MIPS74K, ICACHE_MISSES) \ __PMC_EV(MIPS74K, ICACHE_MISS_STALLS) \ __PMC_EV(MIPS74K, UNCACHED_IFETCH_STALLS) \ __PMC_EV(MIPS74K, PDTRACE_BACK_STALLS) \ __PMC_EV(MIPS74K, IFU_REPLAYS) \ __PMC_EV(MIPS74K, KILLED_FETCH_SLOTS) \ __PMC_EV(MIPS74K, IFU_IDU_MISS_PRED_UPSTREAM_CYCLES) \ __PMC_EV(MIPS74K, IFU_IDU_NO_FETCH_CYCLES) \ __PMC_EV(MIPS74K, IFU_IDU_CLOGED_DOWNSTREAM_CYCLES) \ __PMC_EV(MIPS74K, DDQ0_FULL_DR_STALLS) \ __PMC_EV(MIPS74K, DDQ1_FULL_DR_STALLS) \ __PMC_EV(MIPS74K, ALCB_FULL_DR_STALLS) \ __PMC_EV(MIPS74K, AGCB_FULL_DR_STALLS) \ __PMC_EV(MIPS74K, CLDQ_FULL_DR_STALLS) \ __PMC_EV(MIPS74K, IODQ_FULL_DR_STALLS) \ __PMC_EV(MIPS74K, ALU_EMPTY_CYCLES) \ __PMC_EV(MIPS74K, AGEN_EMPTY_CYCLES) \ __PMC_EV(MIPS74K, ALU_OPERANDS_NOT_READY_CYCLES) \ __PMC_EV(MIPS74K, AGEN_OPERANDS_NOT_READY_CYCLES) \ __PMC_EV(MIPS74K, ALU_NO_ISSUES_CYCLES) \ __PMC_EV(MIPS74K, AGEN_NO_ISSUES_CYCLES) \ __PMC_EV(MIPS74K, ALU_BUBBLE_CYCLES) \ __PMC_EV(MIPS74K, AGEN_BUBBLE_CYCLES) \ __PMC_EV(MIPS74K, SINGLE_ISSUE_CYCLES) \ __PMC_EV(MIPS74K, DUAL_ISSUE_CYCLES) \ __PMC_EV(MIPS74K, OOO_ALU_ISSUE_CYCLES) \ __PMC_EV(MIPS74K, OOO_AGEN_ISSUE_CYCLES) \ __PMC_EV(MIPS74K, JALR_JALR_HB_INSNS) \ __PMC_EV(MIPS74K, DCACHE_LINE_REFILL_REQUESTS) \ __PMC_EV(MIPS74K, DCACHE_LOAD_ACCESSES) \ __PMC_EV(MIPS74K, DCACHE_ACCESSES) \ __PMC_EV(MIPS74K, DCACHE_WRITEBACKS) \ __PMC_EV(MIPS74K, DCACHE_MISSES) \ __PMC_EV(MIPS74K, JTLB_DATA_ACCESSES) \ __PMC_EV(MIPS74K, JTLB_DATA_MISSES) \ __PMC_EV(MIPS74K, LOAD_STORE_REPLAYS) \ __PMC_EV(MIPS74K, VA_TRANSALTION_CORNER_CASES) \ __PMC_EV(MIPS74K, LOAD_STORE_BLOCKED_CYCLES) \ __PMC_EV(MIPS74K, LOAD_STORE_NO_FILL_REQUESTS) \ __PMC_EV(MIPS74K, L2_CACHE_WRITEBACKS) \ __PMC_EV(MIPS74K, L2_CACHE_ACCESSES) \ __PMC_EV(MIPS74K, L2_CACHE_MISSES) \ __PMC_EV(MIPS74K, L2_CACHE_MISS_CYCLES) \ __PMC_EV(MIPS74K, FSB_FULL_STALLS) \ __PMC_EV(MIPS74K, FSB_OVER_50_FULL) \ __PMC_EV(MIPS74K, LDQ_FULL_STALLS) \ __PMC_EV(MIPS74K, LDQ_OVER_50_FULL) \ __PMC_EV(MIPS74K, WBB_FULL_STALLS) \ __PMC_EV(MIPS74K, WBB_OVER_50_FULL) \ __PMC_EV(MIPS74K, LOAD_MISS_CONSUMER_REPLAYS) \ __PMC_EV(MIPS74K, CP1_CP2_LOAD_INSNS) \ __PMC_EV(MIPS74K, JR_NON_31_INSNS) \ __PMC_EV(MIPS74K, MISPREDICTED_JR_31_INSNS) \ __PMC_EV(MIPS74K, BRANCH_INSNS) \ __PMC_EV(MIPS74K, CP1_CP2_COND_BRANCH_INSNS) \ __PMC_EV(MIPS74K, BRANCH_LIKELY_INSNS) \ __PMC_EV(MIPS74K, MISPREDICTED_BRANCH_LIKELY_INSNS) \ __PMC_EV(MIPS74K, COND_BRANCH_INSNS) \ __PMC_EV(MIPS74K, MISPREDICTED_BRANCH_INSNS) \ __PMC_EV(MIPS74K, INTEGER_INSNS) \ __PMC_EV(MIPS74K, FPU_INSNS) \ __PMC_EV(MIPS74K, LOAD_INSNS) \ __PMC_EV(MIPS74K, STORE_INSNS) \ __PMC_EV(MIPS74K, J_JAL_INSNS) \ __PMC_EV(MIPS74K, MIPS16_INSNS) \ __PMC_EV(MIPS74K, NOP_INSNS) \ __PMC_EV(MIPS74K, NT_MUL_DIV_INSNS) \ __PMC_EV(MIPS74K, DSP_INSNS) \ __PMC_EV(MIPS74K, ALU_DSP_SATURATION_INSNS) \ __PMC_EV(MIPS74K, DSP_BRANCH_INSNS) \ __PMC_EV(MIPS74K, MDU_DSP_SATURATION_INSNS) \ __PMC_EV(MIPS74K, UNCACHED_LOAD_INSNS) \ __PMC_EV(MIPS74K, UNCACHED_STORE_INSNS) \ __PMC_EV(MIPS74K, EJTAG_INSN_TRIGGERS) \ __PMC_EV(MIPS74K, CP1_BRANCH_MISPREDICTIONS) \ __PMC_EV(MIPS74K, SC_INSNS) \ __PMC_EV(MIPS74K, FAILED_SC_INSNS) \ __PMC_EV(MIPS74K, 
PREFETCH_INSNS) \ __PMC_EV(MIPS74K, CACHE_HIT_PREFETCH_INSNS) \ __PMC_EV(MIPS74K, NO_INSN_CYCLES) \ __PMC_EV(MIPS74K, LOAD_MISS_INSNS) \ __PMC_EV(MIPS74K, ONE_INSN_CYCLES) \ __PMC_EV(MIPS74K, TWO_INSNS_CYCLES) \ __PMC_EV(MIPS74K, GFIFO_BLOCKED_CYCLES) \ __PMC_EV(MIPS74K, CP1_CP2_STORE_INSNS) \ __PMC_EV(MIPS74K, MISPREDICTION_STALLS) \ __PMC_EV(MIPS74K, MISPREDICTED_BRANCH_INSNS_CYCLES) \ __PMC_EV(MIPS74K, EXCEPTIONS_TAKEN) \ __PMC_EV(MIPS74K, GRADUATION_REPLAYS) \ __PMC_EV(MIPS74K, COREEXTEND_EVENTS) \ __PMC_EV(MIPS74K, ISPRAM_EVENTS) \ __PMC_EV(MIPS74K, DSPRAM_EVENTS) \ __PMC_EV(MIPS74K, L2_CACHE_SINGLE_BIT_ERRORS) \ __PMC_EV(MIPS74K, SYSTEM_EVENT_0) \ __PMC_EV(MIPS74K, SYSTEM_EVENT_1) \ __PMC_EV(MIPS74K, SYSTEM_EVENT_2) \ __PMC_EV(MIPS74K, SYSTEM_EVENT_3) \ __PMC_EV(MIPS74K, SYSTEM_EVENT_4) \ __PMC_EV(MIPS74K, SYSTEM_EVENT_5) \ __PMC_EV(MIPS74K, SYSTEM_EVENT_6) \ __PMC_EV(MIPS74K, SYSTEM_EVENT_7) \ __PMC_EV(MIPS74K, OCP_ALL_REQUESTS) \ __PMC_EV(MIPS74K, OCP_ALL_CACHEABLE_REQUESTS) \ __PMC_EV(MIPS74K, OCP_READ_REQUESTS) \ __PMC_EV(MIPS74K, OCP_READ_CACHEABLE_REQUESTS) \ __PMC_EV(MIPS74K, OCP_WRITE_REQUESTS) \ __PMC_EV(MIPS74K, OCP_WRITE_CACHEABLE_REQUESTS) \ __PMC_EV(MIPS74K, FSB_LESS_25_FULL) \ __PMC_EV(MIPS74K, FSB_25_50_FULL) \ __PMC_EV(MIPS74K, LDQ_LESS_25_FULL) \ __PMC_EV(MIPS74K, LDQ_25_50_FULL) \ __PMC_EV(MIPS74K, WBB_LESS_25_FULL) \ __PMC_EV(MIPS74K, WBB_25_50_FULL) #define PMC_EV_MIPS74K_FIRST PMC_EV_MIPS74K_CYCLES #define PMC_EV_MIPS74K_LAST PMC_EV_MIPS74K_WBB_25_50_FULL +#define __PMC_EV_BERI() \ + __PMC_EV(BERI, CYCLE) \ + __PMC_EV(BERI, INST) \ + __PMC_EV(BERI, INST_USER) \ + __PMC_EV(BERI, INST_KERNEL) \ + __PMC_EV(BERI, IMPRECISE_SETBOUNDS) \ + __PMC_EV(BERI, UNREPRESENTABLE_CAPS) \ + __PMC_EV(BERI, ITLB_MISS) \ + __PMC_EV(BERI, DTLB_MISS) \ + __PMC_EV(BERI, ICACHE_WRITE_HIT) \ + __PMC_EV(BERI, ICACHE_WRITE_MISS) \ + __PMC_EV(BERI, ICACHE_READ_HIT) \ + __PMC_EV(BERI, ICACHE_READ_MISS) \ + __PMC_EV(BERI, ICACHE_EVICT) \ + __PMC_EV(BERI, DCACHE_WRITE_HIT) \ + __PMC_EV(BERI, DCACHE_WRITE_MISS) \ + __PMC_EV(BERI, DCACHE_READ_HIT) \ + __PMC_EV(BERI, DCACHE_READ_MISS) \ + __PMC_EV(BERI, DCACHE_EVICT) \ + __PMC_EV(BERI, DCACHE_SET_TAG_WRITE) \ + __PMC_EV(BERI, DCACHE_SET_TAG_READ) \ + __PMC_EV(BERI, L2CACHE_WRITE_HIT) \ + __PMC_EV(BERI, L2CACHE_WRITE_MISS) \ + __PMC_EV(BERI, L2CACHE_READ_HIT) \ + __PMC_EV(BERI, L2CACHE_READ_MISS) \ + __PMC_EV(BERI, L2CACHE_EVICT) \ + __PMC_EV(BERI, L2CACHE_SET_TAG_WRITE) \ + __PMC_EV(BERI, L2CACHE_SET_TAG_READ) \ + __PMC_EV(BERI, MEM_BYTE_READ) \ + __PMC_EV(BERI, MEM_BYTE_WRITE) \ + __PMC_EV(BERI, MEM_HWORD_READ) \ + __PMC_EV(BERI, MEM_HWORD_WRITE) \ + __PMC_EV(BERI, MEM_WORD_READ) \ + __PMC_EV(BERI, MEM_WORD_WRITE) \ + __PMC_EV(BERI, MEM_DWORD_READ) \ + __PMC_EV(BERI, MEM_DWORD_WRITE) \ + __PMC_EV(BERI, MEM_CAP_READ) \ + __PMC_EV(BERI, MEM_CAP_WRITE) \ + __PMC_EV(BERI, MEM_CAP_READ_TAG_SET) \ + __PMC_EV(BERI, MEM_CAP_WRITE_TAG_SET) \ + __PMC_EV(BERI, TAGCACHE_WRITE_HIT) \ + __PMC_EV(BERI, TAGCACHE_WRITE_MISS) \ + __PMC_EV(BERI, TAGCACHE_READ_HIT) \ + __PMC_EV(BERI, TAGCACHE_READ_MISS) \ + __PMC_EV(BERI, TAGCACHE_EVICT) \ + __PMC_EV(BERI, L2CACHEMASTER_READ_REQ) \ + __PMC_EV(BERI, L2CACHEMASTER_WRITE_REQ) \ + __PMC_EV(BERI, L2CACHEMASTER_WRITE_REQ_FLIT) \ + __PMC_EV(BERI, L2CACHEMASTER_READ_RSP) \ + __PMC_EV(BERI, L2CACHEMASTER_READ_RSP_FLIT) \ + __PMC_EV(BERI, L2CACHEMASTER_WRITE_RSP) \ + __PMC_EV(BERI, TAGCACHEMASTER_READ_REQ) \ + __PMC_EV(BERI, TAGCACHEMASTER_WRITE_REQ) \ + __PMC_EV(BERI, TAGCACHEMASTER_WRITE_REQ_FLIT) \ + __PMC_EV(BERI, 
TAGCACHEMASTER_READ_RSP) \ + __PMC_EV(BERI, TAGCACHEMASTER_READ_RSP_FLIT) \ + __PMC_EV(BERI, TAGCACHEMASTER_WRITE_RSP) + +#define PMC_EV_BERI_FIRST PMC_EV_BERI_CYCLE +#define PMC_EV_BERI_LAST PMC_EV_BERI_TAGCACHEMASTER_WRITE_RSP + /* * Cavium Octeon counters. Obtained from cvmx-core.h */ #define __PMC_EV_OCTEON() \ __PMC_EV(OCTEON, CLK) \ __PMC_EV(OCTEON, ISSUE) \ __PMC_EV(OCTEON, RET) \ __PMC_EV(OCTEON, NISSUE) \ __PMC_EV(OCTEON, SISSUE) \ __PMC_EV(OCTEON, DISSUE) \ __PMC_EV(OCTEON, IFI) \ __PMC_EV(OCTEON, BR) \ __PMC_EV(OCTEON, BRMIS) \ __PMC_EV(OCTEON, J) \ __PMC_EV(OCTEON, JMIS) \ __PMC_EV(OCTEON, REPLAY) \ __PMC_EV(OCTEON, IUNA) \ __PMC_EV(OCTEON, TRAP) \ __PMC_EV(OCTEON, UULOAD) \ __PMC_EV(OCTEON, UUSTORE) \ __PMC_EV(OCTEON, ULOAD) \ __PMC_EV(OCTEON, USTORE) \ __PMC_EV(OCTEON, EC) \ __PMC_EV(OCTEON, MC) \ __PMC_EV(OCTEON, CC) \ __PMC_EV(OCTEON, CSRC) \ __PMC_EV(OCTEON, CFETCH) \ __PMC_EV(OCTEON, CPREF) \ __PMC_EV(OCTEON, ICA) \ __PMC_EV(OCTEON, II) \ __PMC_EV(OCTEON, IP) \ __PMC_EV(OCTEON, CIMISS) \ __PMC_EV(OCTEON, WBUF) \ __PMC_EV(OCTEON, WDAT) \ __PMC_EV(OCTEON, WBUFLD) \ __PMC_EV(OCTEON, WBUFFL) \ __PMC_EV(OCTEON, WBUFTR) \ __PMC_EV(OCTEON, BADD) \ __PMC_EV(OCTEON, BADDL2) \ __PMC_EV(OCTEON, BFILL) \ __PMC_EV(OCTEON, DDIDS) \ __PMC_EV(OCTEON, IDIDS) \ __PMC_EV(OCTEON, DIDNA) \ __PMC_EV(OCTEON, LDS) \ __PMC_EV(OCTEON, LMLDS) \ __PMC_EV(OCTEON, IOLDS) \ __PMC_EV(OCTEON, DMLDS) \ __PMC_EV(OCTEON, STS) \ __PMC_EV(OCTEON, LMSTS) \ __PMC_EV(OCTEON, IOSTS) \ __PMC_EV(OCTEON, IOBDMA) \ __PMC_EV(OCTEON, DTLB) \ __PMC_EV(OCTEON, DTLBAD) \ __PMC_EV(OCTEON, ITLB) \ __PMC_EV(OCTEON, SYNC) \ __PMC_EV(OCTEON, SYNCIOB) \ __PMC_EV(OCTEON, SYNCW) #define PMC_EV_OCTEON_FIRST PMC_EV_OCTEON_CLK #define PMC_EV_OCTEON_LAST PMC_EV_OCTEON_SYNCW #define __PMC_EV_PPC7450() \ __PMC_EV(PPC7450, CYCLE) \ __PMC_EV(PPC7450, INSTR_COMPLETED) \ __PMC_EV(PPC7450, TLB_BIT_TRANSITIONS) \ __PMC_EV(PPC7450, INSTR_DISPATCHED) \ __PMC_EV(PPC7450, PMON_EXCEPT) \ __PMC_EV(PPC7450, PMON_SIG) \ __PMC_EV(PPC7450, VPU_INSTR_COMPLETED) \ __PMC_EV(PPC7450, VFPU_INSTR_COMPLETED) \ __PMC_EV(PPC7450, VIU1_INSTR_COMPLETED) \ __PMC_EV(PPC7450, VIU2_INSTR_COMPLETED) \ __PMC_EV(PPC7450, MTVSCR_INSTR_COMPLETED) \ __PMC_EV(PPC7450, MTVRSAVE_INSTR_COMPLETED) \ __PMC_EV(PPC7450, VPU_INSTR_WAIT_CYCLES) \ __PMC_EV(PPC7450, VFPU_INSTR_WAIT_CYCLES) \ __PMC_EV(PPC7450, VIU1_INSTR_WAIT_CYCLES) \ __PMC_EV(PPC7450, VIU2_INSTR_WAIT_CYCLES) \ __PMC_EV(PPC7450, MFVSCR_SYNC_CYCLES) \ __PMC_EV(PPC7450, VSCR_SAT_SET) \ __PMC_EV(PPC7450, STORE_INSTR_COMPLETED) \ __PMC_EV(PPC7450, L1_INSTR_CACHE_MISSES) \ __PMC_EV(PPC7450, L1_DATA_SNOOPS) \ __PMC_EV(PPC7450, UNRESOLVED_BRANCHES) \ __PMC_EV(PPC7450, SPEC_BUFFER_CYCLES) \ __PMC_EV(PPC7450, BRANCH_UNIT_STALL_CYCLES) \ __PMC_EV(PPC7450, TRUE_BRANCH_TARGET_HITS) \ __PMC_EV(PPC7450, BRANCH_LINK_STAC_PREDICTED) \ __PMC_EV(PPC7450, GPR_ISSUE_QUEUE_DISPATCHES) \ __PMC_EV(PPC7450, CYCLES_THREE_INSTR_DISPATCHED) \ __PMC_EV(PPC7450, THRESHOLD_INSTR_QUEUE_ENTRIES_CYCLES) \ __PMC_EV(PPC7450, THRESHOLD_VEC_INSTR_QUEUE_ENTRIES_CYCLES) \ __PMC_EV(PPC7450, CYCLES_NO_COMPLETED_INSTRS) \ __PMC_EV(PPC7450, IU2_INSTR_COMPLETED) \ __PMC_EV(PPC7450, BRANCHES_COMPLETED) \ __PMC_EV(PPC7450, EIEIO_INSTR_COMPLETED) \ __PMC_EV(PPC7450, MTSPR_INSTR_COMPLETED) \ __PMC_EV(PPC7450, SC_INSTR_COMPLETED) \ __PMC_EV(PPC7450, LS_LM_COMPLETED) \ __PMC_EV(PPC7450, ITLB_HW_TABLE_SEARCH_CYCLES) \ __PMC_EV(PPC7450, DTLB_HW_SEARCH_CYCLES_OVER_THRESHOLD) \ __PMC_EV(PPC7450, L1_INSTR_CACHE_ACCESSES) \ __PMC_EV(PPC7450, INSTR_BKPT_MATCHES) \ 
__PMC_EV(PPC7450, L1_DATA_CACHE_LOAD_MISS_CYCLES_OVER_THRESHOLD)\ __PMC_EV(PPC7450, L1_DATA_SNOOP_HIT_ON_MODIFIED) \ __PMC_EV(PPC7450, LOAD_MISS_ALIAS) \ __PMC_EV(PPC7450, LOAD_MISS_ALIAS_ON_TOUCH) \ __PMC_EV(PPC7450, TOUCH_ALIAS) \ __PMC_EV(PPC7450, L1_DATA_SNOOP_HIT_CASTOUT_QUEUE) \ __PMC_EV(PPC7450, L1_DATA_SNOOP_HIT_CASTOUT) \ __PMC_EV(PPC7450, L1_DATA_SNOOP_HITS) \ __PMC_EV(PPC7450, WRITE_THROUGH_STORES) \ __PMC_EV(PPC7450, CACHE_INHIBITED_STORES) \ __PMC_EV(PPC7450, L1_DATA_LOAD_HIT) \ __PMC_EV(PPC7450, L1_DATA_TOUCH_HIT) \ __PMC_EV(PPC7450, L1_DATA_STORE_HIT) \ __PMC_EV(PPC7450, L1_DATA_TOTAL_HITS) \ __PMC_EV(PPC7450, DST_INSTR_DISPATCHED) \ __PMC_EV(PPC7450, REFRESHED_DSTS) \ __PMC_EV(PPC7450, SUCCESSFUL_DST_TABLE_SEARCHES) \ __PMC_EV(PPC7450, DSS_INSTR_COMPLETED) \ __PMC_EV(PPC7450, DST_STREAM_0_CACHE_LINE_FETCHES) \ __PMC_EV(PPC7450, VTQ_SUSPENDS_DUE_TO_CTX_CHANGE) \ __PMC_EV(PPC7450, VTQ_LINE_FETCH_HIT) \ __PMC_EV(PPC7450, VEC_LOAD_INSTR_COMPLETED) \ __PMC_EV(PPC7450, FP_STORE_INSTR_COMPLETED_IN_LSU) \ __PMC_EV(PPC7450, FPU_RENORMALIZATION) \ __PMC_EV(PPC7450, FPU_DENORMALIZATION) \ __PMC_EV(PPC7450, FP_STORE_CAUSES_STALL_IN_LSU) \ __PMC_EV(PPC7450, LD_ST_TRUE_ALIAS_STALL) \ __PMC_EV(PPC7450, LSU_INDEXED_ALIAS_STALL) \ __PMC_EV(PPC7450, LSU_ALIAS_VS_FSQ_WB0_WB1) \ __PMC_EV(PPC7450, LSU_ALIAS_VS_CSQ) \ __PMC_EV(PPC7450, LSU_LOAD_HIT_LINE_ALIAS_VS_CSQ0) \ __PMC_EV(PPC7450, LSU_LOAD_MISS_LINE_ALIAS_VS_CSQ0) \ __PMC_EV(PPC7450, LSU_TOUCH_LINE_ALIAS_VS_FSQ_WB0_WB1) \ __PMC_EV(PPC7450, LSU_TOUCH_ALIAS_VS_CSQ) \ __PMC_EV(PPC7450, LSU_LMQ_FULL_STALL) \ __PMC_EV(PPC7450, FP_LOAD_INSTR_COMPLETED_IN_LSU) \ __PMC_EV(PPC7450, FP_LOAD_SINGLE_INSTR_COMPLETED_IN_LSU) \ __PMC_EV(PPC7450, FP_LOAD_DOUBLE_COMPLETED_IN_LSU) \ __PMC_EV(PPC7450, LSU_RA_LATCH_STALL) \ __PMC_EV(PPC7450, LSU_LOAD_VS_STORE_QUEUE_ALIAS_STALL) \ __PMC_EV(PPC7450, LSU_LMQ_INDEX_ALIAS) \ __PMC_EV(PPC7450, LSU_STORE_QUEUE_INDEX_ALIAS) \ __PMC_EV(PPC7450, LSU_CSQ_FORWARDING) \ __PMC_EV(PPC7450, LSU_MISALIGNED_LOAD_FINISH) \ __PMC_EV(PPC7450, LSU_MISALIGN_STORE_COMPLETED) \ __PMC_EV(PPC7450, LSU_MISALIGN_STALL) \ __PMC_EV(PPC7450, FP_ONE_QUARTER_FPSCR_RENAMES_BUSY) \ __PMC_EV(PPC7450, FP_ONE_HALF_FPSCR_RENAMES_BUSY) \ __PMC_EV(PPC7450, FP_THREE_QUARTERS_FPSCR_RENAMES_BUSY) \ __PMC_EV(PPC7450, FP_ALL_FPSCR_RENAMES_BUSY) \ __PMC_EV(PPC7450, FP_DENORMALIZED_RESULT) \ __PMC_EV(PPC7450, L1_DATA_TOTAL_MISSES) \ __PMC_EV(PPC7450, DISPATCHES_TO_FPR_ISSUE_QUEUE) \ __PMC_EV(PPC7450, LSU_INSTR_COMPLETED) \ __PMC_EV(PPC7450, LOAD_INSTR_COMPLETED) \ __PMC_EV(PPC7450, SS_SM_INSTR_COMPLETED) \ __PMC_EV(PPC7450, TLBIE_INSTR_COMPLETED) \ __PMC_EV(PPC7450, LWARX_INSTR_COMPLETED) \ __PMC_EV(PPC7450, MFSPR_INSTR_COMPLETED) \ __PMC_EV(PPC7450, REFETCH_SERIALIZATION) \ __PMC_EV(PPC7450, COMPLETION_QUEUE_ENTRIES_OVER_THRESHOLD) \ __PMC_EV(PPC7450, CYCLES_ONE_INSTR_DISPATCHED) \ __PMC_EV(PPC7450, CYCLES_TWO_INSTR_COMPLETED) \ __PMC_EV(PPC7450, ITLB_NON_SPECULATIVE_MISSES) \ __PMC_EV(PPC7450, CYCLES_WAITING_FROM_L1_INSTR_CACHE_MISS) \ __PMC_EV(PPC7450, L1_DATA_LOAD_ACCESS_MISS) \ __PMC_EV(PPC7450, L1_DATA_TOUCH_MISS) \ __PMC_EV(PPC7450, L1_DATA_STORE_MISS) \ __PMC_EV(PPC7450, L1_DATA_TOUCH_MISS_CYCLES) \ __PMC_EV(PPC7450, L1_DATA_CYCLES_USED) \ __PMC_EV(PPC7450, DST_STREAM_1_CACHE_LINE_FETCHES) \ __PMC_EV(PPC7450, VTQ_STREAM_CANCELED_PREMATURELY) \ __PMC_EV(PPC7450, VTQ_RESUMES_DUE_TO_CTX_CHANGE) \ __PMC_EV(PPC7450, VTQ_LINE_FETCH_MISS) \ __PMC_EV(PPC7450, VTQ_LINE_FETCH) \ __PMC_EV(PPC7450, TLBIE_SNOOPS) \ __PMC_EV(PPC7450, L1_INSTR_CACHE_RELOADS) \ 
__PMC_EV(PPC7450, L1_DATA_CACHE_RELOADS) \ __PMC_EV(PPC7450, L1_DATA_CACHE_CASTOUTS_TO_L2) \ __PMC_EV(PPC7450, STORE_MERGE_GATHER) \ __PMC_EV(PPC7450, CACHEABLE_STORE_MERGE_TO_32_BYTES) \ __PMC_EV(PPC7450, DATA_BKPT_MATCHES) \ __PMC_EV(PPC7450, FALL_THROUGH_BRANCHES_PROCESSED) \ __PMC_EV(PPC7450, \ FIRST_SPECULATIVE_BRANCH_BUFFER_RESOLVED_CORRECTLY) \ __PMC_EV(PPC7450, SECOND_SPECULATION_BUFFER_ACTIVE) \ __PMC_EV(PPC7450, BPU_STALL_ON_LR_DEPENDENCY) \ __PMC_EV(PPC7450, BTIC_MISS) \ __PMC_EV(PPC7450, BRANCH_LINK_STACK_CORRECTLY_RESOLVED) \ __PMC_EV(PPC7450, FPR_ISSUE_STALLED) \ __PMC_EV(PPC7450, SWITCHES_BETWEEN_PRIV_USER) \ __PMC_EV(PPC7450, LSU_COMPLETES_FP_STORE_SINGLE) \ __PMC_EV(PPC7450, VR_ISSUE_QUEUE_DISPATCHES) \ __PMC_EV(PPC7450, VR_STALLS) \ __PMC_EV(PPC7450, GPR_RENAME_BUFFER_ENTRIES_OVER_THRESHOLD) \ __PMC_EV(PPC7450, FPR_ISSUE_QUEUE_ENTRIES) \ __PMC_EV(PPC7450, FPU_INSTR_COMPLETED) \ __PMC_EV(PPC7450, STWCX_INSTR_COMPLETED) \ __PMC_EV(PPC7450, LS_LM_INSTR_PIECES) \ __PMC_EV(PPC7450, ITLB_HW_SEARCH_CYCLES_OVER_THRESHOLD) \ __PMC_EV(PPC7450, DTLB_MISSES) \ __PMC_EV(PPC7450, CANCELLED_L1_INSTR_CACHE_MISSES) \ __PMC_EV(PPC7450, L1_DATA_CACHE_OP_HIT) \ __PMC_EV(PPC7450, L1_DATA_LOAD_MISS_CYCLES) \ __PMC_EV(PPC7450, L1_DATA_PUSHES) \ __PMC_EV(PPC7450, L1_DATA_TOTAL_MISS) \ __PMC_EV(PPC7450, VT2_FETCHES) \ __PMC_EV(PPC7450, TAKEN_BRANCHES_PROCESSED) \ __PMC_EV(PPC7450, BRANCH_FLUSHES) \ __PMC_EV(PPC7450, \ SECOND_SPECULATIVE_BRANCH_BUFFER_RESOLVED_CORRECTLY) \ __PMC_EV(PPC7450, THIRD_SPECULATION_BUFFER_ACTIVE) \ __PMC_EV(PPC7450, BRANCH_UNIT_STALL_ON_CTR_DEPENDENCY) \ __PMC_EV(PPC7450, FAST_BTIC_HIT) \ __PMC_EV(PPC7450, BRANCH_LINK_STACK_MISPREDICTED) \ __PMC_EV(PPC7450, CYCLES_THREE_INSTR_COMPLETED) \ __PMC_EV(PPC7450, CYCLES_NO_INSTR_DISPATCHED) \ __PMC_EV(PPC7450, GPR_ISSUE_QUEUE_ENTRIES_OVER_THRESHOLD) \ __PMC_EV(PPC7450, GPR_ISSUE_QUEUE_STALLED) \ __PMC_EV(PPC7450, IU1_INSTR_COMPLETED) \ __PMC_EV(PPC7450, DSSALL_INSTR_COMPLETED) \ __PMC_EV(PPC7450, TLBSYNC_INSTR_COMPLETED) \ __PMC_EV(PPC7450, SYNC_INSTR_COMPLETED) \ __PMC_EV(PPC7450, SS_SM_INSTR_PIECES) \ __PMC_EV(PPC7450, DTLB_HW_SEARCH_CYCLES) \ __PMC_EV(PPC7450, SNOOP_RETRIES) \ __PMC_EV(PPC7450, SUCCESSFUL_STWCX) \ __PMC_EV(PPC7450, DST_STREAM_3_CACHE_LINE_FETCHES) \ __PMC_EV(PPC7450, \ THIRD_SPECULATIVE_BRANCH_BUFFER_RESOLVED_CORRECTLY) \ __PMC_EV(PPC7450, MISPREDICTED_BRANCHES) \ __PMC_EV(PPC7450, FOLDED_BRANCHES) \ __PMC_EV(PPC7450, FP_STORE_DOUBLE_COMPLETES_IN_LSU) \ __PMC_EV(PPC7450, L2_CACHE_HITS) \ __PMC_EV(PPC7450, L3_CACHE_HITS) \ __PMC_EV(PPC7450, L2_INSTR_CACHE_MISSES) \ __PMC_EV(PPC7450, L3_INSTR_CACHE_MISSES) \ __PMC_EV(PPC7450, L2_DATA_CACHE_MISSES) \ __PMC_EV(PPC7450, L3_DATA_CACHE_MISSES) \ __PMC_EV(PPC7450, L2_LOAD_HITS) \ __PMC_EV(PPC7450, L2_STORE_HITS) \ __PMC_EV(PPC7450, L3_LOAD_HITS) \ __PMC_EV(PPC7450, L3_STORE_HITS) \ __PMC_EV(PPC7450, L2_TOUCH_HITS) \ __PMC_EV(PPC7450, L3_TOUCH_HITS) \ __PMC_EV(PPC7450, SNOOP_MODIFIED) \ __PMC_EV(PPC7450, SNOOP_VALID) \ __PMC_EV(PPC7450, INTERVENTION) \ __PMC_EV(PPC7450, L2_CACHE_MISSES) \ __PMC_EV(PPC7450, L3_CACHE_MISSES) \ __PMC_EV(PPC7450, L2_CACHE_CASTOUTS) \ __PMC_EV(PPC7450, L3_CACHE_CASTOUTS) \ __PMC_EV(PPC7450, L2SQ_FULL_CYCLES) \ __PMC_EV(PPC7450, L3SQ_FULL_CYCLES) \ __PMC_EV(PPC7450, RAQ_FULL_CYCLES) \ __PMC_EV(PPC7450, WAQ_FULL_CYCLES) \ __PMC_EV(PPC7450, L1_EXTERNAL_INTERVENTIONS) \ __PMC_EV(PPC7450, L2_EXTERNAL_INTERVENTIONS) \ __PMC_EV(PPC7450, L3_EXTERNAL_INTERVENTIONS) \ __PMC_EV(PPC7450, EXTERNAL_INTERVENTIONS) \ __PMC_EV(PPC7450, EXTERNAL_PUSHES) \ 
__PMC_EV(PPC7450, EXTERNAL_SNOOP_RETRY) \ __PMC_EV(PPC7450, DTQ_FULL_CYCLES) \ __PMC_EV(PPC7450, BUS_RETRY) \ __PMC_EV(PPC7450, L2_VALID_REQUEST) \ __PMC_EV(PPC7450, BORDQ_FULL) \ __PMC_EV(PPC7450, BUS_TAS_FOR_READS) \ __PMC_EV(PPC7450, BUS_TAS_FOR_WRITES) \ __PMC_EV(PPC7450, BUS_READS_NOT_RETRIED) \ __PMC_EV(PPC7450, BUS_WRITES_NOT_RETRIED) \ __PMC_EV(PPC7450, BUS_READS_WRITES_NOT_RETRIED) \ __PMC_EV(PPC7450, BUS_RETRY_DUE_TO_L1_RETRY) \ __PMC_EV(PPC7450, BUS_RETRY_DUE_TO_PREVIOUS_ADJACENT) \ __PMC_EV(PPC7450, BUS_RETRY_DUE_TO_COLLISION) \ __PMC_EV(PPC7450, BUS_RETRY_DUE_TO_INTERVENTION_ORDERING) \ __PMC_EV(PPC7450, SNOOP_REQUESTS) \ __PMC_EV(PPC7450, PREFETCH_ENGINE_REQUEST) \ __PMC_EV(PPC7450, PREFETCH_ENGINE_COLLISION_VS_LOAD) \ __PMC_EV(PPC7450, PREFETCH_ENGINE_COLLISION_VS_STORE) \ __PMC_EV(PPC7450, PREFETCH_ENGINE_COLLISION_VS_INSTR_FETCH) \ __PMC_EV(PPC7450, \ PREFETCH_ENGINE_COLLISION_VS_LOAD_STORE_INSTR_FETCH) \ __PMC_EV(PPC7450, PREFETCH_ENGINE_FULL) #define PMC_EV_PPC7450_FIRST PMC_EV_PPC7450_CYCLE #define PMC_EV_PPC7450_LAST PMC_EV_PPC7450_PREFETCH_ENGINE_FULL #define __PMC_EV_PPC970() \ __PMC_EV(PPC970, INSTR_COMPLETED) \ __PMC_EV(PPC970, MARKED_GROUP_DISPATCH) \ __PMC_EV(PPC970, MARKED_STORE_COMPLETED) \ __PMC_EV(PPC970, GCT_EMPTY) \ __PMC_EV(PPC970, RUN_CYCLES) \ __PMC_EV(PPC970, OVERFLOW) \ __PMC_EV(PPC970, CYCLES) \ __PMC_EV(PPC970, THRESHOLD_TIMEOUT) \ __PMC_EV(PPC970, GROUP_DISPATCH) \ __PMC_EV(PPC970, BR_MARKED_INSTR_FINISH) \ __PMC_EV(PPC970, GCT_EMPTY_BY_SRQ_FULL) \ __PMC_EV(PPC970, STOP_COMPLETION) \ __PMC_EV(PPC970, LSU_EMPTY) \ __PMC_EV(PPC970, MARKED_STORE_WITH_INTR) \ __PMC_EV(PPC970, CYCLES_IN_SUPER) \ __PMC_EV(PPC970, VPU_MARKED_INSTR_COMPLETED) \ __PMC_EV(PPC970, FXU0_IDLE_FXU1_BUSY) \ __PMC_EV(PPC970, SRQ_EMPTY) \ __PMC_EV(PPC970, MARKED_GROUP_COMPLETED) \ __PMC_EV(PPC970, CR_MARKED_INSTR_FINISH) \ __PMC_EV(PPC970, DISPATCH_SUCCESS) \ __PMC_EV(PPC970, FXU0_IDLE_FXU1_IDLE) \ __PMC_EV(PPC970, ONE_PLUS_INSTR_COMPLETED) \ __PMC_EV(PPC970, GROUP_MARKED_IDU) \ __PMC_EV(PPC970, MARKED_GROUP_COMPLETE_TIMEOUT) \ __PMC_EV(PPC970, FXU0_BUSY_FXU1_BUSY) \ __PMC_EV(PPC970, MARKED_STORE_SENT_TO_STS) \ __PMC_EV(PPC970, FXU_MARKED_INSTR_FINISHED) \ __PMC_EV(PPC970, MARKED_GROUP_ISSUED) \ __PMC_EV(PPC970, FXU0_BUSY_FXU1_IDLE) \ __PMC_EV(PPC970, GROUP_COMPLETED) \ __PMC_EV(PPC970, FPU_MARKED_INSTR_COMPLETED) \ __PMC_EV(PPC970, MARKED_INSTR_FINISH_ANY_UNIT) \ __PMC_EV(PPC970, EXTERNAL_INTERRUPT) \ __PMC_EV(PPC970, GROUP_DISPATCH_REJECT) \ __PMC_EV(PPC970, LSU_MARKED_INSTR_FINISH) \ __PMC_EV(PPC970, TIMEBASE_EVENT) \ __PMC_EV(PPC970, LSU_COMPLETION_STALL) \ __PMC_EV(PPC970, FXU_COMPLETION_STALL) \ __PMC_EV(PPC970, DCACHE_MISS_COMPLETION_STALL) \ __PMC_EV(PPC970, FPU_COMPLETION_STALL) \ __PMC_EV(PPC970, FXU_LONG_INSTR_COMPLETION_STALL) \ __PMC_EV(PPC970, REJECT_COMPLETION_STALL) \ __PMC_EV(PPC970, FPU_LONG_INSTR_COMPLETION_STALL) \ __PMC_EV(PPC970, GCT_EMPTY_BY_ICACHE_MISS) \ __PMC_EV(PPC970, REJECT_COMPLETION_STALL_ERAT_MISS) \ __PMC_EV(PPC970, GCT_EMPTY_BY_BRANCH_MISS_PREDICT) \ __PMC_EV(PPC970, BUS_HIGH) \ __PMC_EV(PPC970, BUS_LOW) \ __PMC_EV(PPC970, ADDER) #define PMC_EV_PPC970_FIRST PMC_EV_PPC970_INSTR_COMPLETED #define PMC_EV_PPC970_LAST PMC_EV_PPC970_ADDER #define __PMC_EV_E500() \ __PMC_EV(E500, CYCLES) \ __PMC_EV(E500, INSTR_COMPLETED) \ __PMC_EV(E500, UOPS_COMPLETED) \ __PMC_EV(E500, INSTR_FETCHED) \ __PMC_EV(E500, UOPS_DECODED) \ __PMC_EV(E500, PM_EVENT_TRANSITIONS) \ __PMC_EV(E500, PM_EVENT_CYCLES) \ __PMC_EV(E500, BRANCH_INSTRS_COMPLETED) \ __PMC_EV(E500, 
LOAD_UOPS_COMPLETED) \ __PMC_EV(E500, STORE_UOPS_COMPLETED) \ __PMC_EV(E500, CQ_REDIRECTS) \ __PMC_EV(E500, BRANCHES_FINISHED) \ __PMC_EV(E500, TAKEN_BRANCHES_FINISHED) \ __PMC_EV(E500, FINISHED_UNCOND_BRANCHES_MISS_BTB) \ __PMC_EV(E500, BRANCH_MISPRED) \ __PMC_EV(E500, BTB_BRANCH_MISPRED_FROM_DIRECTION) \ __PMC_EV(E500, BTB_HITS_PSEUDO_HITS) \ __PMC_EV(E500, CYCLES_DECODE_STALLED) \ __PMC_EV(E500, CYCLES_ISSUE_STALLED) \ __PMC_EV(E500, CYCLES_BRANCH_ISSUE_STALLED) \ __PMC_EV(E500, CYCLES_SU1_SCHED_STALLED) \ __PMC_EV(E500, CYCLES_SU2_SCHED_STALLED) \ __PMC_EV(E500, CYCLES_MU_SCHED_STALLED) \ __PMC_EV(E500, CYCLES_LRU_SCHED_STALLED) \ __PMC_EV(E500, CYCLES_BU_SCHED_STALLED) \ __PMC_EV(E500, TOTAL_TRANSLATED) \ __PMC_EV(E500, LOADS_TRANSLATED) \ __PMC_EV(E500, STORES_TRANSLATED) \ __PMC_EV(E500, TOUCHES_TRANSLATED) \ __PMC_EV(E500, CACHEOPS_TRANSLATED) \ __PMC_EV(E500, CACHE_INHIBITED_ACCESS_TRANSLATED) \ __PMC_EV(E500, GUARDED_LOADS_TRANSLATED) \ __PMC_EV(E500, WRITE_THROUGH_STORES_TRANSLATED) \ __PMC_EV(E500, MISALIGNED_LOAD_STORE_ACCESS_TRANSLATED) \ __PMC_EV(E500, TOTAL_ALLOCATED_TO_DLFB) \ __PMC_EV(E500, LOADS_TRANSLATED_ALLOCATED_TO_DLFB) \ __PMC_EV(E500, STORES_COMPLETED_ALLOCATED_TO_DLFB) \ __PMC_EV(E500, TOUCHES_TRANSLATED_ALLOCATED_TO_DLFB) \ __PMC_EV(E500, STORES_COMPLETED) \ __PMC_EV(E500, DATA_L1_CACHE_LOCKS) \ __PMC_EV(E500, DATA_L1_CACHE_RELOADS) \ __PMC_EV(E500, DATA_L1_CACHE_CASTOUTS) \ __PMC_EV(E500, LOAD_MISS_DLFB_FULL) \ __PMC_EV(E500, LOAD_MISS_LDQ_FULL) \ __PMC_EV(E500, LOAD_GUARDED_MISS) \ __PMC_EV(E500, STORE_TRANSLATE_WHEN_QUEUE_FULL) \ __PMC_EV(E500, ADDRESS_COLLISION) \ __PMC_EV(E500, DATA_MMU_MISS) \ __PMC_EV(E500, DATA_MMU_BUSY) \ __PMC_EV(E500, PART2_MISALIGNED_CACHE_ACCESS) \ __PMC_EV(E500, LOAD_MISS_DLFB_FULL_CYCLES) \ __PMC_EV(E500, LOAD_MISS_LDQ_FULL_CYCLES) \ __PMC_EV(E500, LOAD_GUARDED_MISS_CYCLES) \ __PMC_EV(E500, STORE_TRANSLATE_WHEN_QUEUE_FULL_CYCLES) \ __PMC_EV(E500, ADDRESS_COLLISION_CYCLES) \ __PMC_EV(E500, DATA_MMU_MISS_CYCLES) \ __PMC_EV(E500, DATA_MMU_BUSY_CYCLES) \ __PMC_EV(E500, PART2_MISALIGNED_CACHE_ACCESS_CYCLES) \ __PMC_EV(E500, INSTR_L1_CACHE_LOCKS) \ __PMC_EV(E500, INSTR_L1_CACHE_RELOADS) \ __PMC_EV(E500, INSTR_L1_CACHE_FETCHES) \ __PMC_EV(E500, INSTR_MMU_TLB4K_RELOADS) \ __PMC_EV(E500, INSTR_MMU_VSP_RELOADS) \ __PMC_EV(E500, DATA_MMU_TLB4K_RELOADS) \ __PMC_EV(E500, DATA_MMU_VSP_RELOADS) \ __PMC_EV(E500, L2MMU_MISSES) \ __PMC_EV(E500, BIU_MASTER_REQUESTS) \ __PMC_EV(E500, BIU_MASTER_INSTR_SIDE_REQUESTS) \ __PMC_EV(E500, BIU_MASTER_DATA_SIDE_REQUESTS) \ __PMC_EV(E500, BIU_MASTER_DATA_SIDE_CASTOUT_REQUESTS) \ __PMC_EV(E500, BIU_MASTER_RETRIES) \ __PMC_EV(E500, SNOOP_REQUESTS) \ __PMC_EV(E500, SNOOP_HITS) \ __PMC_EV(E500, SNOOP_PUSHES) \ __PMC_EV(E500, SNOOP_RETRIES) \ __PMC_EV(E500, DLFB_LOAD_MISS_CYCLES) \ __PMC_EV(E500, ILFB_FETCH_MISS_CYCLES) \ __PMC_EV(E500, EXT_INPU_INTR_LATENCY_CYCLES) \ __PMC_EV(E500, CRIT_INPUT_INTR_LATENCY_CYCLES) \ __PMC_EV(E500, EXT_INPUT_INTR_PENDING_LATENCY_CYCLES) \ __PMC_EV(E500, CRIT_INPUT_INTR_PENDING_LATENCY_CYCLES) \ __PMC_EV(E500, PMC0_OVERFLOW) \ __PMC_EV(E500, PMC1_OVERFLOW) \ __PMC_EV(E500, PMC2_OVERFLOW) \ __PMC_EV(E500, PMC3_OVERFLOW) \ __PMC_EV(E500, INTERRUPTS_TAKEN) \ __PMC_EV(E500, EXT_INPUT_INTR_TAKEN) \ __PMC_EV(E500, CRIT_INPUT_INTR_TAKEN) \ __PMC_EV(E500, SYSCALL_TRAP_INTR) \ __PMC_EV(E500, TLB_BIT_TRANSITIONS) \ __PMC_EV(E500, L2_LINEFILL_BUFFER) \ __PMC_EV(E500, LV2_VS) \ __PMC_EV(E500, CASTOUTS_RELEASED) \ __PMC_EV(E500, INTV_ALLOCATIONS) \ __PMC_EV(E500, DLFB_RETRIES_TO_MBAR) \ 
__PMC_EV(E500, STORE_RETRIES) \ __PMC_EV(E500, STASH_L1_HITS) \ __PMC_EV(E500, STASH_L2_HITS) \ __PMC_EV(E500, STASH_BUSY_1) \ __PMC_EV(E500, STASH_BUSY_2) \ __PMC_EV(E500, STASH_BUSY_3) \ __PMC_EV(E500, STASH_HITS) \ __PMC_EV(E500, STASH_HIT_DLFB) \ __PMC_EV(E500, STASH_REQUESTS) \ __PMC_EV(E500, STASH_REQUESTS_L1) \ __PMC_EV(E500, STASH_REQUESTS_L2) \ __PMC_EV(E500, STALLS_NO_CAQ_OR_COB) \ __PMC_EV(E500, L2_CACHE_ACCESSES) \ __PMC_EV(E500, L2_HIT_CACHE_ACCESSES) \ __PMC_EV(E500, L2_CACHE_DATA_ACCESSES) \ __PMC_EV(E500, L2_CACHE_DATA_HITS) \ __PMC_EV(E500, L2_CACHE_INSTR_ACCESSES) \ __PMC_EV(E500, L2_CACHE_INSTR_HITS) \ __PMC_EV(E500, L2_CACHE_ALLOCATIONS) \ __PMC_EV(E500, L2_CACHE_DATA_ALLOCATIONS) \ __PMC_EV(E500, L2_CACHE_DIRTY_DATA_ALLOCATIONS) \ __PMC_EV(E500, L2_CACHE_INSTR_ALLOCATIONS) \ __PMC_EV(E500, L2_CACHE_UPDATES) \ __PMC_EV(E500, L2_CACHE_CLEAN_UPDATES) \ __PMC_EV(E500, L2_CACHE_DIRTY_UPDATES) \ __PMC_EV(E500, L2_CACHE_CLEAN_REDUNDANT_UPDATES) \ __PMC_EV(E500, L2_CACHE_DIRTY_REDUNDANT_UPDATES) \ __PMC_EV(E500, L2_CACHE_LOCKS) \ __PMC_EV(E500, L2_CACHE_CASTOUTS) \ __PMC_EV(E500, L2_CACHE_DATA_DIRTY_HITS) \ __PMC_EV(E500, INSTR_LFB_WENT_HIGH_PRIORITY) \ __PMC_EV(E500, SNOOP_THROTTLING_TURNED_ON) \ __PMC_EV(E500, L2_CLEAN_LINE_INVALIDATIONS) \ __PMC_EV(E500, L2_INCOHERENT_LINE_INVALIDATIONS) \ __PMC_EV(E500, L2_COHERENT_LINE_INVALIDATIONS) \ __PMC_EV(E500, COHERENT_LOOKUP_MISS_DUE_TO_VALID_BUT_INCOHERENT_MATCHES) \ __PMC_EV(E500, IAC1S_DETECTED) \ __PMC_EV(E500, IAC2S_DETECTED) \ __PMC_EV(E500, DAC1S_DTECTED) \ __PMC_EV(E500, DAC2S_DTECTED) \ __PMC_EV(E500, DVT0_DETECTED) \ __PMC_EV(E500, DVT1_DETECTED) \ __PMC_EV(E500, DVT2_DETECTED) \ __PMC_EV(E500, DVT3_DETECTED) \ __PMC_EV(E500, DVT4_DETECTED) \ __PMC_EV(E500, DVT5_DETECTED) \ __PMC_EV(E500, DVT6_DETECTED) \ __PMC_EV(E500, DVT7_DETECTED) \ __PMC_EV(E500, CYCLES_COMPLETION_STALLED_NEXUS_FIFO_FULL) \ __PMC_EV(E500, FPU_DOUBLE_PUMP) \ __PMC_EV(E500, FPU_FINISH) \ __PMC_EV(E500, FPU_DIVIDE_CYCLES) \ __PMC_EV(E500, FPU_DENORM_INPUT_CYCLES) \ __PMC_EV(E500, FPU_RESULT_STALL_CYCLES) \ __PMC_EV(E500, FPU_FPSCR_FULL_STALL) \ __PMC_EV(E500, FPU_PIPE_SYNC_STALLS) \ __PMC_EV(E500, FPU_INPUT_DATA_STALLS) \ __PMC_EV(E500, DECORATED_LOADS) \ __PMC_EV(E500, DECORATED_STORES) \ __PMC_EV(E500, LOAD_RETRIES) \ __PMC_EV(E500, STWCX_SUCCESSES) \ __PMC_EV(E500, STWCX_FAILURES) \ #define PMC_EV_E500_FIRST PMC_EV_E500_CYCLES #define PMC_EV_E500_LAST PMC_EV_E500_STWCX_FAILURES /* * All known PMC events. * * PMC event numbers are allocated sparsely to allow new PMC events to * be added to a PMC class without breaking ABI compatibility. 
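 * (How these lists are consumed is left to the including file; a consumer
 * such as sys/sys/pmc.h is expected to build enum pmc_event by redefining
 * the X-macros roughly as
 *	#define __PMC_EV_BLOCK(C, V)	PMC_EV_ ## C ## __BLOCK_START = (V) - 1,
 *	#define __PMC_EV(C, N)		PMC_EV_ ## C ## _ ## N,
 * so that the first event of a class takes the block's start value, e.g.
 * PMC_EV_K7_DC_ACCESSES == 0x2000, and later events of that class follow
 * sequentially.  The definitions shown here are only an illustrative
 * sketch; the authoritative ones live outside this header.)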
The * current allocation scheme is: * * START #EVENTS DESCRIPTION * 0 0x1000 Reserved * 0x1000 0x0001 TSC * 0x2000 0x0080 AMD K7 events * 0x2080 0x0100 AMD K8 events * 0x10000 0x0080 INTEL architectural fixed-function events * 0x10080 0x0F80 INTEL architectural programmable events * 0x11000 0x0080 INTEL Pentium 4 events * 0x11080 0x0080 INTEL Pentium MMX events * 0x11100 0x0100 INTEL Pentium Pro/P-II/P-III/Pentium-M events * 0x11200 0x00FF INTEL XScale events * 0x11300 0x00FF MIPS 24K events * 0x11400 0x00FF Octeon events * 0x11500 0x00FF MIPS 74K events + * 0x11600 0x00FF BERI statcounters * 0x13000 0x00FF MPC7450 events * 0x13100 0x00FF IBM PPC970 events * 0x13300 0x00FF Freescale e500 events * 0x14000 0x0100 ARMv7 events * 0x14100 0x0100 ARMv8 events * 0x20000 0x1000 Software events */ #define __PMC_EVENTS() \ __PMC_EV_BLOCK(TSC, 0x01000) \ __PMC_EV_TSC() \ __PMC_EV_BLOCK(IAF, 0x10000) \ __PMC_EV_IAF() \ __PMC_EV_BLOCK(K7, 0x2000) \ __PMC_EV_K7() \ __PMC_EV_BLOCK(K8, 0x2080) \ __PMC_EV_K8() \ __PMC_EV_BLOCK(XSCALE, 0x11200) \ __PMC_EV_XSCALE() \ __PMC_EV_BLOCK(MIPS24K, 0x11300) \ __PMC_EV_MIPS24K() \ __PMC_EV_BLOCK(OCTEON, 0x11400) \ __PMC_EV_OCTEON() \ __PMC_EV_BLOCK(MIPS74K, 0x11500) \ __PMC_EV_MIPS74K() \ + __PMC_EV_BLOCK(BERI, 0x11600) \ + __PMC_EV_BERI() \ __PMC_EV_BLOCK(UCP, 0x12080) \ __PMC_EV_UCP() \ __PMC_EV_BLOCK(PPC7450, 0x13000) \ __PMC_EV_PPC7450() \ __PMC_EV_BLOCK(PPC970, 0x13100) \ __PMC_EV_PPC970() \ __PMC_EV_BLOCK(E500, 0x13300) \ __PMC_EV_E500() \ __PMC_EV_BLOCK(ARMV7, 0x14000) \ __PMC_EV_ARMV7() \ __PMC_EV_BLOCK(ARMV8, 0x14100) \ __PMC_EV_ARMV8() #define PMC_EVENT_FIRST PMC_EV_TSC_TSC #define PMC_EVENT_LAST PMC_EV_SOFT_LAST #endif /* _DEV_HWPMC_PMC_EVENTS_H_ */ Index: projects/clang900-import/sys/dev/vt/vt_core.c =================================================================== --- projects/clang900-import/sys/dev/vt/vt_core.c (revision 352536) +++ projects/clang900-import/sys/dev/vt/vt_core.c (revision 352537) @@ -1,2959 +1,2981 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009, 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed by Ed Schouten under sponsorship from the * FreeBSD Foundation. * * Portions of this software were developed by Oleksandr Rybalko * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif static tc_bell_t vtterm_bell; static tc_cursor_t vtterm_cursor; static tc_putchar_t vtterm_putchar; static tc_fill_t vtterm_fill; static tc_copy_t vtterm_copy; static tc_pre_input_t vtterm_pre_input; static tc_post_input_t vtterm_post_input; static tc_param_t vtterm_param; static tc_done_t vtterm_done; static tc_cnprobe_t vtterm_cnprobe; static tc_cngetc_t vtterm_cngetc; static tc_cngrab_t vtterm_cngrab; static tc_cnungrab_t vtterm_cnungrab; static tc_opened_t vtterm_opened; static tc_ioctl_t vtterm_ioctl; static tc_mmap_t vtterm_mmap; const struct terminal_class vt_termclass = { .tc_bell = vtterm_bell, .tc_cursor = vtterm_cursor, .tc_putchar = vtterm_putchar, .tc_fill = vtterm_fill, .tc_copy = vtterm_copy, .tc_pre_input = vtterm_pre_input, .tc_post_input = vtterm_post_input, .tc_param = vtterm_param, .tc_done = vtterm_done, .tc_cnprobe = vtterm_cnprobe, .tc_cngetc = vtterm_cngetc, .tc_cngrab = vtterm_cngrab, .tc_cnungrab = vtterm_cnungrab, .tc_opened = vtterm_opened, .tc_ioctl = vtterm_ioctl, .tc_mmap = vtterm_mmap, }; /* * Use a constant timer of 25 Hz to redraw the screen. * * XXX: In theory we should only fire up the timer when there is really * activity. Unfortunately we cannot always start timers. We really * don't want to process kernel messages synchronously, because it * really slows down the system. */ #define VT_TIMERFREQ 25 /* Bell pitch/duration. */ #define VT_BELLDURATION ((5 * hz + 99) / 100) #define VT_BELLPITCH 800 #define VT_UNIT(vw) ((vw)->vw_device->vd_unit * VT_MAXWINDOWS + \ (vw)->vw_number) static SYSCTL_NODE(_kern, OID_AUTO, vt, CTLFLAG_RD, 0, "vt(9) parameters"); static VT_SYSCTL_INT(enable_altgr, 1, "Enable AltGr key (Do not assume R.Alt as Alt)"); static VT_SYSCTL_INT(enable_bell, 1, "Enable bell"); static VT_SYSCTL_INT(debug, 0, "vt(9) debug level"); static VT_SYSCTL_INT(deadtimer, 15, "Time to wait busy process in VT_PROCESS mode"); static VT_SYSCTL_INT(suspendswitch, 1, "Switch to VT0 before suspend"); /* Allow to disable some keyboard combinations. */ static VT_SYSCTL_INT(kbd_halt, 1, "Enable halt keyboard combination. " "See kbdmap(5) to configure."); static VT_SYSCTL_INT(kbd_poweroff, 1, "Enable Power Off keyboard combination. " "See kbdmap(5) to configure."); static VT_SYSCTL_INT(kbd_reboot, 1, "Enable reboot keyboard combination. " "See kbdmap(5) to configure (typically Ctrl-Alt-Delete)."); static VT_SYSCTL_INT(kbd_debug, 1, "Enable key combination to enter debugger. " "See kbdmap(5) to configure (typically Ctrl-Alt-Esc)."); static VT_SYSCTL_INT(kbd_panic, 0, "Enable request to panic. " "See kbdmap(5) to configure."); /* Used internally, not a tunable. 
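A quick worked check of the timing constants defined above, assuming an illustrative kern.hz of 1000: the redraw callout fires every 1000 / VT_TIMERFREQ = 40 ms, and VT_BELLDURATION evaluates to (5 * 1000 + 99) / 100 = 50 ticks, i.e. about 50 ms of bell; the "+ 99" rounds up so the duration never truncates to zero at low hz values. The sketch below redoes that arithmetic (VT_BELLDURATION is written as a function-like macro here purely for illustration; in the driver it is an object-like macro using the global hz).

/* Illustrative only: how the vt(4) timing constants scale with hz. */
#include <stdio.h>

#define VT_TIMERFREQ		25
#define VT_BELLDURATION(hz)	((5 * (hz) + 99) / 100)

int
main(void)
{
	int hz = 1000;		/* assumed kern.hz */

	printf("redraw period: %d ms\n", 1000 / VT_TIMERFREQ);
	printf("bell duration: %d ticks (~%d ms)\n",
	    VT_BELLDURATION(hz), VT_BELLDURATION(hz) * 1000 / hz);
	return (0);
}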
*/ int vt_draw_logo_cpus; VT_SYSCTL_INT(splash_cpu, 0, "Show logo CPUs during boot"); VT_SYSCTL_INT(splash_ncpu, 0, "Override number of logos displayed " "(0 = do not override)"); VT_SYSCTL_INT(splash_cpu_style, 2, "Draw logo style " "(0 = Alternate beastie, 1 = Beastie, 2 = Orb)"); VT_SYSCTL_INT(splash_cpu_duration, 10, "Hide logos after (seconds)"); static unsigned int vt_unit = 0; static MALLOC_DEFINE(M_VT, "vt", "vt device"); struct vt_device *main_vd = &vt_consdev; /* Boot logo. */ extern unsigned int vt_logo_width; extern unsigned int vt_logo_height; extern unsigned int vt_logo_depth; extern unsigned char vt_logo_image[]; #ifndef DEV_SPLASH #define vtterm_draw_cpu_logos(...) do {} while (0) const unsigned int vt_logo_sprite_height; #endif /* Font. */ extern struct vt_font vt_font_default; #ifndef SC_NO_CUTPASTE extern struct vt_mouse_cursor vt_default_mouse_pointer; #endif static int signal_vt_rel(struct vt_window *); static int signal_vt_acq(struct vt_window *); static int finish_vt_rel(struct vt_window *, int, int *); static int finish_vt_acq(struct vt_window *); static int vt_window_switch(struct vt_window *); static int vt_late_window_switch(struct vt_window *); static int vt_proc_alive(struct vt_window *); static void vt_resize(struct vt_device *); static void vt_update_static(void *); #ifndef SC_NO_CUTPASTE static void vt_mouse_paste(void); #endif static void vt_suspend_handler(void *priv); static void vt_resume_handler(void *priv); SET_DECLARE(vt_drv_set, struct vt_driver); #define _VTDEFH MAX(100, PIXEL_HEIGHT(VT_FB_MAX_HEIGHT)) #define _VTDEFW MAX(200, PIXEL_WIDTH(VT_FB_MAX_WIDTH)) struct terminal vt_consterm; static struct vt_window vt_conswindow; #ifndef SC_NO_CONSDRAWN static term_char_t vt_consdrawn[PIXEL_HEIGHT(VT_FB_MAX_HEIGHT) * PIXEL_WIDTH(VT_FB_MAX_WIDTH)]; static term_color_t vt_consdrawnfg[PIXEL_HEIGHT(VT_FB_MAX_HEIGHT) * PIXEL_WIDTH(VT_FB_MAX_WIDTH)]; static term_color_t vt_consdrawnbg[PIXEL_HEIGHT(VT_FB_MAX_HEIGHT) * PIXEL_WIDTH(VT_FB_MAX_WIDTH)]; #endif struct vt_device vt_consdev = { .vd_driver = NULL, .vd_softc = NULL, .vd_prev_driver = NULL, .vd_prev_softc = NULL, .vd_flags = VDF_INVALID, .vd_windows = { [VT_CONSWINDOW] = &vt_conswindow, }, .vd_curwindow = &vt_conswindow, .vd_kbstate = 0, #ifndef SC_NO_CUTPASTE .vd_pastebuf = { .vpb_buf = NULL, .vpb_bufsz = 0, .vpb_len = 0 }, .vd_mcursor = &vt_default_mouse_pointer, .vd_mcursor_fg = TC_WHITE, .vd_mcursor_bg = TC_BLACK, #endif #ifndef SC_NO_CONSDRAWN .vd_drawn = vt_consdrawn, .vd_drawnfg = vt_consdrawnfg, .vd_drawnbg = vt_consdrawnbg, #endif }; static term_char_t vt_constextbuf[(_VTDEFW) * (VBF_DEFAULT_HISTORY_SIZE)]; static term_char_t *vt_constextbufrows[VBF_DEFAULT_HISTORY_SIZE]; static struct vt_window vt_conswindow = { .vw_number = VT_CONSWINDOW, .vw_flags = VWF_CONSOLE, .vw_buf = { .vb_buffer = &vt_constextbuf[0], .vb_rows = &vt_constextbufrows[0], .vb_history_size = VBF_DEFAULT_HISTORY_SIZE, .vb_curroffset = 0, .vb_roffset = 0, .vb_flags = VBF_STATIC, .vb_mark_start = {.tp_row = 0, .tp_col = 0,}, .vb_mark_end = {.tp_row = 0, .tp_col = 0,}, .vb_scr_size = { .tp_row = _VTDEFH, .tp_col = _VTDEFW, }, }, .vw_device = &vt_consdev, .vw_terminal = &vt_consterm, .vw_kbdmode = K_XLATE, .vw_grabbed = 0, }; struct terminal vt_consterm = { .tm_class = &vt_termclass, .tm_softc = &vt_conswindow, .tm_flags = TF_CONS, }; static struct consdev vt_consterm_consdev = { .cn_ops = &termcn_cnops, .cn_arg = &vt_consterm, .cn_name = "ttyv0", }; /* Add to set of consoles. 
*/ DATA_SET(cons_set, vt_consterm_consdev); /* * Right after kmem is done to allow early drivers to use locking and allocate * memory. */ SYSINIT(vt_update_static, SI_SUB_KMEM, SI_ORDER_ANY, vt_update_static, &vt_consdev); /* Delay until all devices attached, to not waste time. */ SYSINIT(vt_early_cons, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_ANY, vt_upgrade, &vt_consdev); /* Initialize locks/mem depended members. */ static void vt_update_static(void *dummy) { if (!vty_enabled(VTY_VT)) return; if (main_vd->vd_driver != NULL) printf("VT(%s): %s %ux%u\n", main_vd->vd_driver->vd_name, (main_vd->vd_flags & VDF_TEXTMODE) ? "text" : "resolution", main_vd->vd_width, main_vd->vd_height); else printf("VT: init without driver.\n"); mtx_init(&main_vd->vd_lock, "vtdev", NULL, MTX_DEF); cv_init(&main_vd->vd_winswitch, "vtwswt"); } static void vt_schedule_flush(struct vt_device *vd, int ms) { if (ms <= 0) /* Default to initial value. */ ms = 1000 / VT_TIMERFREQ; callout_schedule(&vd->vd_timer, hz / (1000 / ms)); } void vt_resume_flush_timer(struct vt_window *vw, int ms) { struct vt_device *vd = vw->vw_device; if (vd->vd_curwindow != vw) return; if (!(vd->vd_flags & VDF_ASYNC) || !atomic_cmpset_int(&vd->vd_timer_armed, 0, 1)) return; vt_schedule_flush(vd, ms); } static void vt_suspend_flush_timer(struct vt_device *vd) { /* * As long as this function is called locked, callout_stop() * has the same effect like callout_drain() with regard to * preventing the callback function from executing. */ VT_LOCK_ASSERT(vd, MA_OWNED); if (!(vd->vd_flags & VDF_ASYNC) || !atomic_cmpset_int(&vd->vd_timer_armed, 1, 0)) return; callout_stop(&vd->vd_timer); } static void vt_switch_timer(void *arg) { - vt_late_window_switch((struct vt_window *)arg); + (void)vt_late_window_switch((struct vt_window *)arg); } static int vt_save_kbd_mode(struct vt_window *vw, keyboard_t *kbd) { int mode, ret; mode = 0; ret = kbdd_ioctl(kbd, KDGKBMODE, (caddr_t)&mode); if (ret == ENOIOCTL) ret = ENODEV; if (ret != 0) return (ret); vw->vw_kbdmode = mode; return (0); } static int vt_update_kbd_mode(struct vt_window *vw, keyboard_t *kbd) { int ret; ret = kbdd_ioctl(kbd, KDSKBMODE, (caddr_t)&vw->vw_kbdmode); if (ret == ENOIOCTL) ret = ENODEV; return (ret); } static int vt_save_kbd_state(struct vt_window *vw, keyboard_t *kbd) { int state, ret; state = 0; ret = kbdd_ioctl(kbd, KDGKBSTATE, (caddr_t)&state); if (ret == ENOIOCTL) ret = ENODEV; if (ret != 0) return (ret); vw->vw_kbdstate &= ~LOCK_MASK; vw->vw_kbdstate |= state & LOCK_MASK; return (0); } static int vt_update_kbd_state(struct vt_window *vw, keyboard_t *kbd) { int state, ret; state = vw->vw_kbdstate & LOCK_MASK; ret = kbdd_ioctl(kbd, KDSKBSTATE, (caddr_t)&state); if (ret == ENOIOCTL) ret = ENODEV; return (ret); } static int vt_save_kbd_leds(struct vt_window *vw, keyboard_t *kbd) { int leds, ret; leds = 0; ret = kbdd_ioctl(kbd, KDGETLED, (caddr_t)&leds); if (ret == ENOIOCTL) ret = ENODEV; if (ret != 0) return (ret); vw->vw_kbdstate &= ~LED_MASK; vw->vw_kbdstate |= leds & LED_MASK; return (0); } static int vt_update_kbd_leds(struct vt_window *vw, keyboard_t *kbd) { int leds, ret; leds = vw->vw_kbdstate & LED_MASK; ret = kbdd_ioctl(kbd, KDSETLED, (caddr_t)&leds); if (ret == ENOIOCTL) ret = ENODEV; return (ret); } static int vt_window_preswitch(struct vt_window *vw, struct vt_window *curvw) { DPRINTF(40, "%s\n", __func__); curvw->vw_switch_to = vw; /* Set timer to allow switch in case when process hang. 
*/ callout_reset(&vw->vw_proc_dead_timer, hz * vt_deadtimer, vt_switch_timer, (void *)vw); /* Notify process about vt switch attempt. */ DPRINTF(30, "%s: Notify process.\n", __func__); signal_vt_rel(curvw); return (0); } static int vt_window_postswitch(struct vt_window *vw) { signal_vt_acq(vw); return (0); } /* vt_late_window_switch will done VT switching for regular case. */ static int vt_late_window_switch(struct vt_window *vw) { + struct vt_window *curvw; int ret; callout_stop(&vw->vw_proc_dead_timer); ret = vt_window_switch(vw); - if (ret) + if (ret != 0) { + /* + * If the switch hasn't happened, then return the VT + * to the current owner, if any. + */ + curvw = vw->vw_device->vd_curwindow; + if (curvw->vw_smode.mode == VT_PROCESS) + (void)vt_window_postswitch(curvw); return (ret); + } /* Notify owner process about terminal availability. */ if (vw->vw_smode.mode == VT_PROCESS) { ret = vt_window_postswitch(vw); } return (ret); } /* Switch window. */ static int vt_proc_window_switch(struct vt_window *vw) { struct vt_window *curvw; struct vt_device *vd; int ret; /* Prevent switching to NULL */ if (vw == NULL) { DPRINTF(30, "%s: Cannot switch: vw is NULL.", __func__); return (EINVAL); } vd = vw->vw_device; curvw = vd->vd_curwindow; /* Check if virtual terminal is locked */ if (curvw->vw_flags & VWF_VTYLOCK) return (EBUSY); /* Check if switch already in progress */ if (curvw->vw_flags & VWF_SWWAIT_REL) { /* Check if switching to same window */ if (curvw->vw_switch_to == vw) { DPRINTF(30, "%s: Switch in progress to same vw.", __func__); return (0); /* success */ } DPRINTF(30, "%s: Switch in progress to different vw.", __func__); return (EBUSY); } /* Avoid switching to already selected window */ if (vw == curvw) { DPRINTF(30, "%s: Cannot switch: vw == curvw.", __func__); return (0); /* success */ } + /* + * Early check for an attempt to switch to a non-functional VT. + * The same check is done in vt_window_switch(), but it's better + * to fail as early as possible to avoid needless pre-switch + * actions. + */ + VT_LOCK(vd); + if ((vw->vw_flags & (VWF_OPENED|VWF_CONSOLE)) == 0) { + VT_UNLOCK(vd); + return (EINVAL); + } + VT_UNLOCK(vd); + /* Ask current process permission to switch away. */ if (curvw->vw_smode.mode == VT_PROCESS) { DPRINTF(30, "%s: VT_PROCESS ", __func__); if (vt_proc_alive(curvw) == FALSE) { DPRINTF(30, "Dead. Cleaning."); /* Dead */ } else { DPRINTF(30, "%s: Signaling process.\n", __func__); /* Alive, try to ask him. */ ret = vt_window_preswitch(vw, curvw); /* Wait for process answer or timeout. */ return (ret); } DPRINTF(30, "\n"); } ret = vt_late_window_switch(vw); return (ret); } /* Switch window ignoring process locking. */ static int vt_window_switch(struct vt_window *vw) { struct vt_device *vd = vw->vw_device; struct vt_window *curvw = vd->vd_curwindow; keyboard_t *kbd; if (kdb_active) { /* * When grabbing the console for the debugger, avoid * locks as that can result in deadlock. While this * could use try locks, that wouldn't really make a * difference as there are sufficient barriers in * debugger entry/exit to be equivalent to * successfully try-locking here. */ if (curvw == vw) return (0); if (!(vw->vw_flags & (VWF_OPENED|VWF_CONSOLE))) return (EINVAL); vd->vd_curwindow = vw; vd->vd_flags |= VDF_INVALID; if (vd->vd_driver->vd_postswitch) vd->vd_driver->vd_postswitch(vd); return (0); } VT_LOCK(vd); if (curvw == vw) { /* Nothing to do. 
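The release/acquire handshake that vt_window_preswitch() and vt_late_window_switch() drive above has a userland counterpart: a process (an X server, typically) puts its vty into VT_PROCESS mode with VT_SETMODE and then answers the kernel's signals through VT_RELDISP. A hedged sketch of that counterpart follows; the signal numbers and device path are illustrative, and only the consio ioctls themselves are taken from <sys/consio.h>.

/*
 * Hypothetical VT_PROCESS client: register release/acquire signals,
 * then acknowledge switches with VT_RELDISP.
 */
#include <sys/types.h>
#include <sys/consio.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static int ttyfd;

static void
on_release(int sig)
{
	/* Kernel sent relsig: agree to give the VT away. */
	(void)sig;
	(void)ioctl(ttyfd, VT_RELDISP, VT_TRUE);	/* VT_FALSE would refuse */
}

static void
on_acquire(int sig)
{
	/* Kernel sent acqsig: we own the VT again. */
	(void)sig;
	(void)ioctl(ttyfd, VT_RELDISP, VT_ACKACQ);
}

int
main(void)
{
	struct vt_mode m = {
		.mode = VT_PROCESS,
		.relsig = SIGUSR1,	/* illustrative signal choices */
		.acqsig = SIGUSR2,
		.frsig = SIGUSR2,
	};

	ttyfd = open("/dev/ttyv1", O_RDWR);	/* illustrative vty */
	signal(SIGUSR1, on_release);
	signal(SIGUSR2, on_acquire);
	(void)ioctl(ttyfd, VT_SETMODE, &m);
	for (;;)
		pause();
}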
*/ VT_UNLOCK(vd); return (0); } if (!(vw->vw_flags & (VWF_OPENED|VWF_CONSOLE))) { VT_UNLOCK(vd); return (EINVAL); } vt_suspend_flush_timer(vd); vd->vd_curwindow = vw; vd->vd_flags |= VDF_INVALID; cv_broadcast(&vd->vd_winswitch); VT_UNLOCK(vd); if (vd->vd_driver->vd_postswitch) vd->vd_driver->vd_postswitch(vd); vt_resume_flush_timer(vw, 0); /* Restore per-window keyboard mode. */ mtx_lock(&Giant); kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd != NULL) { if (curvw->vw_kbdmode == K_XLATE) vt_save_kbd_state(curvw, kbd); vt_update_kbd_mode(vw, kbd); vt_update_kbd_state(vw, kbd); } mtx_unlock(&Giant); DPRINTF(10, "%s(ttyv%d) done\n", __func__, vw->vw_number); return (0); } void vt_termsize(struct vt_device *vd, struct vt_font *vf, term_pos_t *size) { size->tp_row = vd->vd_height; if (vt_draw_logo_cpus) size->tp_row -= vt_logo_sprite_height; size->tp_col = vd->vd_width; if (vf != NULL) { size->tp_row /= vf->vf_height; size->tp_col /= vf->vf_width; } } static inline void vt_termrect(struct vt_device *vd, struct vt_font *vf, term_rect_t *rect) { rect->tr_begin.tp_row = rect->tr_begin.tp_col = 0; if (vt_draw_logo_cpus) rect->tr_begin.tp_row = vt_logo_sprite_height; rect->tr_end.tp_row = vd->vd_height; rect->tr_end.tp_col = vd->vd_width; if (vf != NULL) { rect->tr_begin.tp_row = howmany(rect->tr_begin.tp_row, vf->vf_height); rect->tr_end.tp_row /= vf->vf_height; rect->tr_end.tp_col /= vf->vf_width; } } void vt_winsize(struct vt_device *vd, struct vt_font *vf, struct winsize *size) { size->ws_ypixel = vd->vd_height; if (vt_draw_logo_cpus) size->ws_ypixel -= vt_logo_sprite_height; size->ws_row = size->ws_ypixel; size->ws_col = size->ws_xpixel = vd->vd_width; if (vf != NULL) { size->ws_row /= vf->vf_height; size->ws_col /= vf->vf_width; } } void vt_compute_drawable_area(struct vt_window *vw) { struct vt_device *vd; struct vt_font *vf; vt_axis_t height; vd = vw->vw_device; if (vw->vw_font == NULL) { vw->vw_draw_area.tr_begin.tp_col = 0; vw->vw_draw_area.tr_begin.tp_row = 0; if (vt_draw_logo_cpus) vw->vw_draw_area.tr_begin.tp_row = vt_logo_sprite_height; vw->vw_draw_area.tr_end.tp_col = vd->vd_width; vw->vw_draw_area.tr_end.tp_row = vd->vd_height; return; } vf = vw->vw_font; /* * Compute the drawable area, so that the text is centered on * the screen. */ height = vd->vd_height; if (vt_draw_logo_cpus) height -= vt_logo_sprite_height; vw->vw_draw_area.tr_begin.tp_col = (vd->vd_width % vf->vf_width) / 2; vw->vw_draw_area.tr_begin.tp_row = (height % vf->vf_height) / 2; if (vt_draw_logo_cpus) vw->vw_draw_area.tr_begin.tp_row += vt_logo_sprite_height; vw->vw_draw_area.tr_end.tp_col = vw->vw_draw_area.tr_begin.tp_col + rounddown(vd->vd_width, vf->vf_width); vw->vw_draw_area.tr_end.tp_row = vw->vw_draw_area.tr_begin.tp_row + rounddown(height, vf->vf_height); } static void vt_scroll(struct vt_window *vw, int offset, int whence) { int diff; term_pos_t size; if ((vw->vw_flags & VWF_SCROLL) == 0) return; vt_termsize(vw->vw_device, vw->vw_font, &size); diff = vthistory_seek(&vw->vw_buf, offset, whence); if (diff) vw->vw_device->vd_flags |= VDF_INVALID; vt_resume_flush_timer(vw, 0); } static int vt_machine_kbdevent(int c) { switch (c) { case SPCLKEY | DBG: /* kbdmap(5) keyword `debug`. */ if (vt_kbd_debug) kdb_enter(KDB_WHY_BREAK, "manual escape to debugger"); return (1); case SPCLKEY | HALT: /* kbdmap(5) keyword `halt`. */ if (vt_kbd_halt) shutdown_nice(RB_HALT); return (1); case SPCLKEY | PASTE: /* kbdmap(5) keyword `paste`. */ #ifndef SC_NO_CUTPASTE /* Insert text from cut-paste buffer. 
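Returning to the geometry helpers defined earlier in this file: vt_termsize() and vt_winsize() simply divide the pixel dimensions by the font cell, after reserving the logo strip when vt_draw_logo_cpus is set. As a worked example under assumed numbers (a 1024x768 framebuffer and an 8x16 font), that yields 1024 / 8 = 128 columns by 768 / 16 = 48 rows:

/* Illustrative only: the column/row arithmetic vt_termsize() performs. */
#include <stdio.h>

int
main(void)
{
	unsigned int width = 1024, height = 768;	/* assumed framebuffer */
	unsigned int vf_width = 8, vf_height = 16;	/* assumed font cell */

	printf("%u cols x %u rows\n", width / vf_width, height / vf_height);
	return (0);
}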
*/ vt_mouse_paste(); #endif break; case SPCLKEY | PDWN: /* kbdmap(5) keyword `pdwn`. */ if (vt_kbd_poweroff) shutdown_nice(RB_HALT|RB_POWEROFF); return (1); case SPCLKEY | PNC: /* kbdmap(5) keyword `panic`. */ /* * Request to immediate panic if sysctl * kern.vt.enable_panic_key allow it. */ if (vt_kbd_panic) panic("Forced by the panic key"); return (1); case SPCLKEY | RBT: /* kbdmap(5) keyword `boot`. */ if (vt_kbd_reboot) shutdown_nice(RB_AUTOBOOT); return (1); case SPCLKEY | SPSC: /* kbdmap(5) keyword `spsc`. */ /* Force activatation/deactivation of the screen saver. */ /* TODO */ return (1); case SPCLKEY | STBY: /* XXX Not present in kbdcontrol parser. */ /* Put machine into Stand-By mode. */ power_pm_suspend(POWER_SLEEP_STATE_STANDBY); return (1); case SPCLKEY | SUSP: /* kbdmap(5) keyword `susp`. */ /* Suspend machine. */ power_pm_suspend(POWER_SLEEP_STATE_SUSPEND); return (1); } return (0); } static void vt_scrollmode_kbdevent(struct vt_window *vw, int c, int console) { struct vt_device *vd; term_pos_t size; vd = vw->vw_device; /* Only special keys handled in ScrollLock mode */ if ((c & SPCLKEY) == 0) return; c &= ~SPCLKEY; if (console == 0) { if (c >= F_SCR && c <= MIN(L_SCR, F_SCR + VT_MAXWINDOWS - 1)) { vw = vd->vd_windows[c - F_SCR]; vt_proc_window_switch(vw); return; } VT_LOCK(vd); } switch (c) { case SLK: { /* Turn scrolling off. */ vt_scroll(vw, 0, VHS_END); VTBUF_SLCK_DISABLE(&vw->vw_buf); vw->vw_flags &= ~VWF_SCROLL; break; } case FKEY | F(49): /* Home key. */ vt_scroll(vw, 0, VHS_SET); break; case FKEY | F(50): /* Arrow up. */ vt_scroll(vw, -1, VHS_CUR); break; case FKEY | F(51): /* Page up. */ vt_termsize(vd, vw->vw_font, &size); vt_scroll(vw, -size.tp_row, VHS_CUR); break; case FKEY | F(57): /* End key. */ vt_scroll(vw, 0, VHS_END); break; case FKEY | F(58): /* Arrow down. */ vt_scroll(vw, 1, VHS_CUR); break; case FKEY | F(59): /* Page down. */ vt_termsize(vd, vw->vw_font, &size); vt_scroll(vw, size.tp_row, VHS_CUR); break; } if (console == 0) VT_UNLOCK(vd); } static int vt_processkey(keyboard_t *kbd, struct vt_device *vd, int c) { struct vt_window *vw = vd->vd_curwindow; random_harvest_queue(&c, sizeof(c), RANDOM_KEYBOARD); #if VT_ALT_TO_ESC_HACK if (c & RELKEY) { switch (c & ~RELKEY) { case (SPCLKEY | RALT): if (vt_enable_altgr != 0) break; case (SPCLKEY | LALT): vd->vd_kbstate &= ~ALKED; } /* Other keys ignored for RELKEY event. */ return (0); } else { switch (c & ~RELKEY) { case (SPCLKEY | RALT): if (vt_enable_altgr != 0) break; case (SPCLKEY | LALT): vd->vd_kbstate |= ALKED; } } #else if (c & RELKEY) /* Other keys ignored for RELKEY event. */ return (0); #endif if (vt_machine_kbdevent(c)) return (0); if (vw->vw_flags & VWF_SCROLL) { vt_scrollmode_kbdevent(vw, c, 0/* Not a console */); /* Scroll mode keys handled, nothing to do more. */ return (0); } if (c & SPCLKEY) { c &= ~SPCLKEY; if (c >= F_SCR && c <= MIN(L_SCR, F_SCR + VT_MAXWINDOWS - 1)) { vw = vd->vd_windows[c - F_SCR]; vt_proc_window_switch(vw); return (0); } switch (c) { case NEXT: /* Switch to next VT. */ c = (vw->vw_number + 1) % VT_MAXWINDOWS; vw = vd->vd_windows[c]; vt_proc_window_switch(vw); return (0); case PREV: /* Switch to previous VT. */ c = (vw->vw_number + VT_MAXWINDOWS - 1) % VT_MAXWINDOWS; vw = vd->vd_windows[c]; vt_proc_window_switch(vw); return (0); case SLK: { vt_save_kbd_state(vw, kbd); VT_LOCK(vd); if (vw->vw_kbdstate & SLKED) { /* Turn scrolling on. */ vw->vw_flags |= VWF_SCROLL; VTBUF_SLCK_ENABLE(&vw->vw_buf); } else { /* Turn scrolling off. 
*/ vw->vw_flags &= ~VWF_SCROLL; VTBUF_SLCK_DISABLE(&vw->vw_buf); vt_scroll(vw, 0, VHS_END); } VT_UNLOCK(vd); break; } case FKEY | F(1): case FKEY | F(2): case FKEY | F(3): case FKEY | F(4): case FKEY | F(5): case FKEY | F(6): case FKEY | F(7): case FKEY | F(8): case FKEY | F(9): case FKEY | F(10): case FKEY | F(11): case FKEY | F(12): /* F1 through F12 keys. */ terminal_input_special(vw->vw_terminal, TKEY_F1 + c - (FKEY | F(1))); break; case FKEY | F(49): /* Home key. */ terminal_input_special(vw->vw_terminal, TKEY_HOME); break; case FKEY | F(50): /* Arrow up. */ terminal_input_special(vw->vw_terminal, TKEY_UP); break; case FKEY | F(51): /* Page up. */ terminal_input_special(vw->vw_terminal, TKEY_PAGE_UP); break; case FKEY | F(53): /* Arrow left. */ terminal_input_special(vw->vw_terminal, TKEY_LEFT); break; case FKEY | F(55): /* Arrow right. */ terminal_input_special(vw->vw_terminal, TKEY_RIGHT); break; case FKEY | F(57): /* End key. */ terminal_input_special(vw->vw_terminal, TKEY_END); break; case FKEY | F(58): /* Arrow down. */ terminal_input_special(vw->vw_terminal, TKEY_DOWN); break; case FKEY | F(59): /* Page down. */ terminal_input_special(vw->vw_terminal, TKEY_PAGE_DOWN); break; case FKEY | F(60): /* Insert key. */ terminal_input_special(vw->vw_terminal, TKEY_INSERT); break; case FKEY | F(61): /* Delete key. */ terminal_input_special(vw->vw_terminal, TKEY_DELETE); break; } } else if (KEYFLAGS(c) == 0) { /* Don't do UTF-8 conversion when doing raw mode. */ if (vw->vw_kbdmode == K_XLATE) { #if VT_ALT_TO_ESC_HACK if (vd->vd_kbstate & ALKED) { /* * Prepend ESC sequence if one of ALT keys down. */ terminal_input_char(vw->vw_terminal, 0x1b); } #endif #if defined(KDB) kdb_alt_break(c, &vd->vd_altbrk); #endif terminal_input_char(vw->vw_terminal, KEYCHAR(c)); } else terminal_input_raw(vw->vw_terminal, c); } return (0); } static int vt_kbdevent(keyboard_t *kbd, int event, void *arg) { struct vt_device *vd = arg; int c; switch (event) { case KBDIO_KEYINPUT: break; case KBDIO_UNLOADING: mtx_lock(&Giant); vd->vd_keyboard = -1; kbd_release(kbd, (void *)vd); mtx_unlock(&Giant); return (0); default: return (EINVAL); } while ((c = kbdd_read_char(kbd, 0)) != NOKEY) vt_processkey(kbd, vd, c); return (0); } static int vt_allocate_keyboard(struct vt_device *vd) { int grabbed, i, idx0, idx; keyboard_t *k0, *k; keyboard_info_t ki; /* * If vt_upgrade() happens while the console is grabbed, we are * potentially going to switch keyboard devices while the keyboard is in * use. Unwind the grabbing of the current keyboard first, then we will * re-grab the new keyboard below, before we return. 
*/ if (vd->vd_curwindow == &vt_conswindow) { grabbed = vd->vd_curwindow->vw_grabbed; for (i = 0; i < grabbed; ++i) vtterm_cnungrab(vd->vd_curwindow->vw_terminal); } idx0 = kbd_allocate("kbdmux", -1, vd, vt_kbdevent, vd); if (idx0 >= 0) { DPRINTF(20, "%s: kbdmux allocated, idx = %d\n", __func__, idx0); k0 = kbd_get_keyboard(idx0); for (idx = kbd_find_keyboard2("*", -1, 0); idx != -1; idx = kbd_find_keyboard2("*", -1, idx + 1)) { k = kbd_get_keyboard(idx); if (idx == idx0 || KBD_IS_BUSY(k)) continue; bzero(&ki, sizeof(ki)); strncpy(ki.kb_name, k->kb_name, sizeof(ki.kb_name)); ki.kb_name[sizeof(ki.kb_name) - 1] = '\0'; ki.kb_unit = k->kb_unit; kbdd_ioctl(k0, KBADDKBD, (caddr_t) &ki); } } else { DPRINTF(20, "%s: no kbdmux allocated\n", __func__); idx0 = kbd_allocate("*", -1, vd, vt_kbdevent, vd); if (idx0 < 0) { DPRINTF(10, "%s: No keyboard found.\n", __func__); return (-1); } } vd->vd_keyboard = idx0; DPRINTF(20, "%s: vd_keyboard = %d\n", __func__, vd->vd_keyboard); if (vd->vd_curwindow == &vt_conswindow) { for (i = 0; i < grabbed; ++i) vtterm_cngrab(vd->vd_curwindow->vw_terminal); } return (idx0); } static void vtterm_bell(struct terminal *tm) { struct vt_window *vw = tm->tm_softc; struct vt_device *vd = vw->vw_device; if (!vt_enable_bell) return; if (vd->vd_flags & VDF_QUIET_BELL) return; sysbeep(1193182 / VT_BELLPITCH, VT_BELLDURATION); } static void vtterm_beep(struct terminal *tm, u_int param) { u_int freq, period; if (!vt_enable_bell) return; if ((param == 0) || ((param & 0xffff) == 0)) { vtterm_bell(tm); return; } period = ((param >> 16) & 0xffff) * hz / 1000; freq = 1193182 / (param & 0xffff); sysbeep(freq, period); } static void vtterm_cursor(struct terminal *tm, const term_pos_t *p) { struct vt_window *vw = tm->tm_softc; vtbuf_cursor_position(&vw->vw_buf, p); } static void vtterm_putchar(struct terminal *tm, const term_pos_t *p, term_char_t c) { struct vt_window *vw = tm->tm_softc; vtbuf_putchar(&vw->vw_buf, p, c); } static void vtterm_fill(struct terminal *tm, const term_rect_t *r, term_char_t c) { struct vt_window *vw = tm->tm_softc; vtbuf_fill(&vw->vw_buf, r, c); } static void vtterm_copy(struct terminal *tm, const term_rect_t *r, const term_pos_t *p) { struct vt_window *vw = tm->tm_softc; vtbuf_copy(&vw->vw_buf, r, p); } static void vtterm_param(struct terminal *tm, int cmd, unsigned int arg) { struct vt_window *vw = tm->tm_softc; switch (cmd) { case TP_SETLOCALCURSOR: /* * 0 means normal (usually block), 1 means hidden, and * 2 means blinking (always block) for compatibility with * syscons. We don't support any changes except hiding, * so must map 2 to 0. */ arg = (arg == 1) ? 0 : 1; /* FALLTHROUGH */ case TP_SHOWCURSOR: vtbuf_cursor_visibility(&vw->vw_buf, arg); vt_resume_flush_timer(vw, 0); break; case TP_MOUSE: vw->vw_mouse_level = arg; break; } } void vt_determine_colors(term_char_t c, int cursor, term_color_t *fg, term_color_t *bg) { term_color_t tmp; int invert; invert = 0; *fg = TCHAR_FGCOLOR(c); if (TCHAR_FORMAT(c) & TF_BOLD) *fg = TCOLOR_LIGHT(*fg); *bg = TCHAR_BGCOLOR(c); if (TCHAR_FORMAT(c) & TF_BLINK) *bg = TCOLOR_LIGHT(*bg); if (TCHAR_FORMAT(c) & TF_REVERSE) invert ^= 1; if (cursor) invert ^= 1; if (invert) { tmp = *fg; *fg = *bg; *bg = tmp; } } #ifndef SC_NO_CUTPASTE int vt_is_cursor_in_area(const struct vt_device *vd, const term_rect_t *area) { unsigned int mx, my; /* * We use the cursor position saved during the current refresh, * in case the cursor moved since. 
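vtterm_beep() above decodes its parameter the KDMKTONE way: the low 16 bits are a divisor of the 1.193182 MHz timer clock and the high 16 bits are a duration in milliseconds, converted to ticks using hz. A small worked sketch with an illustrative parameter value:

/* Illustrative only: decoding a beep parameter as vtterm_beep() does. */
#include <stdio.h>

int
main(void)
{
	unsigned int param = (500u << 16) | 1193u;	/* hypothetical value */
	unsigned int hz = 1000;				/* assumed kern.hz */
	unsigned int freq, period;

	freq = 1193182 / (param & 0xffff);		/* ~1000 Hz tone */
	period = ((param >> 16) & 0xffff) * hz / 1000;	/* 500 ticks, 500 ms at hz=1000 */
	printf("%u Hz for %u ticks\n", freq, period);
	return (0);
}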
*/ mx = vd->vd_mx_drawn + vd->vd_curwindow->vw_draw_area.tr_begin.tp_col; my = vd->vd_my_drawn + vd->vd_curwindow->vw_draw_area.tr_begin.tp_row; if (mx >= area->tr_end.tp_col || mx + vd->vd_mcursor->width <= area->tr_begin.tp_col || my >= area->tr_end.tp_row || my + vd->vd_mcursor->height <= area->tr_begin.tp_row) return (0); return (1); } static void vt_mark_mouse_position_as_dirty(struct vt_device *vd, int locked) { term_rect_t area; struct vt_window *vw; struct vt_font *vf; int x, y; vw = vd->vd_curwindow; vf = vw->vw_font; x = vd->vd_mx_drawn; y = vd->vd_my_drawn; if (vf != NULL) { area.tr_begin.tp_col = x / vf->vf_width; area.tr_begin.tp_row = y / vf->vf_height; area.tr_end.tp_col = ((x + vd->vd_mcursor->width) / vf->vf_width) + 1; area.tr_end.tp_row = ((y + vd->vd_mcursor->height) / vf->vf_height) + 1; } else { /* * No font loaded (ie. vt_vga operating in textmode). * * FIXME: This fake area needs to be revisited once the * mouse cursor is supported in vt_vga's textmode. */ area.tr_begin.tp_col = x; area.tr_begin.tp_row = y; area.tr_end.tp_col = x + 2; area.tr_end.tp_row = y + 2; } if (!locked) vtbuf_lock(&vw->vw_buf); if (vd->vd_driver->vd_invalidate_text) vd->vd_driver->vd_invalidate_text(vd, &area); vtbuf_dirty(&vw->vw_buf, &area); if (!locked) vtbuf_unlock(&vw->vw_buf); } #endif static void vt_set_border(struct vt_device *vd, const term_rect_t *area, const term_color_t c) { vd_drawrect_t *drawrect = vd->vd_driver->vd_drawrect; if (drawrect == NULL) return; /* Top bar */ if (area->tr_begin.tp_row > 0) drawrect(vd, 0, 0, vd->vd_width - 1, area->tr_begin.tp_row - 1, 1, c); /* Left bar */ if (area->tr_begin.tp_col > 0) drawrect(vd, 0, area->tr_begin.tp_row, area->tr_begin.tp_col - 1, area->tr_end.tp_row - 1, 1, c); /* Right bar */ if (area->tr_end.tp_col < vd->vd_width) drawrect(vd, area->tr_end.tp_col, area->tr_begin.tp_row, vd->vd_width - 1, area->tr_end.tp_row - 1, 1, c); /* Bottom bar */ if (area->tr_end.tp_row < vd->vd_height) drawrect(vd, 0, area->tr_end.tp_row, vd->vd_width - 1, vd->vd_height - 1, 1, c); } static int vt_flush(struct vt_device *vd) { struct vt_window *vw; struct vt_font *vf; term_rect_t tarea; #ifndef SC_NO_CUTPASTE int cursor_was_shown, cursor_moved; #endif vw = vd->vd_curwindow; if (vw == NULL) return (0); if (vd->vd_flags & VDF_SPLASH || vw->vw_flags & VWF_BUSY) return (0); vf = vw->vw_font; if (((vd->vd_flags & VDF_TEXTMODE) == 0) && (vf == NULL)) return (0); vtbuf_lock(&vw->vw_buf); #ifndef SC_NO_CUTPASTE cursor_was_shown = vd->vd_mshown; cursor_moved = (vd->vd_mx != vd->vd_mx_drawn || vd->vd_my != vd->vd_my_drawn); /* Check if the cursor should be displayed or not. */ if ((vd->vd_flags & VDF_MOUSECURSOR) && /* Mouse support enabled. */ !(vw->vw_flags & VWF_MOUSE_HIDE) && /* Cursor displayed. */ !kdb_active && panicstr == NULL) { /* DDB inactive. */ vd->vd_mshown = 1; } else { vd->vd_mshown = 0; } /* * If the cursor changed display state or moved, we must mark * the old position as dirty, so that it's erased. */ if (cursor_was_shown != vd->vd_mshown || (vd->vd_mshown && cursor_moved)) vt_mark_mouse_position_as_dirty(vd, true); /* * Save position of the mouse cursor. It's used by backends to * know where to draw the cursor and during the next refresh to * erase the previous position. */ vd->vd_mx_drawn = vd->vd_mx; vd->vd_my_drawn = vd->vd_my; /* * If the cursor is displayed and has moved since last refresh, * mark the new position as dirty. 
*/ if (vd->vd_mshown && cursor_moved) vt_mark_mouse_position_as_dirty(vd, true); #endif vtbuf_undirty(&vw->vw_buf, &tarea); /* Force a full redraw when the screen contents might be invalid. */ if (vd->vd_flags & (VDF_INVALID | VDF_SUSPENDED)) { vd->vd_flags &= ~VDF_INVALID; vt_set_border(vd, &vw->vw_draw_area, TC_BLACK); vt_termrect(vd, vf, &tarea); if (vd->vd_driver->vd_invalidate_text) vd->vd_driver->vd_invalidate_text(vd, &tarea); if (vt_draw_logo_cpus) vtterm_draw_cpu_logos(vd); } if (tarea.tr_begin.tp_col < tarea.tr_end.tp_col) { vd->vd_driver->vd_bitblt_text(vd, vw, &tarea); vtbuf_unlock(&vw->vw_buf); return (1); } vtbuf_unlock(&vw->vw_buf); return (0); } static void vt_timer(void *arg) { struct vt_device *vd; int changed; vd = arg; /* Update screen if required. */ changed = vt_flush(vd); /* Schedule for next update. */ if (changed) vt_schedule_flush(vd, 0); else vd->vd_timer_armed = 0; } static void vtterm_pre_input(struct terminal *tm) { struct vt_window *vw = tm->tm_softc; vtbuf_lock(&vw->vw_buf); } static void vtterm_post_input(struct terminal *tm) { struct vt_window *vw = tm->tm_softc; vtbuf_unlock(&vw->vw_buf); vt_resume_flush_timer(vw, 0); } static void vtterm_done(struct terminal *tm) { struct vt_window *vw = tm->tm_softc; struct vt_device *vd = vw->vw_device; if (kdb_active || panicstr != NULL) { /* Switch to the debugger. */ if (vd->vd_curwindow != vw) { vd->vd_curwindow = vw; vd->vd_flags |= VDF_INVALID; if (vd->vd_driver->vd_postswitch) vd->vd_driver->vd_postswitch(vd); } vd->vd_flags &= ~VDF_SPLASH; vt_flush(vd); } else if (!(vd->vd_flags & VDF_ASYNC)) { vt_flush(vd); } } #ifdef DEV_SPLASH static void vtterm_splash(struct vt_device *vd) { vt_axis_t top, left; /* Display a nice boot splash. */ if (!(vd->vd_flags & VDF_TEXTMODE) && (boothowto & RB_MUTE)) { top = (vd->vd_height - vt_logo_height) / 2; left = (vd->vd_width - vt_logo_width) / 2; switch (vt_logo_depth) { case 1: /* XXX: Unhardcode colors! */ vd->vd_driver->vd_bitblt_bmp(vd, vd->vd_curwindow, vt_logo_image, NULL, vt_logo_width, vt_logo_height, left, top, TC_WHITE, TC_BLACK); } vd->vd_flags |= VDF_SPLASH; } } #endif static void vtterm_cnprobe(struct terminal *tm, struct consdev *cp) { struct vt_driver *vtd, **vtdlist, *vtdbest = NULL; struct vt_window *vw = tm->tm_softc; struct vt_device *vd = vw->vw_device; struct winsize wsz; term_attr_t attr; term_char_t c; if (!vty_enabled(VTY_VT)) return; if (vd->vd_flags & VDF_INITIALIZED) /* Initialization already done. */ return; SET_FOREACH(vtdlist, vt_drv_set) { vtd = *vtdlist; if (vtd->vd_probe == NULL) continue; if (vtd->vd_probe(vd) == CN_DEAD) continue; if ((vtdbest == NULL) || (vtd->vd_priority > vtdbest->vd_priority)) vtdbest = vtd; } if (vtdbest == NULL) { cp->cn_pri = CN_DEAD; vd->vd_flags |= VDF_DEAD; } else { vd->vd_driver = vtdbest; cp->cn_pri = vd->vd_driver->vd_init(vd); } /* Check if driver's vt_init return CN_DEAD. */ if (cp->cn_pri == CN_DEAD) { vd->vd_flags |= VDF_DEAD; } /* Initialize any early-boot keyboard drivers */ kbd_configure(KB_CONF_PROBE_ONLY); vd->vd_unit = atomic_fetchadd_int(&vt_unit, 1); vd->vd_windows[VT_CONSWINDOW] = vw; sprintf(cp->cn_name, "ttyv%r", VT_UNIT(vw)); /* Attach default font if not in TEXTMODE. */ if ((vd->vd_flags & VDF_TEXTMODE) == 0) { vw->vw_font = vtfont_ref(&vt_font_default); vt_compute_drawable_area(vw); } /* * The original screen size was faked (_VTDEFW x _VTDEFH). Now * that we have the real viewable size, fix it in the static * buffer. 
*/ if (vd->vd_width != 0 && vd->vd_height != 0) vt_termsize(vd, vw->vw_font, &vw->vw_buf.vb_scr_size); vtbuf_init_early(&vw->vw_buf); vt_winsize(vd, vw->vw_font, &wsz); c = (boothowto & RB_MUTE) == 0 ? TERMINAL_KERN_ATTR : TERMINAL_NORM_ATTR; attr.ta_format = TCHAR_FORMAT(c); attr.ta_fgcolor = TCHAR_FGCOLOR(c); attr.ta_bgcolor = TCHAR_BGCOLOR(c); terminal_set_winsize_blank(tm, &wsz, 1, &attr); if (vtdbest != NULL) { #ifdef DEV_SPLASH if (!vt_splash_cpu) vtterm_splash(vd); #endif vd->vd_flags |= VDF_INITIALIZED; } } static int vtterm_cngetc(struct terminal *tm) { struct vt_window *vw = tm->tm_softc; struct vt_device *vd = vw->vw_device; keyboard_t *kbd; u_int c; if (vw->vw_kbdsq && *vw->vw_kbdsq) return (*vw->vw_kbdsq++); /* Make sure the splash screen is not there. */ if (vd->vd_flags & VDF_SPLASH) { /* Remove splash */ vd->vd_flags &= ~VDF_SPLASH; /* Mark screen as invalid to force update */ vd->vd_flags |= VDF_INVALID; vt_flush(vd); } /* Stripped down keyboard handler. */ kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd == NULL) return (-1); /* Force keyboard input mode to K_XLATE */ vw->vw_kbdmode = K_XLATE; vt_update_kbd_mode(vw, kbd); /* Switch the keyboard to polling to make it work here. */ kbdd_poll(kbd, TRUE); c = kbdd_read_char(kbd, 0); kbdd_poll(kbd, FALSE); if (c & RELKEY) return (-1); if (vw->vw_flags & VWF_SCROLL) { vt_scrollmode_kbdevent(vw, c, 1/* Console mode */); vt_flush(vd); return (-1); } /* Stripped down handling of vt_kbdevent(), without locking, etc. */ if (c & SPCLKEY) { switch (c) { case SPCLKEY | SLK: vt_save_kbd_state(vw, kbd); if (vw->vw_kbdstate & SLKED) { /* Turn scrolling on. */ vw->vw_flags |= VWF_SCROLL; VTBUF_SLCK_ENABLE(&vw->vw_buf); } else { /* Turn scrolling off. */ vt_scroll(vw, 0, VHS_END); vw->vw_flags &= ~VWF_SCROLL; VTBUF_SLCK_DISABLE(&vw->vw_buf); } break; /* XXX: KDB can handle history. */ case SPCLKEY | FKEY | F(50): /* Arrow up. */ vw->vw_kbdsq = "\x1b[A"; break; case SPCLKEY | FKEY | F(58): /* Arrow down. */ vw->vw_kbdsq = "\x1b[B"; break; case SPCLKEY | FKEY | F(55): /* Arrow right. */ vw->vw_kbdsq = "\x1b[C"; break; case SPCLKEY | FKEY | F(53): /* Arrow left. */ vw->vw_kbdsq = "\x1b[D"; break; } /* Force refresh to make scrollback work. */ vt_flush(vd); } else if (KEYFLAGS(c) == 0) { return (KEYCHAR(c)); } if (vw->vw_kbdsq && *vw->vw_kbdsq) return (*vw->vw_kbdsq++); return (-1); } static void vtterm_cngrab(struct terminal *tm) { struct vt_device *vd; struct vt_window *vw; keyboard_t *kbd; vw = tm->tm_softc; vd = vw->vw_device; if (!cold) vt_window_switch(vw); kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd == NULL) return; if (vw->vw_grabbed++ > 0) return; /* * Make sure the keyboard is accessible even when the kbd device * driver is disabled. */ kbdd_enable(kbd); /* We shall always use the keyboard in the XLATE mode here. 
*/ vw->vw_prev_kbdmode = vw->vw_kbdmode; vw->vw_kbdmode = K_XLATE; vt_update_kbd_mode(vw, kbd); kbdd_poll(kbd, TRUE); } static void vtterm_cnungrab(struct terminal *tm) { struct vt_device *vd; struct vt_window *vw; keyboard_t *kbd; vw = tm->tm_softc; vd = vw->vw_device; kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd == NULL) return; if (--vw->vw_grabbed > 0) return; kbdd_poll(kbd, FALSE); vw->vw_kbdmode = vw->vw_prev_kbdmode; vt_update_kbd_mode(vw, kbd); kbdd_disable(kbd); } static void vtterm_opened(struct terminal *tm, int opened) { struct vt_window *vw = tm->tm_softc; struct vt_device *vd = vw->vw_device; VT_LOCK(vd); vd->vd_flags &= ~VDF_SPLASH; if (opened) vw->vw_flags |= VWF_OPENED; else { vw->vw_flags &= ~VWF_OPENED; /* TODO: finish ACQ/REL */ } VT_UNLOCK(vd); } static int vt_change_font(struct vt_window *vw, struct vt_font *vf) { struct vt_device *vd = vw->vw_device; struct terminal *tm = vw->vw_terminal; term_pos_t size; struct winsize wsz; /* * Changing fonts. * * Changing fonts is a little tricky. We must prevent * simultaneous access to the device, so we must stop * the display timer and the terminal from accessing. * We need to switch fonts and grow our screen buffer. * * XXX: Right now the code uses terminal_mute() to * prevent data from reaching the console driver while * resizing the screen buffer. This isn't elegant... */ VT_LOCK(vd); if (vw->vw_flags & VWF_BUSY) { /* Another process is changing the font. */ VT_UNLOCK(vd); return (EBUSY); } vw->vw_flags |= VWF_BUSY; VT_UNLOCK(vd); vt_termsize(vd, vf, &size); vt_winsize(vd, vf, &wsz); /* Grow the screen buffer and terminal. */ terminal_mute(tm, 1); vtbuf_grow(&vw->vw_buf, &size, vw->vw_buf.vb_history_size); terminal_set_winsize_blank(tm, &wsz, 0, NULL); terminal_set_cursor(tm, &vw->vw_buf.vb_cursor); terminal_mute(tm, 0); /* Actually apply the font to the current window. */ VT_LOCK(vd); if (vw->vw_font != vf && vw->vw_font != NULL && vf != NULL) { /* * In case vt_change_font called to update size we don't need * to update font link. */ vtfont_unref(vw->vw_font); vw->vw_font = vtfont_ref(vf); } /* * Compute the drawable area and move the mouse cursor inside * it, in case the new area is smaller than the previous one. */ vt_compute_drawable_area(vw); vd->vd_mx = min(vd->vd_mx, vw->vw_draw_area.tr_end.tp_col - vw->vw_draw_area.tr_begin.tp_col - 1); vd->vd_my = min(vd->vd_my, vw->vw_draw_area.tr_end.tp_row - vw->vw_draw_area.tr_begin.tp_row - 1); /* Force a full redraw the next timer tick. 
*/ if (vd->vd_curwindow == vw) { vd->vd_flags |= VDF_INVALID; vt_resume_flush_timer(vw, 0); } vw->vw_flags &= ~VWF_BUSY; VT_UNLOCK(vd); return (0); } static int vt_proc_alive(struct vt_window *vw) { struct proc *p; if (vw->vw_smode.mode != VT_PROCESS) return (FALSE); if (vw->vw_proc) { if ((p = pfind(vw->vw_pid)) != NULL) PROC_UNLOCK(p); if (vw->vw_proc == p) return (TRUE); vw->vw_proc = NULL; vw->vw_smode.mode = VT_AUTO; DPRINTF(1, "vt controlling process %d died\n", vw->vw_pid); vw->vw_pid = 0; } return (FALSE); } static int signal_vt_rel(struct vt_window *vw) { if (vw->vw_smode.mode != VT_PROCESS) return (FALSE); if (vw->vw_proc == NULL || vt_proc_alive(vw) == FALSE) { vw->vw_proc = NULL; vw->vw_pid = 0; return (TRUE); } vw->vw_flags |= VWF_SWWAIT_REL; PROC_LOCK(vw->vw_proc); kern_psignal(vw->vw_proc, vw->vw_smode.relsig); PROC_UNLOCK(vw->vw_proc); DPRINTF(1, "sending relsig to %d\n", vw->vw_pid); return (TRUE); } static int signal_vt_acq(struct vt_window *vw) { if (vw->vw_smode.mode != VT_PROCESS) return (FALSE); if (vw == vw->vw_device->vd_windows[VT_CONSWINDOW]) cnavailable(vw->vw_terminal->consdev, FALSE); if (vw->vw_proc == NULL || vt_proc_alive(vw) == FALSE) { vw->vw_proc = NULL; vw->vw_pid = 0; return (TRUE); } vw->vw_flags |= VWF_SWWAIT_ACQ; PROC_LOCK(vw->vw_proc); kern_psignal(vw->vw_proc, vw->vw_smode.acqsig); PROC_UNLOCK(vw->vw_proc); DPRINTF(1, "sending acqsig to %d\n", vw->vw_pid); return (TRUE); } static int finish_vt_rel(struct vt_window *vw, int release, int *s) { if (vw->vw_flags & VWF_SWWAIT_REL) { vw->vw_flags &= ~VWF_SWWAIT_REL; if (release) { callout_drain(&vw->vw_proc_dead_timer); - vt_late_window_switch(vw->vw_switch_to); + (void)vt_late_window_switch(vw->vw_switch_to); } return (0); } return (EINVAL); } static int finish_vt_acq(struct vt_window *vw) { if (vw->vw_flags & VWF_SWWAIT_ACQ) { vw->vw_flags &= ~VWF_SWWAIT_ACQ; return (0); } return (EINVAL); } #ifndef SC_NO_CUTPASTE static void vt_mouse_terminput_button(struct vt_device *vd, int button) { struct vt_window *vw; struct vt_font *vf; char mouseb[6] = "\x1B[M"; int i, x, y; vw = vd->vd_curwindow; vf = vw->vw_font; /* Translate to char position. */ x = vd->vd_mx / vf->vf_width; y = vd->vd_my / vf->vf_height; /* Avoid overflow. */ x = MIN(x, 255 - '!'); y = MIN(y, 255 - '!'); mouseb[3] = ' ' + button; mouseb[4] = '!' + x; mouseb[5] = '!' + y; for (i = 0; i < sizeof(mouseb); i++) terminal_input_char(vw->vw_terminal, mouseb[i]); } static void vt_mouse_terminput(struct vt_device *vd, int type, int x, int y, int event, int cnt) { switch (type) { case MOUSE_BUTTON_EVENT: if (cnt > 0) { /* Mouse button pressed. */ if (event & MOUSE_BUTTON1DOWN) vt_mouse_terminput_button(vd, 0); if (event & MOUSE_BUTTON2DOWN) vt_mouse_terminput_button(vd, 1); if (event & MOUSE_BUTTON3DOWN) vt_mouse_terminput_button(vd, 2); } else { /* Mouse button released. */ vt_mouse_terminput_button(vd, 3); } break; #ifdef notyet case MOUSE_MOTION_EVENT: if (mouse->u.data.z < 0) { /* Scroll up. */ sc_mouse_input_button(vd, 64); } else if (mouse->u.data.z > 0) { /* Scroll down. 
*/ sc_mouse_input_button(vd, 65); } break; #endif } } static void vt_mouse_paste() { term_char_t *buf; int i, len; len = VD_PASTEBUFLEN(main_vd); buf = VD_PASTEBUF(main_vd); len /= sizeof(term_char_t); for (i = 0; i < len; i++) { if (buf[i] == '\0') continue; terminal_input_char(main_vd->vd_curwindow->vw_terminal, buf[i]); } } void vt_mouse_event(int type, int x, int y, int event, int cnt, int mlevel) { struct vt_device *vd; struct vt_window *vw; struct vt_font *vf; term_pos_t size; int len, mark; vd = main_vd; vw = vd->vd_curwindow; vf = vw->vw_font; mark = 0; if (vw->vw_flags & (VWF_MOUSE_HIDE | VWF_GRAPHICS)) /* * Either the mouse is disabled, or the window is in * "graphics mode". The graphics mode is usually set by * an X server, using the KDSETMODE ioctl. */ return; if (vf == NULL) /* Text mode. */ return; /* * TODO: add flag about pointer position changed, to not redraw chars * under mouse pointer when nothing changed. */ if (vw->vw_mouse_level > 0) vt_mouse_terminput(vd, type, x, y, event, cnt); switch (type) { case MOUSE_ACTION: case MOUSE_MOTION_EVENT: /* Movement */ x += vd->vd_mx; y += vd->vd_my; vt_termsize(vd, vf, &size); /* Apply limits. */ x = MAX(x, 0); y = MAX(y, 0); x = MIN(x, (size.tp_col * vf->vf_width) - 1); y = MIN(y, (size.tp_row * vf->vf_height) - 1); vd->vd_mx = x; vd->vd_my = y; if (vd->vd_mstate & MOUSE_BUTTON1DOWN) vtbuf_set_mark(&vw->vw_buf, VTB_MARK_MOVE, vd->vd_mx / vf->vf_width, vd->vd_my / vf->vf_height); vt_resume_flush_timer(vw, 0); return; /* Done */ case MOUSE_BUTTON_EVENT: /* Buttons */ break; default: return; /* Done */ } switch (event) { case MOUSE_BUTTON1DOWN: switch (cnt % 4) { case 0: /* up */ mark = VTB_MARK_END; break; case 1: /* single click: start cut operation */ mark = VTB_MARK_START; break; case 2: /* double click: cut a word */ mark = VTB_MARK_WORD; break; case 3: /* triple click: cut a line */ mark = VTB_MARK_ROW; break; } break; case VT_MOUSE_PASTEBUTTON: switch (cnt) { case 0: /* up */ break; default: vt_mouse_paste(); break; } return; /* Done */ case VT_MOUSE_EXTENDBUTTON: switch (cnt) { case 0: /* up */ if (!(vd->vd_mstate & MOUSE_BUTTON1DOWN)) mark = VTB_MARK_EXTEND; else mark = 0; break; default: mark = VTB_MARK_EXTEND; break; } break; default: return; /* Done */ } /* Save buttons state. */ if (cnt > 0) vd->vd_mstate |= event; else vd->vd_mstate &= ~event; if (vtbuf_set_mark(&vw->vw_buf, mark, vd->vd_mx / vf->vf_width, vd->vd_my / vf->vf_height) == 1) { /* * We have something marked to copy, so update pointer to * window with selection. */ vt_resume_flush_timer(vw, 0); switch (mark) { case VTB_MARK_END: case VTB_MARK_WORD: case VTB_MARK_ROW: case VTB_MARK_EXTEND: break; default: /* Other types of mark do not require to copy data. */ return; } /* Get current selection size in bytes. */ len = vtbuf_get_marked_len(&vw->vw_buf); if (len <= 0) return; /* Reallocate buffer only if old one is too small. */ if (len > VD_PASTEBUFSZ(vd)) { VD_PASTEBUF(vd) = realloc(VD_PASTEBUF(vd), len, M_VT, M_WAITOK | M_ZERO); /* Update buffer size. */ VD_PASTEBUFSZ(vd) = len; } /* Request copy/paste buffer data, no more than `len' */ vtbuf_extract_marked(&vw->vw_buf, VD_PASTEBUF(vd), VD_PASTEBUFSZ(vd)); VD_PASTEBUFLEN(vd) = len; /* XXX VD_PASTEBUF(vd) have to be freed on shutdown/unload. 
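The escape sequence that vt_mouse_terminput_button() builds above is the xterm-style X10 mouse report: ESC [ M, then the button number plus ' ', then the 0-based character column and row each offset by '!'. A left-button press at character cell (10, 5), for example, comes out as 0x1b '[' 'M' 0x20 '+' '&'. A small, self-contained encoder using those assumed coordinates:

/* Illustrative only: the report encoding used by vt_mouse_terminput_button(). */
#include <stdio.h>

static void
report(int button, int col, int row)
{
	unsigned char seq[6] = { 0x1b, '[', 'M', 0, 0, 0 };

	seq[3] = ' ' + button;	/* 0/1/2 = left/middle/right, 3 = release */
	seq[4] = '!' + col;	/* 0-based character column */
	seq[5] = '!' + row;	/* 0-based character row */
	fwrite(seq, 1, sizeof(seq), stdout);
}

int
main(void)
{
	report(0, 10, 5);	/* press at cell (10, 5) */
	report(3, 10, 5);	/* release */
	return (0);
}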
*/ } } void vt_mouse_state(int show) { struct vt_device *vd; struct vt_window *vw; vd = main_vd; vw = vd->vd_curwindow; switch (show) { case VT_MOUSE_HIDE: vw->vw_flags |= VWF_MOUSE_HIDE; break; case VT_MOUSE_SHOW: vw->vw_flags &= ~VWF_MOUSE_HIDE; break; } /* Mark mouse position as dirty. */ vt_mark_mouse_position_as_dirty(vd, false); vt_resume_flush_timer(vw, 0); } #endif static int vtterm_mmap(struct terminal *tm, vm_ooffset_t offset, vm_paddr_t * paddr, int nprot, vm_memattr_t *memattr) { struct vt_window *vw = tm->tm_softc; struct vt_device *vd = vw->vw_device; if (vd->vd_driver->vd_fb_mmap) return (vd->vd_driver->vd_fb_mmap(vd, offset, paddr, nprot, memattr)); return (ENXIO); } static int vtterm_ioctl(struct terminal *tm, u_long cmd, caddr_t data, struct thread *td) { struct vt_window *vw = tm->tm_softc; struct vt_device *vd = vw->vw_device; keyboard_t *kbd; int error, i, s; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) || defined(COMPAT_43) int ival; switch (cmd) { case _IO('v', 4): cmd = VT_RELDISP; break; case _IO('v', 5): cmd = VT_ACTIVATE; break; case _IO('v', 6): cmd = VT_WAITACTIVE; break; case _IO('K', 20): cmd = KDSKBSTATE; break; case _IO('K', 67): cmd = KDSETRAD; break; case _IO('K', 7): cmd = KDSKBMODE; break; case _IO('K', 8): cmd = KDMKTONE; break; case _IO('K', 10): cmd = KDSETMODE; break; case _IO('K', 13): cmd = KDSBORDER; break; case _IO('K', 63): cmd = KIOCSOUND; break; case _IO('K', 66): cmd = KDSETLED; break; case _IO('c', 104): cmd = CONS_SETWINORG; break; case _IO('c', 110): cmd = CONS_SETKBD; break; default: goto skip_thunk; } ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; skip_thunk: #endif switch (cmd) { case KDSETRAD: /* set keyboard repeat & delay rates (old) */ if (*(int *)data & ~0x7f) return (EINVAL); /* FALLTHROUGH */ case GIO_KEYMAP: case PIO_KEYMAP: case GIO_DEADKEYMAP: case PIO_DEADKEYMAP: case GETFKEY: case SETFKEY: case KDGKBINFO: case KDGKBTYPE: case KDGETREPEAT: /* get keyboard repeat & delay rates */ case KDSETREPEAT: /* set keyboard repeat & delay rates (new) */ case KBADDKBD: /* add/remove keyboard to/from mux */ case KBRELKBD: { error = 0; mtx_lock(&Giant); kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd != NULL) error = kbdd_ioctl(kbd, cmd, data); mtx_unlock(&Giant); if (error == ENOIOCTL) { if (cmd == KDGKBTYPE) { /* always return something? 
XXX */ *(int *)data = 0; } else { return (ENODEV); } } return (error); } case KDGKBSTATE: { /* get keyboard state (locks) */ error = 0; if (vw == vd->vd_curwindow) { mtx_lock(&Giant); kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd != NULL) error = vt_save_kbd_state(vw, kbd); mtx_unlock(&Giant); if (error != 0) return (error); } *(int *)data = vw->vw_kbdstate & LOCK_MASK; return (error); } case KDSKBSTATE: { /* set keyboard state (locks) */ int state; state = *(int *)data; if (state & ~LOCK_MASK) return (EINVAL); vw->vw_kbdstate &= ~LOCK_MASK; vw->vw_kbdstate |= state; error = 0; if (vw == vd->vd_curwindow) { mtx_lock(&Giant); kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd != NULL) error = vt_update_kbd_state(vw, kbd); mtx_unlock(&Giant); } return (error); } case KDGETLED: { /* get keyboard LED status */ error = 0; if (vw == vd->vd_curwindow) { mtx_lock(&Giant); kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd != NULL) error = vt_save_kbd_leds(vw, kbd); mtx_unlock(&Giant); if (error != 0) return (error); } *(int *)data = vw->vw_kbdstate & LED_MASK; return (error); } case KDSETLED: { /* set keyboard LED status */ int leds; leds = *(int *)data; if (leds & ~LED_MASK) return (EINVAL); vw->vw_kbdstate &= ~LED_MASK; vw->vw_kbdstate |= leds; error = 0; if (vw == vd->vd_curwindow) { mtx_lock(&Giant); kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd != NULL) error = vt_update_kbd_leds(vw, kbd); mtx_unlock(&Giant); } return (error); } case KDGETMODE: *(int *)data = (vw->vw_flags & VWF_GRAPHICS) ? KD_GRAPHICS : KD_TEXT; return (0); case KDGKBMODE: { error = 0; if (vw == vd->vd_curwindow) { mtx_lock(&Giant); kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd != NULL) error = vt_save_kbd_mode(vw, kbd); mtx_unlock(&Giant); if (error != 0) return (error); } *(int *)data = vw->vw_kbdmode; return (error); } case KDSKBMODE: { int mode; mode = *(int *)data; switch (mode) { case K_XLATE: case K_RAW: case K_CODE: vw->vw_kbdmode = mode; error = 0; if (vw == vd->vd_curwindow) { mtx_lock(&Giant); kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd != NULL) error = vt_update_kbd_mode(vw, kbd); mtx_unlock(&Giant); } return (error); default: return (EINVAL); } } case FBIOGTYPE: case FBIO_GETWINORG: /* get frame buffer window origin */ case FBIO_GETDISPSTART: /* get display start address */ case FBIO_GETLINEWIDTH: /* get scan line width in bytes */ case FBIO_BLANK: /* blank display */ if (vd->vd_driver->vd_fb_ioctl) return (vd->vd_driver->vd_fb_ioctl(vd, cmd, data, td)); break; case CONS_BLANKTIME: /* XXX */ return (0); case CONS_HISTORY: if (*(int *)data < 0) return EINVAL; if (*(int *)data != vd->vd_curwindow->vw_buf.vb_history_size) vtbuf_sethistory_size(&vd->vd_curwindow->vw_buf, *(int *)data); return 0; case CONS_GET: /* XXX */ *(int *)data = M_CG640x480; return (0); case CONS_BELLTYPE: /* set bell type sound */ if ((*(int *)data) & CONS_QUIET_BELL) vd->vd_flags |= VDF_QUIET_BELL; else vd->vd_flags &= ~VDF_QUIET_BELL; return (0); case CONS_GETINFO: { vid_info_t *vi = (vid_info_t *)data; if (vi->size != sizeof(struct vid_info)) return (EINVAL); if (vw == vd->vd_curwindow) { mtx_lock(&Giant); kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd != NULL) vt_save_kbd_state(vw, kbd); mtx_unlock(&Giant); } vi->m_num = vd->vd_curwindow->vw_number + 1; vi->mk_keylock = vw->vw_kbdstate & LOCK_MASK; /* XXX: other fields! 
*/ return (0); } case CONS_GETVERS: *(int *)data = 0x200; return (0); case CONS_MODEINFO: /* XXX */ return (0); case CONS_MOUSECTL: { mouse_info_t *mouse = (mouse_info_t*)data; /* * All the commands except MOUSE_SHOW nd MOUSE_HIDE * should not be applied to individual TTYs, but only to * consolectl. */ switch (mouse->operation) { case MOUSE_HIDE: if (vd->vd_flags & VDF_MOUSECURSOR) { vd->vd_flags &= ~VDF_MOUSECURSOR; #ifndef SC_NO_CUTPASTE vt_mouse_state(VT_MOUSE_HIDE); #endif } return (0); case MOUSE_SHOW: if (!(vd->vd_flags & VDF_MOUSECURSOR)) { vd->vd_flags |= VDF_MOUSECURSOR; vd->vd_mx = vd->vd_width / 2; vd->vd_my = vd->vd_height / 2; #ifndef SC_NO_CUTPASTE vt_mouse_state(VT_MOUSE_SHOW); #endif } return (0); default: return (EINVAL); } } case PIO_VFONT: { struct vt_font *vf; if (vd->vd_flags & VDF_TEXTMODE) return (ENOTSUP); error = vtfont_load((void *)data, &vf); if (error != 0) return (error); error = vt_change_font(vw, vf); vtfont_unref(vf); return (error); } case PIO_VFONT_DEFAULT: { /* Reset to default font. */ error = vt_change_font(vw, &vt_font_default); return (error); } case GIO_SCRNMAP: { scrmap_t *sm = (scrmap_t *)data; /* We don't have screen maps, so return a handcrafted one. */ for (i = 0; i < 256; i++) sm->scrmap[i] = i; return (0); } case KDSETMODE: /* * FIXME: This implementation is incomplete compared to * syscons. */ switch (*(int *)data) { case KD_TEXT: case KD_TEXT1: case KD_PIXEL: vw->vw_flags &= ~VWF_GRAPHICS; break; case KD_GRAPHICS: vw->vw_flags |= VWF_GRAPHICS; break; } return (0); case KDENABIO: /* allow io operations */ error = priv_check(td, PRIV_IO); if (error != 0) return (error); error = securelevel_gt(td->td_ucred, 0); if (error != 0) return (error); #if defined(__i386__) td->td_frame->tf_eflags |= PSL_IOPL; #elif defined(__amd64__) td->td_frame->tf_rflags |= PSL_IOPL; #endif return (0); case KDDISABIO: /* disallow io operations (default) */ #if defined(__i386__) td->td_frame->tf_eflags &= ~PSL_IOPL; #elif defined(__amd64__) td->td_frame->tf_rflags &= ~PSL_IOPL; #endif return (0); case KDMKTONE: /* sound the bell */ vtterm_beep(tm, *(u_int *)data); return (0); case KIOCSOUND: /* make tone (*data) hz */ /* TODO */ return (0); case CONS_SETKBD: /* set the new keyboard */ mtx_lock(&Giant); error = 0; if (vd->vd_keyboard != *(int *)data) { kbd = kbd_get_keyboard(*(int *)data); if (kbd == NULL) { mtx_unlock(&Giant); return (EINVAL); } i = kbd_allocate(kbd->kb_name, kbd->kb_unit, (void *)vd, vt_kbdevent, vd); if (i >= 0) { if (vd->vd_keyboard != -1) { kbd = kbd_get_keyboard(vd->vd_keyboard); vt_save_kbd_state(vd->vd_curwindow, kbd); kbd_release(kbd, (void *)vd); } kbd = kbd_get_keyboard(i); vd->vd_keyboard = i; vt_update_kbd_mode(vd->vd_curwindow, kbd); vt_update_kbd_state(vd->vd_curwindow, kbd); } else { error = EPERM; /* XXX */ } } mtx_unlock(&Giant); return (error); case CONS_RELKBD: /* release the current keyboard */ mtx_lock(&Giant); error = 0; if (vd->vd_keyboard != -1) { kbd = kbd_get_keyboard(vd->vd_keyboard); if (kbd == NULL) { mtx_unlock(&Giant); return (EINVAL); } vt_save_kbd_state(vd->vd_curwindow, kbd); error = kbd_release(kbd, (void *)vd); if (error == 0) { vd->vd_keyboard = -1; } } mtx_unlock(&Giant); return (error); case VT_ACTIVATE: { int win; win = *(int *)data - 1; DPRINTF(5, "%s%d: VT_ACTIVATE ttyv%d ", SC_DRIVER_NAME, VT_UNIT(vw), win); if ((win >= VT_MAXWINDOWS) || (win < 0)) return (EINVAL); return (vt_proc_window_switch(vd->vd_windows[win])); } case VT_GETACTIVE: *(int *)data = vd->vd_curwindow->vw_number + 1; return (0); case 
VT_GETINDEX: *(int *)data = vw->vw_number + 1; return (0); case VT_LOCKSWITCH: /* TODO: Check current state, switching can be in progress. */ if ((*(int *)data) == 0x01) vw->vw_flags |= VWF_VTYLOCK; else if ((*(int *)data) == 0x02) vw->vw_flags &= ~VWF_VTYLOCK; else return (EINVAL); return (0); case VT_OPENQRY: VT_LOCK(vd); for (i = 0; i < VT_MAXWINDOWS; i++) { vw = vd->vd_windows[i]; if (vw == NULL) continue; if (!(vw->vw_flags & VWF_OPENED)) { *(int *)data = vw->vw_number + 1; VT_UNLOCK(vd); return (0); } } VT_UNLOCK(vd); return (EINVAL); case VT_WAITACTIVE: { unsigned int idx; error = 0; idx = *(unsigned int *)data; if (idx > VT_MAXWINDOWS) return (EINVAL); if (idx > 0) vw = vd->vd_windows[idx - 1]; VT_LOCK(vd); while (vd->vd_curwindow != vw && error == 0) error = cv_wait_sig(&vd->vd_winswitch, &vd->vd_lock); VT_UNLOCK(vd); return (error); } case VT_SETMODE: { /* set screen switcher mode */ struct vt_mode *mode; struct proc *p1; mode = (struct vt_mode *)data; DPRINTF(5, "%s%d: VT_SETMODE ", SC_DRIVER_NAME, VT_UNIT(vw)); if (vw->vw_smode.mode == VT_PROCESS) { p1 = pfind(vw->vw_pid); if (vw->vw_proc == p1 && vw->vw_proc != td->td_proc) { if (p1) PROC_UNLOCK(p1); DPRINTF(5, "error EPERM\n"); return (EPERM); } if (p1) PROC_UNLOCK(p1); } if (mode->mode == VT_AUTO) { vw->vw_smode.mode = VT_AUTO; vw->vw_proc = NULL; vw->vw_pid = 0; DPRINTF(5, "VT_AUTO, "); if (vw == vw->vw_device->vd_windows[VT_CONSWINDOW]) cnavailable(vw->vw_terminal->consdev, TRUE); /* were we in the middle of the vty switching process? */ if (finish_vt_rel(vw, TRUE, &s) == 0) DPRINTF(5, "reset WAIT_REL, "); if (finish_vt_acq(vw) == 0) DPRINTF(5, "reset WAIT_ACQ, "); return (0); } else if (mode->mode == VT_PROCESS) { if (!ISSIGVALID(mode->relsig) || !ISSIGVALID(mode->acqsig) || !ISSIGVALID(mode->frsig)) { DPRINTF(5, "error EINVAL\n"); return (EINVAL); } DPRINTF(5, "VT_PROCESS %d, ", td->td_proc->p_pid); bcopy(data, &vw->vw_smode, sizeof(struct vt_mode)); vw->vw_proc = td->td_proc; vw->vw_pid = vw->vw_proc->p_pid; if (vw == vw->vw_device->vd_windows[VT_CONSWINDOW]) cnavailable(vw->vw_terminal->consdev, FALSE); } else { DPRINTF(5, "VT_SETMODE failed, unknown mode %d\n", mode->mode); return (EINVAL); } DPRINTF(5, "\n"); return (0); } case VT_GETMODE: /* get screen switcher mode */ bcopy(&vw->vw_smode, data, sizeof(struct vt_mode)); return (0); case VT_RELDISP: /* screen switcher ioctl */ /* * This must be the current vty which is in the VT_PROCESS * switching mode... */ if ((vw != vd->vd_curwindow) || (vw->vw_smode.mode != VT_PROCESS)) { return (EINVAL); } /* ...and this process is controlling it. */ if (vw->vw_proc != td->td_proc) { return (EPERM); } error = EINVAL; switch(*(int *)data) { case VT_FALSE: /* user refuses to release screen, abort */ if ((error = finish_vt_rel(vw, FALSE, &s)) == 0) DPRINTF(5, "%s%d: VT_RELDISP: VT_FALSE\n", SC_DRIVER_NAME, VT_UNIT(vw)); break; case VT_TRUE: /* user has released screen, go on */ /* finish_vt_rel(..., TRUE, ...) 
should not be locked */ if (vw->vw_flags & VWF_SWWAIT_REL) { if ((error = finish_vt_rel(vw, TRUE, &s)) == 0) DPRINTF(5, "%s%d: VT_RELDISP: VT_TRUE\n", SC_DRIVER_NAME, VT_UNIT(vw)); } else { error = EINVAL; } return (error); case VT_ACKACQ: /* acquire acknowledged, switch completed */ if ((error = finish_vt_acq(vw)) == 0) DPRINTF(5, "%s%d: VT_RELDISP: VT_ACKACQ\n", SC_DRIVER_NAME, VT_UNIT(vw)); break; default: break; } return (error); } return (ENOIOCTL); } static struct vt_window * vt_allocate_window(struct vt_device *vd, unsigned int window) { struct vt_window *vw; struct terminal *tm; term_pos_t size; struct winsize wsz; vw = malloc(sizeof *vw, M_VT, M_WAITOK|M_ZERO); vw->vw_device = vd; vw->vw_number = window; vw->vw_kbdmode = K_XLATE; if ((vd->vd_flags & VDF_TEXTMODE) == 0) { vw->vw_font = vtfont_ref(&vt_font_default); vt_compute_drawable_area(vw); } vt_termsize(vd, vw->vw_font, &size); vt_winsize(vd, vw->vw_font, &wsz); vtbuf_init(&vw->vw_buf, &size); tm = vw->vw_terminal = terminal_alloc(&vt_termclass, vw); terminal_set_winsize(tm, &wsz); vd->vd_windows[window] = vw; callout_init(&vw->vw_proc_dead_timer, 0); return (vw); } void vt_upgrade(struct vt_device *vd) { struct vt_window *vw; unsigned int i; int register_handlers; if (!vty_enabled(VTY_VT)) return; if (main_vd->vd_driver == NULL) return; for (i = 0; i < VT_MAXWINDOWS; i++) { vw = vd->vd_windows[i]; if (vw == NULL) { /* New window. */ vw = vt_allocate_window(vd, i); } if (!(vw->vw_flags & VWF_READY)) { callout_init(&vw->vw_proc_dead_timer, 0); terminal_maketty(vw->vw_terminal, "v%r", VT_UNIT(vw)); vw->vw_flags |= VWF_READY; if (vw->vw_flags & VWF_CONSOLE) { /* For existing console window. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, vt_window_switch, vw, SHUTDOWN_PRI_DEFAULT); } } } VT_LOCK(vd); if (vd->vd_curwindow == NULL) vd->vd_curwindow = vd->vd_windows[VT_CONSWINDOW]; register_handlers = 0; if (!(vd->vd_flags & VDF_ASYNC)) { /* Attach keyboard. */ vt_allocate_keyboard(vd); /* Init 25 Hz timer. */ callout_init_mtx(&vd->vd_timer, &vd->vd_lock, 0); /* * Start timer when everything ready. * Note that the operations here are purposefully ordered. * We need to ensure vd_timer_armed is non-zero before we set * the VDF_ASYNC flag. That prevents this function from * racing with vt_resume_flush_timer() to update the * callout structure. */ atomic_add_acq_int(&vd->vd_timer_armed, 1); vd->vd_flags |= VDF_ASYNC; callout_reset(&vd->vd_timer, hz / VT_TIMERFREQ, vt_timer, vd); register_handlers = 1; } VT_UNLOCK(vd); /* Refill settings with new sizes. */ vt_resize(vd); if (register_handlers) { /* Register suspend/resume handlers. */ EVENTHANDLER_REGISTER(power_suspend_early, vt_suspend_handler, vd, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(power_resume, vt_resume_handler, vd, EVENTHANDLER_PRI_ANY); } } static void vt_resize(struct vt_device *vd) { struct vt_window *vw; int i; for (i = 0; i < VT_MAXWINDOWS; i++) { vw = vd->vd_windows[i]; VT_LOCK(vd); /* Assign default font to window, if not textmode. */ if (!(vd->vd_flags & VDF_TEXTMODE) && vw->vw_font == NULL) vw->vw_font = vtfont_ref(&vt_font_default); VT_UNLOCK(vd); /* Resize terminal windows */ while (vt_change_font(vw, vw->vw_font) == EBUSY) { DPRINTF(100, "%s: vt_change_font() is busy, " "window %d\n", __func__, i); } } } static void vt_replace_backend(const struct vt_driver *drv, void *softc) { struct vt_device *vd; vd = main_vd; if (vd->vd_flags & VDF_ASYNC) { /* Stop vt_flush periodic task. 
		 */
		VT_LOCK(vd);
		vt_suspend_flush_timer(vd);
		VT_UNLOCK(vd);

		/*
		 * Mute the current terminal until we are done.
		 * vt_change_font (called from vt_resize) will unmute it.
		 */
		terminal_mute(vd->vd_curwindow->vw_terminal, 1);
	}

	/*
	 * Reset the VDF_TEXTMODE flag; drivers that require it (vt_vga)
	 * will set it.
	 */
	VT_LOCK(vd);
	vd->vd_flags &= ~VDF_TEXTMODE;

	if (drv != NULL) {
		/*
		 * We want to upgrade from the current driver to the
		 * given driver.
		 */
		vd->vd_prev_driver = vd->vd_driver;
		vd->vd_prev_softc = vd->vd_softc;
		vd->vd_driver = drv;
		vd->vd_softc = softc;

		vd->vd_driver->vd_init(vd);
	} else if (vd->vd_prev_driver != NULL && vd->vd_prev_softc != NULL) {
		/*
		 * No driver given: we want to downgrade to the previous
		 * driver.
		 */
		const struct vt_driver *old_drv;
		void *old_softc;

		old_drv = vd->vd_driver;
		old_softc = vd->vd_softc;

		vd->vd_driver = vd->vd_prev_driver;
		vd->vd_softc = vd->vd_prev_softc;
		vd->vd_prev_driver = NULL;
		vd->vd_prev_softc = NULL;

		vd->vd_flags |= VDF_DOWNGRADE;

		vd->vd_driver->vd_init(vd);

		if (old_drv->vd_fini)
			old_drv->vd_fini(vd, old_softc);

		vd->vd_flags &= ~VDF_DOWNGRADE;
	}

	VT_UNLOCK(vd);

	/* Update window sizes and initialize the remaining items. */
	vt_upgrade(vd);

#ifdef DEV_SPLASH
	if (vd->vd_flags & VDF_SPLASH)
		vtterm_splash(vd);
#endif

	if (vd->vd_flags & VDF_ASYNC) {
		/* Allow characters to be output again. */
		terminal_mute(vd->vd_curwindow->vw_terminal, 0);

		/* Rerun timer for screen updates. */
		vt_resume_flush_timer(vd->vd_curwindow, 0);
	}

	/*
	 * Register as console. If it is already registered, cnadd() will
	 * ignore it.
	 */
	termcn_cnregister(vd->vd_windows[VT_CONSWINDOW]->vw_terminal);
}

static void
vt_suspend_handler(void *priv)
{
	struct vt_device *vd;

	vd = priv;
	vd->vd_flags |= VDF_SUSPENDED;
	if (vd->vd_driver != NULL && vd->vd_driver->vd_suspend != NULL)
		vd->vd_driver->vd_suspend(vd);
}

static void
vt_resume_handler(void *priv)
{
	struct vt_device *vd;

	vd = priv;
	if (vd->vd_driver != NULL && vd->vd_driver->vd_resume != NULL)
		vd->vd_driver->vd_resume(vd);
	vd->vd_flags &= ~VDF_SUSPENDED;
}

void
vt_allocate(const struct vt_driver *drv, void *softc)
{

	if (!vty_enabled(VTY_VT))
		return;

	if (main_vd->vd_driver == NULL) {
		main_vd->vd_driver = drv;
		printf("VT: initialize with new VT driver \"%s\".\n",
		    drv->vd_name);
	} else {
		/*
		 * Check whether we have the right to replace the current
		 * driver. For example, it is a bad idea to replace a KMS
		 * driver with the generic VGA one.
		 */
		if (drv->vd_priority <= main_vd->vd_driver->vd_priority) {
			printf("VT: Driver priority %d too low. Current %d\n ",
			    drv->vd_priority, main_vd->vd_driver->vd_priority);
			return;
		}
		printf("VT: Replacing driver \"%s\" with new \"%s\".\n",
		    main_vd->vd_driver->vd_name, drv->vd_name);
	}

	vt_replace_backend(drv, softc);
}

void
vt_deallocate(const struct vt_driver *drv, void *softc)
{

	if (!vty_enabled(VTY_VT))
		return;

	if (main_vd->vd_prev_driver == NULL ||
	    main_vd->vd_driver != drv ||
	    main_vd->vd_softc != softc)
		return;

	printf("VT: Switching back from \"%s\" to \"%s\".\n",
	    main_vd->vd_driver->vd_name, main_vd->vd_prev_driver->vd_name);

	vt_replace_backend(NULL, NULL);
}

void
vt_suspend(struct vt_device *vd)
{
	int error;

	if (vt_suspendswitch == 0)
		return;
	/* Save the current window. */
	vd->vd_savedwindow = vd->vd_curwindow;
	/* Ask the holding process to free the window and switch to the console window. */
	vt_proc_window_switch(vd->vd_windows[VT_CONSWINDOW]);

	/* Wait for the window switch to complete.
*/ error = 0; VT_LOCK(vd); while (vd->vd_curwindow != vd->vd_windows[VT_CONSWINDOW] && error == 0) error = cv_wait_sig(&vd->vd_winswitch, &vd->vd_lock); VT_UNLOCK(vd); } void vt_resume(struct vt_device *vd) { if (vt_suspendswitch == 0) return; /* Switch back to saved window, if any */ vt_proc_window_switch(vd->vd_savedwindow); vd->vd_savedwindow = NULL; } Index: projects/clang900-import/sys/fs/nfsclient/nfs_clport.c =================================================================== --- projects/clang900-import/sys/fs/nfsclient/nfs_clport.c (revision 352536) +++ projects/clang900-import/sys/fs/nfsclient/nfs_clport.c (revision 352537) @@ -1,1387 +1,1390 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include /* * generally, I don't like #includes inside .h files, but it seems to * be the easiest way to handle the port. 
*/ #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS dtrace_nfsclient_attrcache_flush_probe_func_t dtrace_nfscl_attrcache_flush_done_probe; uint32_t nfscl_attrcache_flush_done_id; dtrace_nfsclient_attrcache_get_hit_probe_func_t dtrace_nfscl_attrcache_get_hit_probe; uint32_t nfscl_attrcache_get_hit_id; dtrace_nfsclient_attrcache_get_miss_probe_func_t dtrace_nfscl_attrcache_get_miss_probe; uint32_t nfscl_attrcache_get_miss_id; dtrace_nfsclient_attrcache_load_probe_func_t dtrace_nfscl_attrcache_load_done_probe; uint32_t nfscl_attrcache_load_done_id; #endif /* !KDTRACE_HOOKS */ extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1; extern struct vop_vector newnfs_vnodeops; extern struct vop_vector newnfs_fifoops; extern uma_zone_t newnfsnode_zone; extern struct buf_ops buf_ops_newnfs; extern uma_zone_t ncl_pbuf_zone; extern short nfsv4_cbport; extern int nfscl_enablecallb; extern int nfs_numnfscbd; extern int nfscl_inited; struct mtx ncl_iod_mutex; NFSDLOCKMUTEX; extern struct mtx nfsrv_dslock_mtx; extern void (*ncl_call_invalcaches)(struct vnode *); SYSCTL_DECL(_vfs_nfs); static int ncl_fileid_maxwarnings = 10; SYSCTL_INT(_vfs_nfs, OID_AUTO, fileid_maxwarnings, CTLFLAG_RWTUN, &ncl_fileid_maxwarnings, 0, "Limit fileid corruption warnings; 0 is off; -1 is unlimited"); static volatile int ncl_fileid_nwarnings; static void nfscl_warn_fileid(struct nfsmount *, struct nfsvattr *, struct nfsvattr *); /* * Comparison function for vfs_hash functions. */ int newnfs_vncmpf(struct vnode *vp, void *arg) { struct nfsfh *nfhp = (struct nfsfh *)arg; struct nfsnode *np = VTONFS(vp); if (np->n_fhp->nfh_len != nfhp->nfh_len || NFSBCMP(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len)) return (1); return (0); } /* * Look up a vnode/nfsnode by file handle. * Callers must check for mount points!! * In all cases, a pointer to a * nfsnode structure is returned. * This variant takes a "struct nfsfh *" as second argument and uses * that structure up, either by hanging off the nfsnode or FREEing it. */ int nfscl_nget(struct mount *mntp, struct vnode *dvp, struct nfsfh *nfhp, struct componentname *cnp, struct thread *td, struct nfsnode **npp, void *stuff, int lkflags) { struct nfsnode *np, *dnp; struct vnode *vp, *nvp; struct nfsv4node *newd, *oldd; int error; u_int hash; struct nfsmount *nmp; nmp = VFSTONFS(mntp); dnp = VTONFS(dvp); *npp = NULL; hash = fnv_32_buf(nfhp->nfh_fh, nfhp->nfh_len, FNV1_32_INIT); error = vfs_hash_get(mntp, hash, lkflags, td, &nvp, newnfs_vncmpf, nfhp); if (error == 0 && nvp != NULL) { /* * I believe there is a slight chance that vgonel() could * get called on this vnode between when NFSVOPLOCK() drops * the VI_LOCK() and vget() acquires it again, so that it * hasn't yet had v_usecount incremented. If this were to * happen, the VI_DOOMED flag would be set, so check for * that here. Since we now have the v_usecount incremented, * we should be ok until we vrele() it, if the VI_DOOMED * flag isn't set now. */ VI_LOCK(nvp); if ((nvp->v_iflag & VI_DOOMED)) { VI_UNLOCK(nvp); vrele(nvp); error = ENOENT; } else { VI_UNLOCK(nvp); } } if (error) { free(nfhp, M_NFSFH); return (error); } if (nvp != NULL) { np = VTONFS(nvp); /* * For NFSv4, check to see if it is the same name and * replace the name, if it is different. 
*/ oldd = newd = NULL; if ((nmp->nm_flag & NFSMNT_NFSV4) && np->n_v4 != NULL && nvp->v_type == VREG && (np->n_v4->n4_namelen != cnp->cn_namelen || NFSBCMP(cnp->cn_nameptr, NFS4NODENAME(np->n_v4), cnp->cn_namelen) || dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen || NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data, dnp->n_fhp->nfh_len))) { newd = malloc( sizeof (struct nfsv4node) + dnp->n_fhp->nfh_len + + cnp->cn_namelen - 1, M_NFSV4NODE, M_WAITOK); NFSLOCKNODE(np); if (newd != NULL && np->n_v4 != NULL && nvp->v_type == VREG && (np->n_v4->n4_namelen != cnp->cn_namelen || NFSBCMP(cnp->cn_nameptr, NFS4NODENAME(np->n_v4), cnp->cn_namelen) || dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen || NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data, dnp->n_fhp->nfh_len))) { oldd = np->n_v4; np->n_v4 = newd; newd = NULL; np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len; np->n_v4->n4_namelen = cnp->cn_namelen; NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data, dnp->n_fhp->nfh_len); NFSBCOPY(cnp->cn_nameptr, NFS4NODENAME(np->n_v4), cnp->cn_namelen); } NFSUNLOCKNODE(np); } if (newd != NULL) free(newd, M_NFSV4NODE); if (oldd != NULL) free(oldd, M_NFSV4NODE); *npp = np; free(nfhp, M_NFSFH); return (0); } np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO); error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp); if (error) { uma_zfree(newnfsnode_zone, np); free(nfhp, M_NFSFH); return (error); } vp = nvp; KASSERT(vp->v_bufobj.bo_bsize != 0, ("nfscl_nget: bo_bsize == 0")); vp->v_bufobj.bo_ops = &buf_ops_newnfs; vp->v_data = np; np->n_vnode = vp; /* * Initialize the mutex even if the vnode is going to be a loser. * This simplifies the logic in reclaim, which can then unconditionally * destroy the mutex (in the case of the loser, or if hash_insert * happened to return an error no special casing is needed). */ mtx_init(&np->n_mtx, "NEWNFSnode lock", NULL, MTX_DEF | MTX_DUPOK); lockinit(&np->n_excl, PVFS, "nfsupg", VLKTIMEOUT, LK_NOSHARE | LK_CANRECURSE); /* * Are we getting the root? If so, make sure the vnode flags * are correct */ if ((nfhp->nfh_len == nmp->nm_fhsize) && !bcmp(nfhp->nfh_fh, nmp->nm_fh, nfhp->nfh_len)) { if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; } np->n_fhp = nfhp; /* * For NFSv4, we have to attach the directory file handle and * file name, so that Open Ops can be done later. */ if (nmp->nm_flag & NFSMNT_NFSV4) { np->n_v4 = malloc(sizeof (struct nfsv4node) + dnp->n_fhp->nfh_len + cnp->cn_namelen - 1, M_NFSV4NODE, M_WAITOK); np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len; np->n_v4->n4_namelen = cnp->cn_namelen; NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data, dnp->n_fhp->nfh_len); NFSBCOPY(cnp->cn_nameptr, NFS4NODENAME(np->n_v4), cnp->cn_namelen); } else { np->n_v4 = NULL; } /* * NFS supports recursive and shared locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_NOWITNESS, NULL); VN_LOCK_AREC(vp); VN_LOCK_ASHARE(vp); error = insmntque(vp, mntp); if (error != 0) { *npp = NULL; mtx_destroy(&np->n_mtx); lockdestroy(&np->n_excl); free(nfhp, M_NFSFH); if (np->n_v4 != NULL) free(np->n_v4, M_NFSV4NODE); uma_zfree(newnfsnode_zone, np); return (error); } error = vfs_hash_insert(vp, hash, lkflags, td, &nvp, newnfs_vncmpf, nfhp); if (error) return (error); if (nvp != NULL) { *npp = VTONFS(nvp); /* vfs_hash_insert() vput()'s the losing vnode */ return (0); } *npp = np; return (0); } /* * Another variant of nfs_nget(). This one is only used by reopen. It * takes almost the same args as nfs_nget(), but only succeeds if an entry * exists in the cache. 
(Since files should already be "open" with a * vnode ref cnt on the node when reopen calls this, it should always * succeed.) * Also, don't get a vnode lock, since it may already be locked by some * other process that is handling it. This is ok, since all other threads * on the client are blocked by the nfsc_lock being exclusively held by the * caller of this function. */ int nfscl_ngetreopen(struct mount *mntp, u_int8_t *fhp, int fhsize, struct thread *td, struct nfsnode **npp) { struct vnode *nvp; u_int hash; struct nfsfh *nfhp; int error; *npp = NULL; /* For forced dismounts, just return error. */ if (NFSCL_FORCEDISM(mntp)) return (EINTR); nfhp = malloc(sizeof (struct nfsfh) + fhsize, M_NFSFH, M_WAITOK); bcopy(fhp, &nfhp->nfh_fh[0], fhsize); nfhp->nfh_len = fhsize; hash = fnv_32_buf(fhp, fhsize, FNV1_32_INIT); /* * First, try to get the vnode locked, but don't block for the lock. */ error = vfs_hash_get(mntp, hash, (LK_EXCLUSIVE | LK_NOWAIT), td, &nvp, newnfs_vncmpf, nfhp); if (error == 0 && nvp != NULL) { NFSVOPUNLOCK(nvp, 0); } else if (error == EBUSY) { /* * It is safe so long as a vflush() with * FORCECLOSE has not been done. Since the Renew thread is * stopped and the MNTK_UNMOUNTF flag is set before doing * a vflush() with FORCECLOSE, we should be ok here. */ if (NFSCL_FORCEDISM(mntp)) error = EINTR; else { vfs_hash_ref(mntp, hash, td, &nvp, newnfs_vncmpf, nfhp); if (nvp == NULL) { error = ENOENT; } else if ((nvp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; vrele(nvp); } else { error = 0; } } } free(nfhp, M_NFSFH); if (error) return (error); if (nvp != NULL) { *npp = VTONFS(nvp); return (0); } return (EINVAL); } static void nfscl_warn_fileid(struct nfsmount *nmp, struct nfsvattr *oldnap, struct nfsvattr *newnap) { int off; if (ncl_fileid_maxwarnings >= 0 && ncl_fileid_nwarnings >= ncl_fileid_maxwarnings) return; off = 0; if (ncl_fileid_maxwarnings >= 0) { if (++ncl_fileid_nwarnings >= ncl_fileid_maxwarnings) off = 1; } printf("newnfs: server '%s' error: fileid changed. " "fsid %jx:%jx: expected fileid %#jx, got %#jx. " "(BROKEN NFS SERVER OR MIDDLEWARE)\n", nmp->nm_com.nmcom_hostname, (uintmax_t)nmp->nm_fsid[0], (uintmax_t)nmp->nm_fsid[1], (uintmax_t)oldnap->na_fileid, (uintmax_t)newnap->na_fileid); if (off) printf("newnfs: Logged %d times about fileid corruption; " "going quiet to avoid spamming logs excessively. (Limit " "is: %d).\n", ncl_fileid_nwarnings, ncl_fileid_maxwarnings); } /* * Load the attribute cache (that lives in the nfsnode entry) with * the attributes of the second argument and * Iff vaper not NULL * copy the attributes to *vaper * Similar to nfs_loadattrcache(), except the attributes are passed in * instead of being parsed out of the mbuf list. */ int nfscl_loadattrcache(struct vnode **vpp, struct nfsvattr *nap, void *nvaper, void *stuff, int writeattr, int dontshrink) { struct vnode *vp = *vpp; struct vattr *vap, *nvap = &nap->na_vattr, *vaper = nvaper; struct nfsnode *np; struct nfsmount *nmp; struct timespec mtime_save; + vm_object_t object; u_quad_t nsize; - int setnsize, error, force_fid_err; + int error, force_fid_err; + bool setnsize; error = 0; - setnsize = 0; - nsize = 0; /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information. 
*/ np = VTONFS(vp); NFSLOCKNODE(np); if (vp->v_type != nvap->va_type) { vp->v_type = nvap->va_type; if (vp->v_type == VFIFO) vp->v_op = &newnfs_fifoops; np->n_mtime = nvap->va_mtime; } nmp = VFSTONFS(vp->v_mount); vap = &np->n_vattr.na_vattr; mtime_save = vap->va_mtime; if (writeattr) { np->n_vattr.na_filerev = nap->na_filerev; np->n_vattr.na_size = nap->na_size; np->n_vattr.na_mtime = nap->na_mtime; np->n_vattr.na_ctime = nap->na_ctime; np->n_vattr.na_fsid = nap->na_fsid; np->n_vattr.na_mode = nap->na_mode; } else { force_fid_err = 0; KFAIL_POINT_ERROR(DEBUG_FP, nfscl_force_fileid_warning, force_fid_err); /* * BROKEN NFS SERVER OR MIDDLEWARE * * Certain NFS servers (certain old proprietary filers ca. * 2006) or broken middleboxes (e.g. WAN accelerator products) * will respond to GETATTR requests with results for a * different fileid. * * The WAN accelerator we've observed not only serves stale * cache results for a given file, it also occasionally serves * results for wholly different files. This causes surprising * problems; for example the cached size attribute of a file * may truncate down and then back up, resulting in zero * regions in file contents read by applications. We observed * this reliably with Clang and .c files during parallel build. * A pcap revealed packet fragmentation and GETATTR RPC * responses with wholly wrong fileids. */ if ((np->n_vattr.na_fileid != 0 && np->n_vattr.na_fileid != nap->na_fileid) || force_fid_err) { nfscl_warn_fileid(nmp, &np->n_vattr, nap); error = EIDRM; goto out; } NFSBCOPY((caddr_t)nap, (caddr_t)&np->n_vattr, sizeof (struct nfsvattr)); } /* * For NFSv4, if the node's fsid is not equal to the mount point's * fsid, return the low order 32bits of the node's fsid. This * allows getcwd(3) to work. There is a chance that the fsid might * be the same as a local fs, but since this is in an NFS mount * point, I don't think that will cause any problems? */ if (NFSHASNFSV4(nmp) && NFSHASHASSETFSID(nmp) && (nmp->nm_fsid[0] != np->n_vattr.na_filesid[0] || nmp->nm_fsid[1] != np->n_vattr.na_filesid[1])) { /* * va_fsid needs to be set to some value derived from * np->n_vattr.na_filesid that is not equal * vp->v_mount->mnt_stat.f_fsid[0], so that it changes * from the value used for the top level server volume * in the mounted subtree. */ vn_fsid(vp, vap); if ((uint32_t)vap->va_fsid == np->n_vattr.na_filesid[0]) vap->va_fsid = hash32_buf( np->n_vattr.na_filesid, 2 * sizeof(uint64_t), 0); } else vn_fsid(vp, vap); np->n_attrstamp = time_second; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (dontshrink && vap->va_size < np->n_size) { /* * We've been told not to shrink the file; * zero np->n_attrstamp to indicate that * the attributes are stale. */ - nsize = vap->va_size = np->n_size; - setnsize = 1; + vap->va_size = np->n_size; np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); } else if (np->n_flag & NMODIFIED) { /* * We've modified the file: Use the larger * of our size, and the server's size. */ if (vap->va_size < np->n_size) { vap->va_size = np->n_size; } else { np->n_size = vap->va_size; np->n_flag |= NSIZECHANGED; } - nsize = np->n_size; - setnsize = 1; - } else if (vap->va_size < np->n_size) { - /* - * When shrinking the size, the call to - * vnode_pager_setsize() cannot be done - * with the mutex held, so delay it until - * after the mtx_unlock call. 
- */ - nsize = np->n_size = vap->va_size; - np->n_flag |= NSIZECHANGED; - setnsize = 1; } else { - nsize = np->n_size = vap->va_size; + np->n_size = vap->va_size; np->n_flag |= NSIZECHANGED; - setnsize = 1; } } else { np->n_size = vap->va_size; } } /* * The following checks are added to prevent a race between (say) * a READDIR+ and a WRITE. * READDIR+, WRITE requests sent out. * READDIR+ resp, WRITE resp received on client. * However, the WRITE resp was handled before the READDIR+ resp * causing the post op attrs from the write to be loaded first * and the attrs from the READDIR+ to be loaded later. If this * happens, we have stale attrs loaded into the attrcache. * We detect this by for the mtime moving back. We invalidate the * attrcache when this happens. */ if (timespeccmp(&mtime_save, &vap->va_mtime, >)) { /* Size changed or mtime went backwards */ np->n_attrstamp = 0; KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); } if (vaper != NULL) { NFSBCOPY((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } out: #ifdef KDTRACE_HOOKS if (np->n_attrstamp != 0) KDTRACE_NFS_ATTRCACHE_LOAD_DONE(vp, vap, error); #endif + nsize = vap->va_size; + object = vp->v_object; + setnsize = false; + if (object != NULL) { + if (OFF_TO_IDX(nsize + PAGE_MASK) < object->size) { + /* + * When shrinking the size, the call to + * vnode_pager_setsize() cannot be done with + * the mutex held, because we might need to + * wait for a busy page. Delay it until after + * the node is unlocked. + */ + setnsize = true; + } else { + vnode_pager_setsize(vp, nsize); + } + } NFSUNLOCKNODE(np); if (setnsize) vnode_pager_setsize(vp, nsize); return (error); } /* * Fill in the client id name. For these bytes: * 1 - they must be unique * 2 - they should be persistent across client reboots * 1 is more critical than 2 * Use the mount point's unique id plus either the uuid or, if that * isn't set, random junk. */ void nfscl_fillclid(u_int64_t clval, char *uuid, u_int8_t *cp, u_int16_t idlen) { int uuidlen; /* * First, put in the 64bit mount point identifier. */ if (idlen >= sizeof (u_int64_t)) { NFSBCOPY((caddr_t)&clval, cp, sizeof (u_int64_t)); cp += sizeof (u_int64_t); idlen -= sizeof (u_int64_t); } /* * If uuid is non-zero length, use it. */ uuidlen = strlen(uuid); if (uuidlen > 0 && idlen >= uuidlen) { NFSBCOPY(uuid, cp, uuidlen); cp += uuidlen; idlen -= uuidlen; } /* * This only normally happens if the uuid isn't set. */ while (idlen > 0) { *cp++ = (u_int8_t)(arc4random() % 256); idlen--; } } /* * Fill in a lock owner name. For now, pid + the process's creation time. */ void nfscl_filllockowner(void *id, u_int8_t *cp, int flags) { union { u_int32_t lval; u_int8_t cval[4]; } tl; struct proc *p; if (id == NULL) { /* Return the single open_owner of all 0 bytes. 
*/ bzero(cp, NFSV4CL_LOCKNAMELEN); return; } if ((flags & F_POSIX) != 0) { p = (struct proc *)id; tl.lval = p->p_pid; *cp++ = tl.cval[0]; *cp++ = tl.cval[1]; *cp++ = tl.cval[2]; *cp++ = tl.cval[3]; tl.lval = p->p_stats->p_start.tv_sec; *cp++ = tl.cval[0]; *cp++ = tl.cval[1]; *cp++ = tl.cval[2]; *cp++ = tl.cval[3]; tl.lval = p->p_stats->p_start.tv_usec; *cp++ = tl.cval[0]; *cp++ = tl.cval[1]; *cp++ = tl.cval[2]; *cp = tl.cval[3]; } else if ((flags & F_FLOCK) != 0) { bcopy(&id, cp, sizeof(id)); bzero(&cp[sizeof(id)], NFSV4CL_LOCKNAMELEN - sizeof(id)); } else { printf("nfscl_filllockowner: not F_POSIX or F_FLOCK\n"); bzero(cp, NFSV4CL_LOCKNAMELEN); } } /* * Find the parent process for the thread passed in as an argument. * If none exists, return NULL, otherwise return a thread for the parent. * (Can be any of the threads, since it is only used for td->td_proc.) */ NFSPROC_T * nfscl_getparent(struct thread *td) { struct proc *p; struct thread *ptd; if (td == NULL) return (NULL); p = td->td_proc; if (p->p_pid == 0) return (NULL); p = p->p_pptr; if (p == NULL) return (NULL); ptd = TAILQ_FIRST(&p->p_threads); return (ptd); } /* * Start up the renew kernel thread. */ static void start_nfscl(void *arg) { struct nfsclclient *clp; struct thread *td; clp = (struct nfsclclient *)arg; td = TAILQ_FIRST(&clp->nfsc_renewthread->p_threads); nfscl_renewthread(clp, td); kproc_exit(0); } void nfscl_start_renewthread(struct nfsclclient *clp) { kproc_create(start_nfscl, (void *)clp, &clp->nfsc_renewthread, 0, 0, "nfscl"); } /* * Handle wcc_data. * For NFSv4, it assumes that nfsv4_wccattr() was used to set up the getattr * as the first Op after PutFH. * (For NFSv4, the postop attributes are after the Op, so they can't be * parsed here. A separate call to nfscl_postop_attr() is required.) */ int nfscl_wcc_data(struct nfsrv_descript *nd, struct vnode *vp, struct nfsvattr *nap, int *flagp, int *wccflagp, void *stuff) { u_int32_t *tl; struct nfsnode *np = VTONFS(vp); struct nfsvattr nfsva; int error = 0; if (wccflagp != NULL) *wccflagp = 0; if (nd->nd_flag & ND_NFSV3) { *flagp = 0; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); if (*tl == newnfs_true) { NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED); if (wccflagp != NULL) { mtx_lock(&np->n_mtx); *wccflagp = (np->n_mtime.tv_sec == fxdr_unsigned(u_int32_t, *(tl + 2)) && np->n_mtime.tv_nsec == fxdr_unsigned(u_int32_t, *(tl + 3))); mtx_unlock(&np->n_mtx); } } error = nfscl_postop_attr(nd, nap, flagp, stuff); if (wccflagp != NULL && *flagp == 0) *wccflagp = 0; } else if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) == (ND_NFSV4 | ND_V4WCCATTR)) { error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL); if (error) return (error); /* * Get rid of Op# and status for next op. */ NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*++tl) nd->nd_flag |= ND_NOMOREDATA; if (wccflagp != NULL && nfsva.na_vattr.va_mtime.tv_sec != 0) { mtx_lock(&np->n_mtx); *wccflagp = (np->n_mtime.tv_sec == nfsva.na_vattr.va_mtime.tv_sec && np->n_mtime.tv_nsec == nfsva.na_vattr.va_mtime.tv_sec); mtx_unlock(&np->n_mtx); } } nfsmout: return (error); } /* * Get postop attributes. 
*/ int nfscl_postop_attr(struct nfsrv_descript *nd, struct nfsvattr *nap, int *retp, void *stuff) { u_int32_t *tl; int error = 0; *retp = 0; if (nd->nd_flag & ND_NOMOREDATA) return (error); if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); *retp = fxdr_unsigned(int, *tl); } else if (nd->nd_flag & ND_NFSV4) { /* * For NFSv4, the postop attr are at the end, so no point * in looking if nd_repstat != 0. */ if (!nd->nd_repstat) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) /* should never happen since nd_repstat != 0 */ nd->nd_flag |= ND_NOMOREDATA; else *retp = 1; } } else if (!nd->nd_repstat) { /* For NFSv2, the attributes are here iff nd_repstat == 0 */ *retp = 1; } if (*retp) { error = nfsm_loadattr(nd, nap); if (error) *retp = 0; } nfsmout: return (error); } /* * nfscl_request() - mostly a wrapper for newnfs_request(). */ int nfscl_request(struct nfsrv_descript *nd, struct vnode *vp, NFSPROC_T *p, struct ucred *cred, void *stuff) { int ret, vers; struct nfsmount *nmp; nmp = VFSTONFS(vp->v_mount); if (nd->nd_flag & ND_NFSV4) vers = NFS_VER4; else if (nd->nd_flag & ND_NFSV3) vers = NFS_VER3; else vers = NFS_VER2; ret = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, NULL); return (ret); } /* * fill in this bsden's variant of statfs using nfsstatfs. */ void nfscl_loadsbinfo(struct nfsmount *nmp, struct nfsstatfs *sfp, void *statfs) { struct statfs *sbp = (struct statfs *)statfs; if (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) { sbp->f_bsize = NFS_FABLKSIZE; sbp->f_blocks = sfp->sf_tbytes / NFS_FABLKSIZE; sbp->f_bfree = sfp->sf_fbytes / NFS_FABLKSIZE; /* * Although sf_abytes is uint64_t and f_bavail is int64_t, * the value after dividing by NFS_FABLKSIZE is small * enough that it will fit in 63bits, so it is ok to * assign it to f_bavail without fear that it will become * negative. */ sbp->f_bavail = sfp->sf_abytes / NFS_FABLKSIZE; sbp->f_files = sfp->sf_tfiles; /* Since f_ffree is int64_t, clip it to 63bits. */ if (sfp->sf_ffiles > INT64_MAX) sbp->f_ffree = INT64_MAX; else sbp->f_ffree = sfp->sf_ffiles; } else if ((nmp->nm_flag & NFSMNT_NFSV4) == 0) { /* * The type casts to (int32_t) ensure that this code is * compatible with the old NFS client, in that it will * propagate bit31 to the high order bits. This may or may * not be correct for NFSv2, but since it is a legacy * environment, I'd rather retain backwards compatibility. */ sbp->f_bsize = (int32_t)sfp->sf_bsize; sbp->f_blocks = (int32_t)sfp->sf_blocks; sbp->f_bfree = (int32_t)sfp->sf_bfree; sbp->f_bavail = (int32_t)sfp->sf_bavail; sbp->f_files = 0; sbp->f_ffree = 0; } } /* * Use the fsinfo stuff to update the mount point. 
*/ void nfscl_loadfsinfo(struct nfsmount *nmp, struct nfsfsinfo *fsp) { if ((nmp->nm_wsize == 0 || fsp->fs_wtpref < nmp->nm_wsize) && fsp->fs_wtpref >= NFS_FABLKSIZE) nmp->nm_wsize = (fsp->fs_wtpref + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); if (fsp->fs_wtmax < nmp->nm_wsize && fsp->fs_wtmax > 0) { nmp->nm_wsize = fsp->fs_wtmax & ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize == 0) nmp->nm_wsize = fsp->fs_wtmax; } if (nmp->nm_wsize < NFS_FABLKSIZE) nmp->nm_wsize = NFS_FABLKSIZE; if ((nmp->nm_rsize == 0 || fsp->fs_rtpref < nmp->nm_rsize) && fsp->fs_rtpref >= NFS_FABLKSIZE) nmp->nm_rsize = (fsp->fs_rtpref + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); if (fsp->fs_rtmax < nmp->nm_rsize && fsp->fs_rtmax > 0) { nmp->nm_rsize = fsp->fs_rtmax & ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize == 0) nmp->nm_rsize = fsp->fs_rtmax; } if (nmp->nm_rsize < NFS_FABLKSIZE) nmp->nm_rsize = NFS_FABLKSIZE; if ((nmp->nm_readdirsize == 0 || fsp->fs_dtpref < nmp->nm_readdirsize) && fsp->fs_dtpref >= NFS_DIRBLKSIZ) nmp->nm_readdirsize = (fsp->fs_dtpref + NFS_DIRBLKSIZ - 1) & ~(NFS_DIRBLKSIZ - 1); if (fsp->fs_rtmax < nmp->nm_readdirsize && fsp->fs_rtmax > 0) { nmp->nm_readdirsize = fsp->fs_rtmax & ~(NFS_DIRBLKSIZ - 1); if (nmp->nm_readdirsize == 0) nmp->nm_readdirsize = fsp->fs_rtmax; } if (nmp->nm_readdirsize < NFS_DIRBLKSIZ) nmp->nm_readdirsize = NFS_DIRBLKSIZ; if (fsp->fs_maxfilesize > 0 && fsp->fs_maxfilesize < nmp->nm_maxfilesize) nmp->nm_maxfilesize = fsp->fs_maxfilesize; nmp->nm_mountp->mnt_stat.f_iosize = newnfs_iosize(nmp); nmp->nm_state |= NFSSTA_GOTFSINFO; } /* * Lookups source address which should be used to communicate with * @nmp and stores it inside @pdst. * * Returns 0 on success. */ u_int8_t * nfscl_getmyip(struct nfsmount *nmp, struct in6_addr *paddr, int *isinet6p) { #if defined(INET6) || defined(INET) int error, fibnum; fibnum = curthread->td_proc->p_fibnum; #endif #ifdef INET if (nmp->nm_nam->sa_family == AF_INET) { struct sockaddr_in *sin; struct nhop4_extended nh_ext; sin = (struct sockaddr_in *)nmp->nm_nam; CURVNET_SET(CRED_TO_VNET(nmp->nm_sockreq.nr_cred)); error = fib4_lookup_nh_ext(fibnum, sin->sin_addr, 0, 0, &nh_ext); CURVNET_RESTORE(); if (error != 0) return (NULL); if (IN_LOOPBACK(ntohl(nh_ext.nh_src.s_addr))) { /* Ignore loopback addresses */ return (NULL); } *isinet6p = 0; *((struct in_addr *)paddr) = nh_ext.nh_src; return (u_int8_t *)paddr; } #endif #ifdef INET6 if (nmp->nm_nam->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)nmp->nm_nam; CURVNET_SET(CRED_TO_VNET(nmp->nm_sockreq.nr_cred)); error = in6_selectsrc_addr(fibnum, &sin6->sin6_addr, sin6->sin6_scope_id, NULL, paddr, NULL); CURVNET_RESTORE(); if (error != 0) return (NULL); if (IN6_IS_ADDR_LOOPBACK(paddr)) return (NULL); /* Scope is embedded in */ *isinet6p = 1; return (u_int8_t *)paddr; } #endif return (NULL); } /* * Copy NFS uid, gids from the cred structure. */ void newnfs_copyincred(struct ucred *cr, struct nfscred *nfscr) { int i; KASSERT(cr->cr_ngroups >= 0, ("newnfs_copyincred: negative cr_ngroups")); nfscr->nfsc_uid = cr->cr_uid; nfscr->nfsc_ngroups = MIN(cr->cr_ngroups, NFS_MAXGRPS + 1); for (i = 0; i < nfscr->nfsc_ngroups; i++) nfscr->nfsc_groups[i] = cr->cr_groups[i]; } /* * Do any client specific initialization. */ void nfscl_init(void) { static int inited = 0; if (inited) return; inited = 1; nfscl_inited = 1; ncl_pbuf_zone = pbuf_zsecond_create("nfspbuf", nswbuf / 2); } /* * Check each of the attributes to be set, to ensure they aren't already * the correct value. 
Disable setting ones already correct. */ int nfscl_checksattr(struct vattr *vap, struct nfsvattr *nvap) { if (vap->va_mode != (mode_t)VNOVAL) { if (vap->va_mode == nvap->na_mode) vap->va_mode = (mode_t)VNOVAL; } if (vap->va_uid != (uid_t)VNOVAL) { if (vap->va_uid == nvap->na_uid) vap->va_uid = (uid_t)VNOVAL; } if (vap->va_gid != (gid_t)VNOVAL) { if (vap->va_gid == nvap->na_gid) vap->va_gid = (gid_t)VNOVAL; } if (vap->va_size != VNOVAL) { if (vap->va_size == nvap->na_size) vap->va_size = VNOVAL; } /* * We are normally called with only a partially initialized * VAP. Since the NFSv3 spec says that server may use the * file attributes to store the verifier, the spec requires * us to do a SETATTR RPC. FreeBSD servers store the verifier * in atime, but we can't really assume that all servers will * so we ensure that our SETATTR sets both atime and mtime. * Set the VA_UTIMES_NULL flag for this case, so that * the server's time will be used. This is needed to * work around a bug in some Solaris servers, where * setting the time TOCLIENT causes the Setattr RPC * to return NFS_OK, but not set va_mode. */ if (vap->va_mtime.tv_sec == VNOVAL) { vfs_timestamp(&vap->va_mtime); vap->va_vaflags |= VA_UTIMES_NULL; } if (vap->va_atime.tv_sec == VNOVAL) vap->va_atime = vap->va_mtime; return (1); } /* * Map nfsv4 errors to errno.h errors. * The uid and gid arguments are only used for NFSERR_BADOWNER and that * error should only be returned for the Open, Create and Setattr Ops. * As such, most calls can just pass in 0 for those arguments. */ APPLESTATIC int nfscl_maperr(struct thread *td, int error, uid_t uid, gid_t gid) { struct proc *p; if (error < 10000 || error >= NFSERR_STALEWRITEVERF) return (error); if (td != NULL) p = td->td_proc; else p = NULL; switch (error) { case NFSERR_BADOWNER: tprintf(p, LOG_INFO, "No name and/or group mapping for uid,gid:(%d,%d)\n", uid, gid); return (EPERM); case NFSERR_BADNAME: case NFSERR_BADCHAR: printf("nfsv4 char/name not handled by server\n"); return (ENOENT); case NFSERR_STALECLIENTID: case NFSERR_STALESTATEID: case NFSERR_EXPIRED: case NFSERR_BADSTATEID: case NFSERR_BADSESSION: printf("nfsv4 recover err returned %d\n", error); return (EIO); case NFSERR_BADHANDLE: case NFSERR_SERVERFAULT: case NFSERR_BADTYPE: case NFSERR_FHEXPIRED: case NFSERR_RESOURCE: case NFSERR_MOVED: case NFSERR_NOFILEHANDLE: case NFSERR_MINORVERMISMATCH: case NFSERR_OLDSTATEID: case NFSERR_BADSEQID: case NFSERR_LEASEMOVED: case NFSERR_RECLAIMBAD: case NFSERR_BADXDR: case NFSERR_OPILLEGAL: printf("nfsv4 client/server protocol prob err=%d\n", error); return (EIO); default: tprintf(p, LOG_INFO, "nfsv4 err=%d\n", error); return (EIO); }; } /* * Check to see if the process for this owner exists. Return 1 if it doesn't * and 0 otherwise. */ int nfscl_procdoesntexist(u_int8_t *own) { union { u_int32_t lval; u_int8_t cval[4]; } tl; struct proc *p; pid_t pid; int i, ret = 0; /* For the single open_owner of all 0 bytes, just return 0. 
*/ for (i = 0; i < NFSV4CL_LOCKNAMELEN; i++) if (own[i] != 0) break; if (i == NFSV4CL_LOCKNAMELEN) return (0); tl.cval[0] = *own++; tl.cval[1] = *own++; tl.cval[2] = *own++; tl.cval[3] = *own++; pid = tl.lval; p = pfind_any_locked(pid); if (p == NULL) return (1); if (p->p_stats == NULL) { PROC_UNLOCK(p); return (0); } tl.cval[0] = *own++; tl.cval[1] = *own++; tl.cval[2] = *own++; tl.cval[3] = *own++; if (tl.lval != p->p_stats->p_start.tv_sec) { ret = 1; } else { tl.cval[0] = *own++; tl.cval[1] = *own++; tl.cval[2] = *own++; tl.cval[3] = *own; if (tl.lval != p->p_stats->p_start.tv_usec) ret = 1; } PROC_UNLOCK(p); return (ret); } /* * - nfs pseudo system call for the client */ /* * MPSAFE */ static int nfssvc_nfscl(struct thread *td, struct nfssvc_args *uap) { struct file *fp; struct nfscbd_args nfscbdarg; struct nfsd_nfscbd_args nfscbdarg2; struct nameidata nd; struct nfscl_dumpmntopts dumpmntopts; cap_rights_t rights; char *buf; int error; struct mount *mp; struct nfsmount *nmp; if (uap->flag & NFSSVC_CBADDSOCK) { error = copyin(uap->argp, (caddr_t)&nfscbdarg, sizeof(nfscbdarg)); if (error) return (error); /* * Since we don't know what rights might be required, * pretend that we need them all. It is better to be too * careful than too reckless. */ error = fget(td, nfscbdarg.sock, cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp); if (error) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); return (EPERM); } error = nfscbd_addsock(fp); fdrop(fp, td); if (!error && nfscl_enablecallb == 0) { nfsv4_cbport = nfscbdarg.port; nfscl_enablecallb = 1; } } else if (uap->flag & NFSSVC_NFSCBD) { if (uap->argp == NULL) return (EINVAL); error = copyin(uap->argp, (caddr_t)&nfscbdarg2, sizeof(nfscbdarg2)); if (error) return (error); error = nfscbd_nfsd(td, &nfscbdarg2); } else if (uap->flag & NFSSVC_DUMPMNTOPTS) { error = copyin(uap->argp, &dumpmntopts, sizeof(dumpmntopts)); if (error == 0 && (dumpmntopts.ndmnt_blen < 256 || dumpmntopts.ndmnt_blen > 1024)) error = EINVAL; if (error == 0) error = nfsrv_lookupfilename(&nd, dumpmntopts.ndmnt_fname, td); if (error == 0 && strcmp(nd.ni_vp->v_mount->mnt_vfc->vfc_name, "nfs") != 0) { vput(nd.ni_vp); error = EINVAL; } if (error == 0) { buf = malloc(dumpmntopts.ndmnt_blen, M_TEMP, M_WAITOK); nfscl_retopts(VFSTONFS(nd.ni_vp->v_mount), buf, dumpmntopts.ndmnt_blen); vput(nd.ni_vp); error = copyout(buf, dumpmntopts.ndmnt_buf, dumpmntopts.ndmnt_blen); free(buf, M_TEMP); } } else if (uap->flag & NFSSVC_FORCEDISM) { buf = malloc(MNAMELEN + 1, M_TEMP, M_WAITOK); error = copyinstr(uap->argp, buf, MNAMELEN + 1, NULL); if (error == 0) { nmp = NULL; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, buf) == 0 && strcmp(mp->mnt_stat.f_fstypename, "nfs") == 0 && mp->mnt_data != NULL) { nmp = VFSTONFS(mp); NFSDDSLOCK(); if (nfsv4_findmirror(nmp) != NULL) { NFSDDSUNLOCK(); error = ENXIO; nmp = NULL; break; } mtx_lock(&nmp->nm_mtx); if ((nmp->nm_privflag & NFSMNTP_FORCEDISM) == 0) { nmp->nm_privflag |= (NFSMNTP_FORCEDISM | NFSMNTP_CANCELRPCS); mtx_unlock(&nmp->nm_mtx); } else { mtx_unlock(&nmp->nm_mtx); nmp = NULL; } NFSDDSUNLOCK(); break; } } mtx_unlock(&mountlist_mtx); if (nmp != NULL) { /* * Call newnfs_nmcancelreqs() to cause * any RPCs in progress on the mount point to * fail. * This will cause any process waiting for an * RPC to complete while holding a vnode lock * on the mounted-on vnode (such as "df" or * a non-forced "umount") to fail. 
* This will unlock the mounted-on vnode so * a forced dismount can succeed. * Then clear NFSMNTP_CANCELRPCS and wakeup(), * so that nfs_unmount() can complete. */ newnfs_nmcancelreqs(nmp); mtx_lock(&nmp->nm_mtx); nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS; wakeup(nmp); mtx_unlock(&nmp->nm_mtx); } else if (error == 0) error = EINVAL; } free(buf, M_TEMP); } else { error = EINVAL; } return (error); } extern int (*nfsd_call_nfscl)(struct thread *, struct nfssvc_args *); /* * Called once to initialize data structures... */ static int nfscl_modevent(module_t mod, int type, void *data) { int error = 0; static int loaded = 0; switch (type) { case MOD_LOAD: if (loaded) return (0); newnfs_portinit(); mtx_init(&ncl_iod_mutex, "ncl_iod_mutex", NULL, MTX_DEF); nfscl_init(); NFSD_LOCK(); nfsrvd_cbinit(0); NFSD_UNLOCK(); ncl_call_invalcaches = ncl_invalcaches; nfsd_call_nfscl = nfssvc_nfscl; loaded = 1; break; case MOD_UNLOAD: if (nfs_numnfscbd != 0) { error = EBUSY; break; } /* * XXX: Unloading of nfscl module is unsupported. */ #if 0 ncl_call_invalcaches = NULL; nfsd_call_nfscl = NULL; uma_zdestroy(ncl_pbuf_zone); /* and get rid of the mutexes */ mtx_destroy(&ncl_iod_mutex); loaded = 0; break; #else /* FALLTHROUGH */ #endif default: error = EOPNOTSUPP; break; } return error; } static moduledata_t nfscl_mod = { "nfscl", nfscl_modevent, NULL, }; DECLARE_MODULE(nfscl, nfscl_mod, SI_SUB_VFS, SI_ORDER_FIRST); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(nfscl, 1); MODULE_DEPEND(nfscl, nfscommon, 1, 1, 1); MODULE_DEPEND(nfscl, krpc, 1, 1, 1); MODULE_DEPEND(nfscl, nfssvc, 1, 1, 1); MODULE_DEPEND(nfscl, nfslock, 1, 1, 1); Index: projects/clang900-import/sys/kern/kern_sysctl.c =================================================================== --- projects/clang900-import/sys/kern/kern_sysctl.c (revision 352536) +++ projects/clang900-import/sys/kern/kern_sysctl.c (revision 352537) @@ -1,2838 +1,2843 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef DDB #include #include #endif #include #include #include #include static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); /* * The sysctllock protects the MIB tree. It also protects sysctl * contexts used with dynamic sysctls. The sysctl_register_oid() and * sysctl_unregister_oid() routines require the sysctllock to already * be held, so the sysctl_wlock() and sysctl_wunlock() routines are * provided for the few places in the kernel which need to use that * API rather than using the dynamic API. Use of the dynamic API is * strongly encouraged for most code. * * The sysctlmemlock is used to limit the amount of user memory wired for * sysctl requests. This is implemented by serializing any userland * sysctl requests larger than a single page via an exclusive lock. */ static struct rmlock sysctllock; static struct sx __exclusive_cache_line sysctlmemlock; #define SYSCTL_WLOCK() rm_wlock(&sysctllock) #define SYSCTL_WUNLOCK() rm_wunlock(&sysctllock) #define SYSCTL_RLOCK(tracker) rm_rlock(&sysctllock, (tracker)) #define SYSCTL_RUNLOCK(tracker) rm_runlock(&sysctllock, (tracker)) #define SYSCTL_WLOCKED() rm_wowned(&sysctllock) #define SYSCTL_ASSERT_LOCKED() rm_assert(&sysctllock, RA_LOCKED) #define SYSCTL_ASSERT_WLOCKED() rm_assert(&sysctllock, RA_WLOCKED) #define SYSCTL_ASSERT_RLOCKED() rm_assert(&sysctllock, RA_RLOCKED) #define SYSCTL_INIT() rm_init_flags(&sysctllock, "sysctl lock", \ RM_SLEEPABLE) #define SYSCTL_SLEEP(ch, wmesg, timo) \ rm_sleep(ch, &sysctllock, 0, wmesg, timo) static int sysctl_root(SYSCTL_HANDLER_ARGS); /* Root list */ struct sysctl_oid_list sysctl__children = SLIST_HEAD_INITIALIZER(&sysctl__children); static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse); static int sysctl_old_kernel(struct sysctl_req *, const void *, size_t); static int sysctl_new_kernel(struct sysctl_req *, void *, size_t); static struct sysctl_oid * sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SLIST_FOREACH(oidp, list, oid_link) { if (strcmp(oidp->oid_name, name) == 0) { return (oidp); } } return (NULL); } /* * Initialization of the MIB tree. * * Order by number in each list. 
*/ void sysctl_wlock(void) { SYSCTL_WLOCK(); } void sysctl_wunlock(void) { SYSCTL_WUNLOCK(); } static int sysctl_root_handler_locked(struct sysctl_oid *oid, void *arg1, intmax_t arg2, struct sysctl_req *req, struct rm_priotracker *tracker) { int error; if (oid->oid_kind & CTLFLAG_DYN) atomic_add_int(&oid->oid_running, 1); if (tracker != NULL) SYSCTL_RUNLOCK(tracker); else SYSCTL_WUNLOCK(); if (!(oid->oid_kind & CTLFLAG_MPSAFE)) mtx_lock(&Giant); error = oid->oid_handler(oid, arg1, arg2, req); if (!(oid->oid_kind & CTLFLAG_MPSAFE)) mtx_unlock(&Giant); KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error); if (tracker != NULL) SYSCTL_RLOCK(tracker); else SYSCTL_WLOCK(); if (oid->oid_kind & CTLFLAG_DYN) { if (atomic_fetchadd_int(&oid->oid_running, -1) == 1 && (oid->oid_kind & CTLFLAG_DYING) != 0) wakeup(&oid->oid_running); } return (error); } static void sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp) { struct sysctl_req req; struct sysctl_oid *curr; char *penv = NULL; char path[96]; ssize_t rem = sizeof(path); ssize_t len; uint8_t data[512] __aligned(sizeof(uint64_t)); int size; int error; path[--rem] = 0; for (curr = oidp; curr != NULL; curr = SYSCTL_PARENT(curr)) { len = strlen(curr->oid_name); rem -= len; if (curr != oidp) rem -= 1; if (rem < 0) { printf("OID path exceeds %d bytes\n", (int)sizeof(path)); return; } memcpy(path + rem, curr->oid_name, len); if (curr != oidp) path[rem + len] = '.'; } memset(&req, 0, sizeof(req)); req.td = curthread; req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_INT: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_UINT: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_LONG: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(long), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_ULONG: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(long), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S8: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int8_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S16: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int16_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S32: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int32_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S64: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int64_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U8: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint8_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U16: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint16_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U32: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint32_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U64: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint64_t), GETENV_UNSIGNED) == 0) return; 
req.newlen = size; req.newptr = data; break; case CTLTYPE_STRING: penv = kern_getenv(path + rem); if (penv == NULL) return; req.newlen = strlen(penv); req.newptr = penv; break; default: return; } error = sysctl_root_handler_locked(oidp, oidp->oid_arg1, oidp->oid_arg2, &req, NULL); if (error != 0) printf("Setting sysctl %s failed: %d\n", path + rem, error); if (penv != NULL) freeenv(penv); } /* * Locate the path to a given oid. Returns the length of the resulting path, * or -1 if the oid was not found. nodes must have room for CTL_MAXNAME * elements and be NULL initialized. */ static int sysctl_search_oid(struct sysctl_oid **nodes, struct sysctl_oid *needle) { int indx; SYSCTL_ASSERT_LOCKED(); indx = 0; while (indx < CTL_MAXNAME && indx >= 0) { if (nodes[indx] == NULL && indx == 0) nodes[indx] = SLIST_FIRST(&sysctl__children); else if (nodes[indx] == NULL) nodes[indx] = SLIST_FIRST(&nodes[indx - 1]->oid_children); else nodes[indx] = SLIST_NEXT(nodes[indx], oid_link); if (nodes[indx] == needle) return (indx + 1); if (nodes[indx] == NULL) { indx--; continue; } if ((nodes[indx]->oid_kind & CTLTYPE) == CTLTYPE_NODE) { indx++; continue; } } return (-1); } static void sysctl_warn_reuse(const char *func, struct sysctl_oid *leaf) { struct sysctl_oid *nodes[CTL_MAXNAME]; char buf[128]; struct sbuf sb; int rc, i; (void)sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); sbuf_printf(&sb, "%s: can't re-use a leaf (", __func__); memset(nodes, 0, sizeof(nodes)); rc = sysctl_search_oid(nodes, leaf); if (rc > 0) { for (i = 0; i < rc; i++) sbuf_printf(&sb, "%s%.*s", nodes[i]->oid_name, i != (rc - 1), "."); } else { sbuf_printf(&sb, "%s", leaf->oid_name); } sbuf_printf(&sb, ")!\n"); (void)sbuf_finish(&sb); } #ifdef SYSCTL_DEBUG static int sysctl_reuse_test(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; SYSCTL_RLOCK(&tracker); sysctl_warn_reuse(__func__, oidp); SYSCTL_RUNLOCK(&tracker); return (0); } SYSCTL_PROC(_sysctl, 0, reuse_test, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_reuse_test, "-", ""); #endif void sysctl_register_oid(struct sysctl_oid *oidp) { struct sysctl_oid_list *parent = oidp->oid_parent; struct sysctl_oid *p; struct sysctl_oid *q; int oid_number; int timeout = 2; /* * First check if another oid with the same name already * exists in the parent's list. */ SYSCTL_ASSERT_WLOCKED(); p = sysctl_find_oidname(oidp->oid_name, parent); if (p != NULL) { if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { p->oid_refcnt++; return; } else { sysctl_warn_reuse(__func__, p); return; } } /* get current OID number */ oid_number = oidp->oid_number; #if (OID_AUTO >= 0) #error "OID_AUTO is expected to be a negative value" #endif /* * Any negative OID number qualifies as OID_AUTO. Valid OID * numbers should always be positive. * * NOTE: DO NOT change the starting value here, change it in * , and make sure it is at least 256 to * accommodate e.g. net.inet.raw as a static sysctl node. */ if (oid_number < 0) { static int newoid; /* * By decrementing the next OID number we spend less * time inserting the OIDs into a sorted list. */ if (--newoid < CTL_AUTO_START) newoid = 0x7fffffff; oid_number = newoid; } /* * Insert the OID into the parent's list sorted by OID number. 
*/ retry: q = NULL; SLIST_FOREACH(p, parent, oid_link) { /* check if the current OID number is in use */ if (oid_number == p->oid_number) { /* get the next valid OID number */ if (oid_number < CTL_AUTO_START || oid_number == 0x7fffffff) { /* wraparound - restart */ oid_number = CTL_AUTO_START; /* don't loop forever */ if (!timeout--) panic("sysctl: Out of OID numbers\n"); goto retry; } else { oid_number++; } } else if (oid_number < p->oid_number) break; q = p; } /* check for non-auto OID number collision */ if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START && oid_number >= CTL_AUTO_START) { printf("sysctl: OID number(%d) is already in use for '%s'\n", oidp->oid_number, oidp->oid_name); } /* update the OID number, if any */ oidp->oid_number = oid_number; if (q != NULL) SLIST_INSERT_AFTER(q, oidp, oid_link); else SLIST_INSERT_HEAD(parent, oidp, oid_link); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && #ifdef VIMAGE (oidp->oid_kind & CTLFLAG_VNET) == 0 && #endif (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { /* only fetch value once */ oidp->oid_kind |= CTLFLAG_NOFETCH; /* try to fetch value from kernel environment */ sysctl_load_tunable_by_oid_locked(oidp); } } void sysctl_register_disabled_oid(struct sysctl_oid *oidp) { /* * Mark the leaf as dormant if it's not to be immediately enabled. * We do not disable nodes as they can be shared between modules * and it is always safe to access a node. */ KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("internal flag is set in oid_kind")); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) oidp->oid_kind |= CTLFLAG_DORMANT; sysctl_register_oid(oidp); } void sysctl_enable_oid(struct sysctl_oid *oidp) { SYSCTL_ASSERT_WLOCKED(); if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("sysctl node is marked as dormant")); return; } KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) != 0, ("enabling already enabled sysctl oid")); oidp->oid_kind &= ~CTLFLAG_DORMANT; } void sysctl_unregister_oid(struct sysctl_oid *oidp) { struct sysctl_oid *p; int error; SYSCTL_ASSERT_WLOCKED(); if (oidp->oid_number == OID_AUTO) { error = EINVAL; } else { error = ENOENT; SLIST_FOREACH(p, oidp->oid_parent, oid_link) { if (p == oidp) { SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); error = 0; break; } } } /* * This can happen when a module fails to register and is * being unloaded afterwards. It should not be a panic() * for normal use. */ if (error) { printf("%s: failed(%d) to unregister sysctl(%s)\n", __func__, error, oidp->oid_name); } } /* Initialize a new context to keep track of dynamically added sysctls. */ int sysctl_ctx_init(struct sysctl_ctx_list *c) { if (c == NULL) { return (EINVAL); } /* * No locking here, the caller is responsible for not adding * new nodes to a context until after this function has * returned. */ TAILQ_INIT(c); return (0); } /* Free the context, and destroy all dynamic oids registered in this context */ int sysctl_ctx_free(struct sysctl_ctx_list *clist) { struct sysctl_ctx_entry *e, *e1; int error; error = 0; /* * First perform a "dry run" to check if it's ok to remove oids. * XXX FIXME * XXX This algorithm is a hack. But I don't know any * XXX better solution for now... */ SYSCTL_WLOCK(); TAILQ_FOREACH(e, clist, link) { error = sysctl_remove_oid_locked(e->entry, 0, 0); if (error) break; } /* * Restore deregistered entries, either from the end, * or from the place where error occurred. 
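* (the dry run above deregisters each entry without freeing it, so everything taken out so far has to be re-registered before either bailing out with EBUSY or doing the real removal below).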
* e contains the entry that was not unregistered */ if (error) e1 = TAILQ_PREV(e, sysctl_ctx_list, link); else e1 = TAILQ_LAST(clist, sysctl_ctx_list); while (e1 != NULL) { sysctl_register_oid(e1->entry); e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); } if (error) { SYSCTL_WUNLOCK(); return(EBUSY); } /* Now really delete the entries */ e = TAILQ_FIRST(clist); while (e != NULL) { e1 = TAILQ_NEXT(e, link); error = sysctl_remove_oid_locked(e->entry, 1, 0); if (error) panic("sysctl_remove_oid: corrupt tree, entry: %s", e->entry->oid_name); free(e, M_SYSCTLOID); e = e1; } SYSCTL_WUNLOCK(); return (error); } /* Add an entry to the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); e->entry = oidp; TAILQ_INSERT_HEAD(clist, e, link); return (e); } /* Find an entry in the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); TAILQ_FOREACH(e, clist, link) { if(e->entry == oidp) return(e); } return (e); } /* * Delete an entry from the context. * NOTE: this function doesn't free oidp! You have to remove it * with sysctl_remove_oid(). */ int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return (EINVAL); SYSCTL_WLOCK(); e = sysctl_ctx_entry_find(clist, oidp); if (e != NULL) { TAILQ_REMOVE(clist, e, link); SYSCTL_WUNLOCK(); free(e, M_SYSCTLOID); return (0); } else { SYSCTL_WUNLOCK(); return (ENOENT); } } /* * Remove dynamically created sysctl trees. * oidp - top of the tree to be removed * del - if 0 - just deregister, otherwise free up entries as well * recurse - if != 0 traverse the subtree to be deleted */ int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) { int error; SYSCTL_WLOCK(); error = sysctl_remove_oid_locked(oidp, del, recurse); SYSCTL_WUNLOCK(); return (error); } int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; error = ENOENT; SYSCTL_WLOCK(); SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) { if (strcmp(p->oid_name, name) == 0) { error = sysctl_remove_oid_locked(p, del, recurse); break; } } SYSCTL_WUNLOCK(); return (error); } static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; SYSCTL_ASSERT_WLOCKED(); if (oidp == NULL) return(EINVAL); if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { printf("Warning: can't remove non-dynamic nodes (%s)!\n", oidp->oid_name); return (EINVAL); } /* * WARNING: normal method to do this should be through * sysctl_ctx_free(). Use recursing as the last resort * method to purge your sysctl tree of leftovers... * However, if some other code still references these nodes, * it will panic. 
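* An oid that is still shared (oid_refcnt > 1) is only dereferenced below; otherwise it is unregistered and, when del is set, we wait for any handler still running (oid_running) to drain before the oid and its strings are freed.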
*/ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(oidp), oid_link, tmp) { if (!recurse) { printf("Warning: failed attempt to " "remove oid %s with child %s\n", oidp->oid_name, p->oid_name); return (ENOTEMPTY); } error = sysctl_remove_oid_locked(p, del, recurse); if (error) return (error); } } } if (oidp->oid_refcnt > 1 ) { oidp->oid_refcnt--; } else { if (oidp->oid_refcnt == 0) { printf("Warning: bad oid_refcnt=%u (%s)!\n", oidp->oid_refcnt, oidp->oid_name); return (EINVAL); } sysctl_unregister_oid(oidp); if (del) { /* * Wait for all threads running the handler to drain. * This preserves the previous behavior when the * sysctl lock was held across a handler invocation, * and is necessary for module unload correctness. */ while (oidp->oid_running > 0) { oidp->oid_kind |= CTLFLAG_DYING; SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0); } if (oidp->oid_descr) free(__DECONST(char *, oidp->oid_descr), M_SYSCTLOID); if (oidp->oid_label) free(__DECONST(char *, oidp->oid_label), M_SYSCTLOID); free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID); free(oidp, M_SYSCTLOID); } } return (0); } /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init(). */ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int number, const char *name, int kind, void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr, const char *label) { struct sysctl_oid *oidp; /* You have to hook up somewhere.. */ if (parent == NULL) return(NULL); /* Check if the node already exists, otherwise create it */ SYSCTL_WLOCK(); oidp = sysctl_find_oidname(name, parent); if (oidp != NULL) { if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { oidp->oid_refcnt++; /* Update the context */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); SYSCTL_WUNLOCK(); return (oidp); } else { sysctl_warn_reuse(__func__, oidp); SYSCTL_WUNLOCK(); return (NULL); } } oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); oidp->oid_parent = parent; SLIST_INIT(&oidp->oid_children); oidp->oid_number = number; oidp->oid_refcnt = 1; oidp->oid_name = strdup(name, M_SYSCTLOID); oidp->oid_handler = handler; oidp->oid_kind = CTLFLAG_DYN | kind; oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; oidp->oid_fmt = fmt; if (descr != NULL) oidp->oid_descr = strdup(descr, M_SYSCTLOID); if (label != NULL) oidp->oid_label = strdup(label, M_SYSCTLOID); /* Update the context, if used */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); /* Register this oid */ sysctl_register_oid(oidp); SYSCTL_WUNLOCK(); return (oidp); } /* * Rename an existing oid. */ void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name) { char *newname; char *oldname; newname = strdup(name, M_SYSCTLOID); SYSCTL_WLOCK(); oldname = __DECONST(char *, oidp->oid_name); oidp->oid_name = newname; SYSCTL_WUNLOCK(); free(oldname, M_SYSCTLOID); } /* * Reparent an existing oid. */ int sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent) { struct sysctl_oid *oidp; SYSCTL_WLOCK(); if (oid->oid_parent == parent) { SYSCTL_WUNLOCK(); return (0); } oidp = sysctl_find_oidname(oid->oid_name, parent); if (oidp != NULL) { SYSCTL_WUNLOCK(); return (EEXIST); } sysctl_unregister_oid(oid); oid->oid_parent = parent; oid->oid_number = OID_AUTO; sysctl_register_oid(oid); SYSCTL_WUNLOCK(); return (0); } /* * Register the kernel's oids on startup. 
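* Every static SYSCTL_* declaration is collected into the sysctl_set linker set and registered in one pass by sysctl_register_all() at SYSINIT time.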
*/ SET_DECLARE(sysctl_set, struct sysctl_oid); static void sysctl_register_all(void *arg) { struct sysctl_oid **oidp; sx_init(&sysctlmemlock, "sysctl mem"); SYSCTL_INIT(); SYSCTL_WLOCK(); SET_FOREACH(oidp, sysctl_set) sysctl_register_oid(*oidp); SYSCTL_WUNLOCK(); } SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, NULL); /* * "Staff-functions" * * These functions implement a presently undocumented interface * used by the sysctl program to walk the tree, and get the type * so it can print the value. * This interface is under work and consideration, and should probably * be killed with a big axe by the first person who can find the time. * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. * - * {0,0} printf the entire MIB-tree. - * {0,1,...} return the name of the "..." OID. - * {0,2,...} return the next OID. - * {0,3} return the OID of the name in "new" - * {0,4,...} return the kind & format info for the "..." OID. - * {0,5,...} return the description of the "..." OID. - * {0,6,...} return the aggregation label of the "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_DEBUG} printf the entire MIB-tree. + * {CTL_SYSCTL, CTL_SYSCTL_NAME, ...} return the name of the "..." + * OID. + * {CTL_SYSCTL, CTL_SYSCTL_NEXT, ...} return the next OID. + * {CTL_SYSCTL, CTL_SYSCTL_NAME2OID} return the OID of the name in + * "new" + * {CTL_SYSCTL, CTL_SYSCTL_OIDFMT, ...} return the kind & format info + * for the "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_OIDDESCR, ...} return the description of the + * "..." OID. + * {CTL_SYSCTL, CTL_SYSCTL_OIDLABEL, ...} return the aggregation label of + * the "..." OID. */ #ifdef SYSCTL_DEBUG static void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) { int k; struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SLIST_FOREACH(oidp, l, oid_link) { for (k=0; k<i; k++) printf(" "); printf("%d %s ", oidp->oid_number, oidp->oid_name); printf("%c%c", oidp->oid_kind & CTLFLAG_RD ? 'R':' ', oidp->oid_kind & CTLFLAG_WR ?
'W':' '); if (oidp->oid_handler) printf(" *Handler"); switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_NODE: printf(" Node\n"); if (!oidp->oid_handler) { sysctl_sysctl_debug_dump_node( SYSCTL_CHILDREN(oidp), i + 2); } break; case CTLTYPE_INT: printf(" Int\n"); break; case CTLTYPE_UINT: printf(" u_int\n"); break; case CTLTYPE_LONG: printf(" Long\n"); break; case CTLTYPE_ULONG: printf(" u_long\n"); break; case CTLTYPE_STRING: printf(" String\n"); break; case CTLTYPE_S8: printf(" int8_t\n"); break; case CTLTYPE_S16: printf(" int16_t\n"); break; case CTLTYPE_S32: printf(" int32_t\n"); break; case CTLTYPE_S64: printf(" int64_t\n"); break; case CTLTYPE_U8: printf(" uint8_t\n"); break; case CTLTYPE_U16: printf(" uint16_t\n"); break; case CTLTYPE_U32: printf(" uint32_t\n"); break; case CTLTYPE_U64: printf(" uint64_t\n"); break; case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; default: printf("\n"); } } } static int sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; int error; error = priv_check(req->td, PRIV_SYSCTL_DEBUG); if (error) return (error); SYSCTL_RLOCK(&tracker); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); SYSCTL_RUNLOCK(&tracker); return (ENOENT); } -SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE, - 0, 0, sysctl_sysctl_debug, "-", ""); +SYSCTL_PROC(_sysctl, CTL_SYSCTL_DEBUG, debug, CTLTYPE_STRING | CTLFLAG_RD | + CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int error = 0; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; struct rm_priotracker tracker; char buf[10]; SYSCTL_RLOCK(&tracker); while (namelen) { if (!lsp) { snprintf(buf,sizeof(buf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, buf, strlen(buf)); if (error) goto out; namelen--; name++; continue; } lsp2 = NULL; SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number != *name) continue; if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); if (error) goto out; namelen--; name++; if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oid->oid_handler) break; lsp2 = SYSCTL_CHILDREN(oid); break; } lsp = lsp2; } error = SYSCTL_OUT(req, "", 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } /* * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in * capability mode. 
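* This node translates an OID vector back into its dotted name, which is what sysctl(8) relies on when printing entries it discovers by walking the tree.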
*/ -static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, - sysctl_sysctl_name, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NAME, name, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, ""); static int sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); *len = level; SLIST_FOREACH(oidp, lsp, oid_link) { *next = oidp->oid_number; *oidpp = oidp; if ((oidp->oid_kind & (CTLFLAG_SKIP | CTLFLAG_DORMANT)) != 0) continue; if (!namelen) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) /* We really should call the handler here...*/ return (0); lsp = SYSCTL_CHILDREN(oidp); if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1, len, level+1, oidpp)) return (0); goto emptynode; } if (oidp->oid_number < *name) continue; if (oidp->oid_number > *name) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) return (0); lsp = SYSCTL_CHILDREN(oidp); if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); goto next; } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) continue; if (oidp->oid_handler) continue; lsp = SYSCTL_CHILDREN(oidp); if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); next: namelen = 1; emptynode: *len = level; } return (1); } static int sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int i, j, error; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children; struct rm_priotracker tracker; int newoid[CTL_MAXNAME]; SYSCTL_RLOCK(&tracker); i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid); SYSCTL_RUNLOCK(&tracker); if (i) return (ENOENT); error = SYSCTL_OUT(req, newoid, j * sizeof (int)); return (error); } /* * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in * capability mode. 
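* Returning the successor of a given OID is what makes a full walk of the MIB (sysctl -a style iteration) possible.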
*/ -static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, - sysctl_sysctl_next, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXT, next, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static int name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; struct sysctl_oid_list *lsp = &sysctl__children; char *p; SYSCTL_ASSERT_LOCKED(); for (*len = 0; *len < CTL_MAXNAME;) { p = strsep(&name, "."); oidp = SLIST_FIRST(lsp); for (;; oidp = SLIST_NEXT(oidp, oid_link)) { if (oidp == NULL) return (ENOENT); if (strcmp(p, oidp->oid_name) == 0) break; } *oid++ = oidp->oid_number; (*len)++; if (name == NULL || *name == '\0') { if (oidpp) *oidpp = oidp; return (0); } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oidp->oid_handler) break; lsp = SYSCTL_CHILDREN(oidp); } return (ENOENT); } static int sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) { char *p; int error, oid[CTL_MAXNAME], len = 0; struct sysctl_oid *op = NULL; struct rm_priotracker tracker; char buf[32]; if (!req->newlen) return (ENOENT); if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); p = buf; if (req->newlen >= sizeof(buf)) p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); error = SYSCTL_IN(req, p, req->newlen); if (error) { if (p != buf) free(p, M_SYSCTL); return (error); } p [req->newlen] = '\0'; SYSCTL_RLOCK(&tracker); error = name2oid(p, oid, &len, &op); SYSCTL_RUNLOCK(&tracker); if (p != buf) free(p, M_SYSCTL); if (error) return (error); error = SYSCTL_OUT(req, oid, len * sizeof *oid); return (error); } /* * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in * capability mode. */ -SYSCTL_PROC(_sysctl, 3, name2oid, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE - | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); +SYSCTL_PROC(_sysctl, CTL_SYSCTL_NAME2OID, name2oid, CTLTYPE_INT | CTLFLAG_RW | + CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0, + sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_fmt == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (error) goto out; error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } -static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, - sysctl_sysctl_oidfmt, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDFMT, oidfmt, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_descr == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } -static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD, - sysctl_sysctl_oiddescr, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDDESCR, oiddescr, CTLFLAG_RD | + CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, ""); static int sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; 
SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_label == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_label, strlen(oid->oid_label) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } -static SYSCTL_NODE(_sysctl, 6, oidlabel, - CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); +static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDLABEL, oidlabel, CTLFLAG_RD | + CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); /* * Default "handler" functions. */ /* * Handle a bool. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_bool(SYSCTL_HANDLER_ARGS) { uint8_t temp; int error; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) temp = *(bool *)arg1 ? 1 : 0; else temp = arg2 ? 1 : 0; error = SYSCTL_OUT(req, &temp, sizeof(temp)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else { error = SYSCTL_IN(req, &temp, sizeof(temp)); if (!error) *(bool *)arg1 = temp ? 1 : 0; } return (error); } /* * Handle an int8_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_8(SYSCTL_HANDLER_ARGS) { int8_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int8_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int16_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_16(SYSCTL_HANDLER_ARGS) { int16_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int16_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int32_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_32(SYSCTL_HANDLER_ARGS) { int32_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int32_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { int tmpout, error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(int)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* * Based on on sysctl_handle_int() convert milliseconds into ticks. * Note: this is used by TCP. 
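* The backing variable holds ticks: it is exported as ms = ticks * 1000 / hz and a new setting is converted back with ticks = ms * hz / 1000, rejecting values that would round down below one tick (e.g. with hz = 1000, a stored 200 ticks reads back as 200 ms).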
*/ int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) { int error, s, tt; tt = *(int *)arg1; s = (int)((int64_t)tt * 1000 / hz); error = sysctl_handle_int(oidp, &s, 0, req); if (error || !req->newptr) return (error); tt = (int)((int64_t)s * hz / 1000); if (tt < 1) return (EINVAL); *(int *)arg1 = tt; return (0); } /* * Handle a long, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { int error = 0; long tmplong; #ifdef SCTL_MASK32 int tmpint; #endif /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmplong = *(long *)arg1; else tmplong = arg2; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { tmpint = tmplong; error = SYSCTL_OUT(req, &tmpint, sizeof(int)); } else #endif error = SYSCTL_OUT(req, &tmplong, sizeof(long)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; #ifdef SCTL_MASK32 else if (req->flags & SCTL_MASK32) { error = SYSCTL_IN(req, &tmpint, sizeof(int)); *(long *)arg1 = (long)tmpint; } #endif else error = SYSCTL_IN(req, arg1, sizeof(long)); return (error); } /* * Handle a 64 bit int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_64(SYSCTL_HANDLER_ARGS) { int error = 0; uint64_t tmpout; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(uint64_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(uint64_t)); return (error); } /* * Handle our generic '\0' terminated 'C' string. * Two cases: * a variable string: point arg1 at it, arg2 is max length. * a constant string: point arg1 at it, arg2 is zero. */ int sysctl_handle_string(SYSCTL_HANDLER_ARGS) { size_t outlen; int error = 0, ro_string = 0; /* * A zero-length buffer indicates a fixed size read-only * string. In ddb, don't worry about trying to make a malloced * snapshot. */ if (arg2 == 0 || kdb_active) { arg2 = strlen((char *)arg1) + 1; ro_string = 1; } if (req->oldptr != NULL) { char *tmparg; if (ro_string) { tmparg = arg1; } else { /* try to make a coherent snapshot of the string */ tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); memcpy(tmparg, arg1, arg2); } outlen = strnlen(tmparg, arg2 - 1) + 1; error = SYSCTL_OUT(req, tmparg, outlen); if (!ro_string) free(tmparg, M_SYSCTLTMP); } else { outlen = strnlen((char *)arg1, arg2 - 1) + 1; error = SYSCTL_OUT(req, NULL, outlen); } if (error || !req->newptr) return (error); if ((req->newlen - req->newidx) >= arg2) { error = EINVAL; } else { arg2 = (req->newlen - req->newidx); error = SYSCTL_IN(req, arg1, arg2); ((char *)arg1)[arg2] = '\0'; } return (error); } /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. */ int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) { int error, tries; u_int generation; struct sysctl_req req2; /* * Attempt to get a coherent snapshot, by using the thread * pre-emption counter updated from within mi_switch() to * determine if we were pre-empted during a bcopy() or * copyout(). Make 3 attempts at doing this before giving up. * If we encounter an error, stop immediately. 
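* (td_generation is advanced in mi_switch(), so an unchanged value after the copy means the thread was never switched out and the snapshot is consistent.)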
*/ tries = 0; req2 = *req; retry: generation = curthread->td_generation; error = SYSCTL_OUT(req, arg1, arg2); if (error) return (error); tries++; if (generation != curthread->td_generation && tries < 3) { *req = req2; goto retry; } error = SYSCTL_IN(req, arg1, arg2); return (error); } /* * Based on on sysctl_handle_int() convert microseconds to a sbintime. */ int sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS) { int error; int64_t tt; sbintime_t sb; tt = *(int64_t *)arg1; sb = sbttous(tt); error = sysctl_handle_64(oidp, &sb, 0, req); if (error || !req->newptr) return (error); tt = ustosbt(sb); *(int64_t *)arg1 = tt; return (0); } /* * Based on on sysctl_handle_int() convert milliseconds to a sbintime. */ int sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS) { int error; int64_t tt; sbintime_t sb; tt = *(int64_t *)arg1; sb = sbttoms(tt); error = sysctl_handle_64(oidp, &sb, 0, req); if (error || !req->newptr) return (error); tt = mstosbt(sb); *(int64_t *)arg1 = tt; return (0); } /* * Convert seconds to a struct timeval. Intended for use with * intervals and thus does not permit negative seconds. */ int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS) { struct timeval *tv; int error, secs; tv = arg1; secs = tv->tv_sec; error = sysctl_handle_int(oidp, &secs, 0, req); if (error || req->newptr == NULL) return (error); if (secs < 0) return (EINVAL); tv->tv_sec = secs; return (0); } /* * Transfer functions to/from kernel space. * XXX: rather untested at this point */ static int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; if (req->oldptr) { i = l; if (req->oldlen <= req->oldidx) i = 0; else if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) bcopy(p, (char *)req->oldptr + req->oldidx, i); } req->oldidx += l; if (req->oldptr && i != l) return (ENOMEM); return (0); } static int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); bcopy((const char *)req->newptr + req->newidx, p, l); req->newidx += l; return (0); } int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr= old; } if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; error = sysctl_root(0, name, namelen, &req); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int oid[CTL_MAXNAME]; size_t oidlen, plen; int error; - oid[0] = 0; /* sysctl internal magic */ - oid[1] = 3; /* name2oid */ + oid[0] = CTL_SYSCTL; + oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, strlen(name), &plen, flags); if (error) return (error); error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, new, newlen, retval, flags); return (error); } /* * Transfer function to/from user space. 
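* Old (output) data is pushed out with copyout(), or copyout_nofault() once the destination has been wired by sysctl_wire_old_buffer(); new (input) data is fetched with copyin().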
*/ static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { size_t i, len, origidx; int error; origidx = req->oldidx; req->oldidx += l; if (req->oldptr == NULL) return (0); /* * If we have not wired the user supplied buffer and we are currently * holding locks, drop a witness warning, as it's possible that * write operations to the user page can sleep. */ if (req->lock != REQ_WIRED) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_old_user()"); i = l; len = req->validlen; if (len <= origidx) i = 0; else { if (i > len - origidx) i = len - origidx; if (req->lock == REQ_WIRED) { error = copyout_nofault(p, (char *)req->oldptr + origidx, i); } else error = copyout(p, (char *)req->oldptr + origidx, i); if (error != 0) return (error); } if (i < l) return (ENOMEM); return (0); } static int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_new_user()"); error = copyin((const char *)req->newptr + req->newidx, p, l); req->newidx += l; return (error); } /* * Wire the user space destination buffer. If set to a value greater than * zero, the len parameter limits the maximum amount of wired memory. */ int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len) { int ret; size_t wiredlen; wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen; ret = 0; if (req->lock != REQ_WIRED && req->oldptr && req->oldfunc == sysctl_old_user) { if (wiredlen != 0) { ret = vslock(req->oldptr, wiredlen); if (ret != 0) { if (ret != ENOMEM) return (ret); wiredlen = 0; } } req->lock = REQ_WIRED; req->validlen = wiredlen; } return (0); } int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req) { struct sysctl_oid_list *lsp; struct sysctl_oid *oid; int indx; SYSCTL_ASSERT_LOCKED(); lsp = &sysctl__children; indx = 0; while (indx < CTL_MAXNAME) { SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number == name[indx]) break; } if (oid == NULL) return (ENOENT); indx++; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler != NULL || indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } lsp = SYSCTL_CHILDREN(oid); } else if (indx == namelen) { if ((oid->oid_kind & CTLFLAG_DORMANT) != 0) return (ENOENT); *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } else { return (ENOTDIR); } } return (ENOENT); } /* * Traverse our tree, and find the right node, execute whatever it points * to, and return the resulting error code. */ static int sysctl_root(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error, indx, lvl; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); if (error) goto out; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* * You can't call a sysctl when it's a node, but has * no handler. Inform the user that it's a node. * The indx may or may not be the same as namelen. */ if (oid->oid_handler == NULL) { error = EISDIR; goto out; } } /* Is this sysctl writable? 
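* A write is attempted only when new data was supplied; without CTLFLAG_WR it fails with EPERM before the handler is ever invoked.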
*/ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) { error = EPERM; goto out; } KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); #ifdef CAPABILITY_MODE /* * If the process is in capability mode, then don't permit reading or * writing unless specifically granted for the node. */ if (IN_CAPABILITY_MODE(req->td)) { if ((req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD)) || (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))) { error = EPERM; goto out; } } #endif /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; error = securelevel_gt(req->td->td_ucred, lvl); if (error) goto out; } /* Is this sysctl writable by only privileged users? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { int priv; if (oid->oid_kind & CTLFLAG_PRISON) priv = PRIV_SYSCTL_WRITEJAIL; #ifdef VIMAGE else if ((oid->oid_kind & CTLFLAG_VNET) && prison_owns_vnet(req->td->td_ucred)) priv = PRIV_SYSCTL_WRITEJAIL; #endif else priv = PRIV_SYSCTL_WRITE; error = priv_check(req->td, priv); if (error) goto out; } if (!oid->oid_handler) { error = EINVAL; goto out; } if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { arg1 = (int *)arg1 + indx; arg2 -= indx; } else { arg1 = oid->oid_arg1; arg2 = oid->oid_arg2; } #ifdef MAC error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2, req); if (error != 0) goto out; #endif #ifdef VIMAGE if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL) arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); #endif error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker); out: SYSCTL_RUNLOCK(&tracker); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sysctl_args { int *name; u_int namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctl(struct thread *td, struct sysctl_args *uap) { int error, i, name[CTL_MAXNAME]; size_t j; if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); error = copyin(uap->name, &name, uap->namelen * sizeof(int)); if (error) return (error); error = userland_sysctl(td, name, uap->namelen, uap->old, uap->oldlenp, 0, uap->new, uap->newlen, &j, 0); if (error && error != ENOMEM) return (error); if (uap->oldlenp) { i = copyout(&j, uap->oldlenp, sizeof(j)); if (i) return (i); } return (error); } int kern___sysctlbyname(struct thread *td, const char *oname, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags, bool inkernel) { int oid[CTL_MAXNAME]; char namebuf[16]; char *name; size_t oidlen; int error; if (namelen > MAXPATHLEN || namelen == 0) return (EINVAL); name = namebuf; if (namelen > sizeof(namebuf)) name = malloc(namelen, M_SYSCTL, M_WAITOK); error = copyin(oname, name, namelen); if (error != 0) goto out; - oid[0] = 0; - oid[1] = 3; + oid[0] = CTL_SYSCTL; + oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, namelen, retval, flags); if (error != 0) goto out; error = userland_sysctl(td, oid, *retval / sizeof(int), old, oldlenp, inkernel, new, newlen, retval, flags); out: if (namelen > sizeof(namebuf)) free(name, M_SYSCTL); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __sysctlbyname_args { const char *name; size_t namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctlbyname(struct thread *td, struct __sysctlbyname_args *uap) { size_t rv; int error; error = kern___sysctlbyname(td, uap->name, uap->namelen, uap->old, uap->oldlenp, uap->new, 
uap->newlen, &rv, 0, 0); if (error != 0) return (error); if (uap->oldlenp != NULL) error = copyout(&rv, uap->oldlenp, sizeof(rv)); return (error); } /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. */ int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, const void *new, size_t newlen, size_t *retval, int flags) { int error = 0, memlocked; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { if (inkernel) { req.oldlen = *oldlenp; } else { error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } req.validlen = req.oldlen; req.oldptr = old; if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_user; req.newfunc = sysctl_new_user; req.lock = REQ_UNWIRED; #ifdef KTRACE if (KTRPOINT(curthread, KTR_SYSCTL)) ktrsysctl(name, namelen); #endif memlocked = 0; if (req.oldptr && req.oldlen > 4 * PAGE_SIZE) { memlocked = 1; sx_xlock(&sysctlmemlock); } CURVNET_SET(TD_TO_VNET(td)); for (;;) { req.oldidx = 0; req.newidx = 0; error = sysctl_root(0, name, namelen, &req); if (error != EAGAIN) break; kern_yield(PRI_USER); } CURVNET_RESTORE(); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (memlocked) sx_xunlock(&sysctlmemlock); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Drain into a sysctl struct. The user buffer should be wired if a page * fault would cause issue. */ static int sbuf_sysctl_drain(void *arg, const char *data, int len) { struct sysctl_req *req = arg; int error; error = SYSCTL_OUT(req, data, len); KASSERT(error >= 0, ("Got unexpected negative value %d", error)); return (error == 0 ? len : -error); } struct sbuf * sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length, struct sysctl_req *req) { /* Supply a default buffer size if none given. 
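* The sbuf drains into the request through sbuf_sysctl_drain(), so this only sizes the staging buffer, not the amount of data that can be returned.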
*/ if (buf == NULL && length == 0) length = 64; s = sbuf_new(s, buf, length, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(s, sbuf_sysctl_drain, req); return (s); } #ifdef DDB /* The current OID the debugger is working with */ static struct sysctl_oid *g_ddb_oid; /* The current flags specified by the user */ static int g_ddb_sysctl_flags; /* Check to see if the last sysctl printed */ static int g_ddb_sysctl_printed; static const int ctl_sign[CTLTYPE+1] = { [CTLTYPE_INT] = 1, [CTLTYPE_LONG] = 1, [CTLTYPE_S8] = 1, [CTLTYPE_S16] = 1, [CTLTYPE_S32] = 1, [CTLTYPE_S64] = 1, }; static const int ctl_size[CTLTYPE+1] = { [CTLTYPE_INT] = sizeof(int), [CTLTYPE_UINT] = sizeof(u_int), [CTLTYPE_LONG] = sizeof(long), [CTLTYPE_ULONG] = sizeof(u_long), [CTLTYPE_S8] = sizeof(int8_t), [CTLTYPE_S16] = sizeof(int16_t), [CTLTYPE_S32] = sizeof(int32_t), [CTLTYPE_S64] = sizeof(int64_t), [CTLTYPE_U8] = sizeof(uint8_t), [CTLTYPE_U16] = sizeof(uint16_t), [CTLTYPE_U32] = sizeof(uint32_t), [CTLTYPE_U64] = sizeof(uint64_t), }; #define DB_SYSCTL_NAME_ONLY 0x001 /* Compare with -N */ #define DB_SYSCTL_VALUE_ONLY 0x002 /* Compare with -n */ #define DB_SYSCTL_OPAQUE 0x004 /* Compare with -o */ #define DB_SYSCTL_HEX 0x008 /* Compare with -x */ #define DB_SYSCTL_SAFE_ONLY 0x100 /* Only simple types */ static const char db_sysctl_modifs[] = { 'N', 'n', 'o', 'x', }; static const int db_sysctl_modif_values[] = { DB_SYSCTL_NAME_ONLY, DB_SYSCTL_VALUE_ONLY, DB_SYSCTL_OPAQUE, DB_SYSCTL_HEX, }; /* Handlers considered safe to print while recursing */ static int (* const db_safe_handlers[])(SYSCTL_HANDLER_ARGS) = { sysctl_handle_bool, sysctl_handle_8, sysctl_handle_16, sysctl_handle_32, sysctl_handle_64, sysctl_handle_int, sysctl_handle_long, sysctl_handle_string, sysctl_handle_opaque, }; /* * Use in place of sysctl_old_kernel to print sysctl values. 
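* Output is formatted according to the type recorded in g_ddb_oid, honouring the /x (hex) modifier stashed in g_ddb_sysctl_flags.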
* * Compare to the output handling in show_var from sbin/sysctl/sysctl.c */ static int sysctl_old_ddb(struct sysctl_req *req, const void *ptr, size_t len) { const u_char *val, *p; const char *sep1; size_t intlen, slen; uintmax_t umv; intmax_t mv; int sign, ctltype, hexlen, xflag, error; /* Suppress false-positive GCC uninitialized variable warnings */ mv = 0; umv = 0; slen = len; val = p = ptr; if (ptr == NULL) { error = 0; goto out; } /* We are going to print */ g_ddb_sysctl_printed = 1; xflag = g_ddb_sysctl_flags & DB_SYSCTL_HEX; ctltype = (g_ddb_oid->oid_kind & CTLTYPE); sign = ctl_sign[ctltype]; intlen = ctl_size[ctltype]; switch (ctltype) { case CTLTYPE_NODE: case CTLTYPE_STRING: db_printf("%.*s", (int) len, (const char *) p); error = 0; goto out; case CTLTYPE_INT: case CTLTYPE_UINT: case CTLTYPE_LONG: case CTLTYPE_ULONG: case CTLTYPE_S8: case CTLTYPE_S16: case CTLTYPE_S32: case CTLTYPE_S64: case CTLTYPE_U8: case CTLTYPE_U16: case CTLTYPE_U32: case CTLTYPE_U64: hexlen = 2 + (intlen * CHAR_BIT + 3) / 4; sep1 = ""; while (len >= intlen) { switch (ctltype) { case CTLTYPE_INT: case CTLTYPE_UINT: umv = *(const u_int *)p; mv = *(const int *)p; break; case CTLTYPE_LONG: case CTLTYPE_ULONG: umv = *(const u_long *)p; mv = *(const long *)p; break; case CTLTYPE_S8: case CTLTYPE_U8: umv = *(const uint8_t *)p; mv = *(const int8_t *)p; break; case CTLTYPE_S16: case CTLTYPE_U16: umv = *(const uint16_t *)p; mv = *(const int16_t *)p; break; case CTLTYPE_S32: case CTLTYPE_U32: umv = *(const uint32_t *)p; mv = *(const int32_t *)p; break; case CTLTYPE_S64: case CTLTYPE_U64: umv = *(const uint64_t *)p; mv = *(const int64_t *)p; break; } db_printf("%s", sep1); if (xflag) db_printf("%#0*jx", hexlen, umv); else if (!sign) db_printf("%ju", umv); else if (g_ddb_oid->oid_fmt[1] == 'K') { /* Kelvins are currently unsupported. */ error = EOPNOTSUPP; goto out; } else db_printf("%jd", mv); sep1 = " "; len -= intlen; p += intlen; } error = 0; goto out; case CTLTYPE_OPAQUE: /* TODO: Support struct functions. */ /* FALLTHROUGH */ default: db_printf("Format:%s Length:%zu Dump:0x", g_ddb_oid->oid_fmt, len); while (len-- && (xflag || p < val + 16)) db_printf("%02x", *p++); if (!xflag && len > 16) db_printf("..."); error = 0; goto out; } out: req->oldidx += slen; return (error); } /* * Avoid setting new sysctl values from the debugger */ static int sysctl_new_ddb(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); /* Changing sysctls from the debugger is currently unsupported */ return (EPERM); } /* * Run a sysctl handler with the DDB oldfunc and newfunc attached. * Instead of copying any output to a buffer we'll dump it right to * the console. 
*/ static int db_sysctl(struct sysctl_oid *oidp, int *name, u_int namelen, void *old, size_t *oldlenp, size_t *retval, int flags) { struct sysctl_req req; int error; /* Setup the request */ bzero(&req, sizeof req); req.td = kdb_thread; req.oldfunc = sysctl_old_ddb; req.newfunc = sysctl_new_ddb; req.lock = REQ_UNWIRED; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr = old; } /* Setup our globals for sysctl_old_ddb */ g_ddb_oid = oidp; g_ddb_sysctl_flags = flags; g_ddb_sysctl_printed = 0; error = sysctl_root(0, name, namelen, &req); /* Reset globals */ g_ddb_oid = NULL; g_ddb_sysctl_flags = 0; if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Show a sysctl's name */ static void db_show_oid_name(int *oid, size_t nlen) { struct sysctl_oid *oidp; int qoid[CTL_MAXNAME+2]; int error; qoid[0] = 0; memcpy(qoid + 2, oid, nlen * sizeof(int)); qoid[1] = 1; error = sysctl_find_oid(qoid, nlen + 2, &oidp, NULL, NULL); if (error) db_error("sysctl name oid"); error = db_sysctl(oidp, qoid, nlen + 2, NULL, NULL, NULL, 0); if (error) db_error("sysctl name"); } /* * Check to see if an OID is safe to print from ddb. */ static bool db_oid_safe(const struct sysctl_oid *oidp) { for (unsigned int i = 0; i < nitems(db_safe_handlers); ++i) { if (oidp->oid_handler == db_safe_handlers[i]) return (true); } return (false); } /* * Show a sysctl at a specific OID * Compare to the input handling in show_var from sbin/sysctl/sysctl.c */ static int db_show_oid(struct sysctl_oid *oidp, int *oid, size_t nlen, int flags) { int error, xflag, oflag, Nflag, nflag; size_t len; xflag = flags & DB_SYSCTL_HEX; oflag = flags & DB_SYSCTL_OPAQUE; nflag = flags & DB_SYSCTL_VALUE_ONLY; Nflag = flags & DB_SYSCTL_NAME_ONLY; if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_OPAQUE && (!xflag && !oflag)) return (0); if (Nflag) { db_show_oid_name(oid, nlen); error = 0; goto out; } if (!nflag) { db_show_oid_name(oid, nlen); db_printf(": "); } if ((flags & DB_SYSCTL_SAFE_ONLY) && !db_oid_safe(oidp)) { db_printf("Skipping, unsafe to print while recursing."); error = 0; goto out; } /* Try once, and ask about the size */ len = 0; error = db_sysctl(oidp, oid, nlen, NULL, NULL, &len, flags); if (error) goto out; if (!g_ddb_sysctl_printed) /* Lie about the size */ error = db_sysctl(oidp, oid, nlen, (void *) 1, &len, NULL, flags); out: db_printf("\n"); return (error); } /* * Show all sysctls under a specific OID * Compare to sysctl_all from sbin/sysctl/sysctl.c */ static int db_show_sysctl_all(int *oid, size_t len, int flags) { struct sysctl_oid *oidp; int name1[CTL_MAXNAME + 2], name2[CTL_MAXNAME + 2]; size_t l1, l2; - name1[0] = 0; - name1[1] = 2; + name1[0] = CTL_SYSCTL; + name1[1] = CTL_SYSCTL_NEXT; l1 = 2; if (len) { memcpy(name1+2, oid, len * sizeof(int)); l1 +=len; } else { name1[2] = 1; l1++; } for (;;) { int i, error; l2 = sizeof(name2); error = kernel_sysctl(kdb_thread, name1, l1, name2, &l2, NULL, 0, &l2, 0); if (error != 0) { if (error == ENOENT) return (0); else db_error("sysctl(getnext)"); } l2 /= sizeof(int); if (l2 < (unsigned int)len) return (0); for (i = 0; i < len; i++) if (name2[i] != oid[i]) return (0); /* Find the OID in question */ error = sysctl_find_oid(name2, l2, &oidp, NULL, NULL); if (error) return (error); i = db_show_oid(oidp, name2, l2, flags | DB_SYSCTL_SAFE_ONLY); if (db_pager_quit) return (0); memcpy(name1+2, name2, l2 * sizeof(int)); l1 = 2 + l2; } } /* * Show a sysctl by its user facing string */ static 
int db_sysctlbyname(char *name, int flags) { struct sysctl_oid *oidp; int oid[CTL_MAXNAME]; int error, nlen; error = name2oid(name, oid, &nlen, &oidp); if (error) { return (error); } if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { db_show_sysctl_all(oid, nlen, flags); } else { error = db_show_oid(oidp, oid, nlen, flags); } return (error); } static void db_sysctl_cmd_usage(void) { db_printf( " sysctl [/Nnox] \n" " \n" " The name of the sysctl to show. \n" " \n" " Show a sysctl by hooking into SYSCTL_IN and SYSCTL_OUT. \n" " This will work for most sysctls, but should not be used \n" " with sysctls that are known to malloc. \n" " \n" " While recursing any \"unsafe\" sysctls will be skipped. \n" " Call sysctl directly on the sysctl to try printing the \n" " skipped sysctl. This is unsafe and may make the ddb \n" " session unusable. \n" " \n" " Arguments: \n" " /N Display only the name of the sysctl. \n" " /n Display only the value of the sysctl. \n" " /o Display opaque values. \n" " /x Display the sysctl in hex. \n" " \n" "For example: \n" "sysctl vm.v_free_min \n" "vn.v_free_min: 12669 \n" ); } /* * Show a specific sysctl similar to sysctl (8). */ DB_FUNC(sysctl, db_sysctl_cmd, db_cmd_table, CS_OWN, NULL) { char name[TOK_STRING_SIZE]; int error, i, t, flags; /* Parse the modifiers */ t = db_read_token(); if (t == tSLASH || t == tMINUS) { t = db_read_token(); if (t != tIDENT) { db_printf("Bad modifier\n"); error = EINVAL; goto out; } db_strcpy(modif, db_tok_string); } else { db_unread_token(t); modif[0] = '\0'; } flags = 0; for (i = 0; i < nitems(db_sysctl_modifs); i++) { if (strchr(modif, db_sysctl_modifs[i])) { flags |= db_sysctl_modif_values[i]; } } /* Parse the sysctl names */ t = db_read_token(); if (t != tIDENT) { db_printf("Need sysctl name\n"); error = EINVAL; goto out; } /* Copy the name into a temporary buffer */ db_strcpy(name, db_tok_string); /* Ensure there is no trailing cruft */ t = db_read_token(); if (t != tEOL) { db_printf("Unexpected sysctl argument\n"); error = EINVAL; goto out; } error = db_sysctlbyname(name, flags); if (error == ENOENT) { db_printf("unknown oid: '%s'\n", db_tok_string); goto out; } else if (error) { db_printf("%s: error: %d\n", db_tok_string, error); goto out; } out: /* Ensure we eat all of our text */ db_flush_lex(); if (error == EINVAL) { db_sysctl_cmd_usage(); } } #endif /* DDB */ Index: projects/clang900-import/sys/kern/vfs_cluster.c =================================================================== --- projects/clang900-import/sys/kern/vfs_cluster.c (revision 352536) +++ projects/clang900-import/sys/kern/vfs_cluster.c (revision 352537) @@ -1,1079 +1,1097 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * Modifications/enhancements: * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_debug_cluster.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(CLUSTERDEBUG) static int rcluster= 0; SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "Debug VFS clustering code"); #endif static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); static uma_zone_t cluster_pbuf_zone; static void cluster_init(void *); static struct cluster_save *cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags); static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, struct buf *fbp); static void cluster_callback(struct buf *); static int write_behind = 1; SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); static int read_max = 64; SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, "Cluster read-ahead max block count"); static int read_min = 1; SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0, "Cluster read min block count"); SYSINIT(cluster, SI_SUB_CPU, SI_ORDER_ANY, cluster_init, NULL); static void cluster_init(void *dummy) { cluster_pbuf_zone = pbuf_zsecond_create("clpbuf", nswbuf / 2); } /* * Read data to a buf, including read-ahead if we find this to be beneficial. * cluster_read replaces bread. */ int cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, struct ucred *cred, long totread, int seqcount, int gbflags, struct buf **bpp) { struct buf *bp, *rbp, *reqbp; struct bufobj *bo; struct thread *td; daddr_t blkno, origblkno; int maxra, racluster; int error, ncontig; int i; error = 0; td = curthread; bo = &vp->v_bufobj; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; /* * Try to limit the amount of read-ahead by a few * ad-hoc parameters. This needs work!!! */ racluster = vp->v_mount->mnt_iosize_max / size; maxra = seqcount; maxra = min(read_max, maxra); maxra = min(nbuf/8, maxra); if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) maxra = (filesize / size) - lblkno; /* * get the requested block */ error = getblkx(vp, lblkno, size, 0, 0, gbflags, &bp); if (error != 0) { *bpp = NULL; return (error); } gbflags &= ~GB_NOSPARSE; origblkno = lblkno; *bpp = reqbp = bp; /* * if it is in the cache, then check to see if the reads have been * sequential. 
If they have, then try some read-ahead, otherwise * back-off on prospective read-aheads. */ if (bp->b_flags & B_CACHE) { if (!seqcount) { return 0; } else if ((bp->b_flags & B_RAM) == 0) { return 0; } else { bp->b_flags &= ~B_RAM; BO_RLOCK(bo); for (i = 1; i < maxra; i++) { /* * Stop if the buffer does not exist or it * is invalid (about to go away?) */ rbp = gbincore(&vp->v_bufobj, lblkno+i); if (rbp == NULL || (rbp->b_flags & B_INVAL)) break; /* * Set another read-ahead mark so we know * to check again. (If we can lock the * buffer without waiting) */ if ((((i % racluster) == (racluster - 1)) || (i == (maxra - 1))) && (0 == BUF_LOCK(rbp, LK_EXCLUSIVE | LK_NOWAIT, NULL))) { rbp->b_flags |= B_RAM; BUF_UNLOCK(rbp); } } BO_RUNLOCK(bo); if (i >= maxra) { return 0; } lblkno += i; } reqbp = bp = NULL; /* * If it isn't in the cache, then get a chunk from * disk if sequential, otherwise just get the block. */ } else { off_t firstread = bp->b_offset; int nblks; long minread; KASSERT(bp->b_offset != NOOFFSET, ("cluster_read: no buffer offset")); ncontig = 0; /* * Adjust totread if needed */ minread = read_min * size; if (minread > totread) totread = minread; /* * Compute the total number of blocks that we should read * synchronously. */ if (firstread + totread > filesize) totread = filesize - firstread; nblks = howmany(totread, size); if (nblks > racluster) nblks = racluster; /* * Now compute the number of contiguous blocks. */ if (nblks > 1) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); /* * If this failed to map just do the original block. */ if (error || blkno == -1) ncontig = 0; } /* * If we have contiguous data available do a cluster * otherwise just read the requested block. */ if (ncontig) { /* Account for our first block. */ ncontig = min(ncontig + 1, nblks); if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, blkno, size, nblks, gbflags, bp); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; bp->b_iocmd = BIO_READ; lblkno += 1; } } /* * handle the synchronous read so that it is available ASAP. */ if (bp) { if ((bp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(bp, 0); } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, bp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; } /* * If we have been doing sequential I/O, then do some read-ahead. */ while (lblkno < (origblkno + maxra)) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); if (error) break; if (blkno == -1) break; /* * We could throttle ncontig here by maxra but we might as * well read the data if it is contiguous. We're throttled * by racluster anyway. 
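* Each pass either builds another cluster with cluster_rbuild() or falls back to a plain asynchronous single-block read; buffers that turn out to be cached or dirty (B_DELWRI) are simply released again.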
*/ if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, size, ncontig, gbflags, NULL); lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } } else { rbp = getblk(vp, lblkno, size, 0, 0, gbflags); lblkno += 1; if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } rbp->b_flags |= B_ASYNC | B_RAM; rbp->b_iocmd = BIO_READ; rbp->b_blkno = blkno; } if (rbp->b_flags & B_CACHE) { rbp->b_flags &= ~B_ASYNC; bqrelse(rbp); continue; } if ((rbp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(rbp, 0); } rbp->b_flags &= ~B_INVAL; rbp->b_ioflags &= ~BIO_ERROR; if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) BUF_KERNPROC(rbp); rbp->b_iooffset = dbtob(rbp->b_blkno); bstrategy(rbp); #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, rbp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; } if (reqbp) { /* * Like bread, always brelse() the buffer when * returning an error. */ error = bufwait(reqbp); if (error != 0) { brelse(reqbp); *bpp = NULL; } } return (error); } /* * If blocks are contiguous on disk, use this to provide clustered * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. */ static struct buf * cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) { struct buf *bp, *tbp; daddr_t bn; off_t off; long tinc, tsize; int i, inc, j, k, toff; KASSERT(size == vp->v_mount->mnt_stat.f_iosize, ("cluster_rbuild: size %ld != f_iosize %jd\n", size, (intmax_t)vp->v_mount->mnt_stat.f_iosize)); /* * avoid a division */ while ((u_quad_t) size * (lbn + run) > filesize) { --run; } if (fbp) { tbp = fbp; tbp->b_iocmd = BIO_READ; } else { tbp = getblk(vp, lbn, size, 0, 0, gbflags); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; tbp->b_iocmd = BIO_READ; } tbp->b_blkno = blkno; if( (tbp->b_flags & B_MALLOC) || ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT); if (bp == NULL) return tbp; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. */ bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; } else { bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); } bp->b_iocmd = BIO_READ; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; bp->b_lblkno = lbn; bp->b_offset = tbp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); pbgetvp(vp, bp); TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; inc = btodb(size); for (bn = blkno, i = 0; i < run; ++i, bn += inc) { if (i == 0) { VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); vfs_drain_busy_pages(tbp); vm_object_pip_add(tbp->b_bufobj->bo_object, tbp->b_npages); for (k = 0; k < tbp->b_npages; k++) vm_page_sbusy(tbp->b_pages[k]); VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); } else { if ((bp->b_npages * PAGE_SIZE) + round_page(size) > vp->v_mount->mnt_iosize_max) { break; } tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT | (gbflags & GB_UNMAPPED)); /* Don't wait around for locked bufs. 
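 * (GB_LOCK_NOWAIT makes getblk() return NULL here instead of sleeping
 * on a locked buffer.)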
*/ if (tbp == NULL) break; /* * Stop scanning if the buffer is fully valid * (marked B_CACHE), or locked (may be doing a * background write), or if the buffer is not * VMIO backed. The clustering code can only deal * with VMIO-backed buffers. The bo lock is not * required for the BKGRDINPROG check since it * can not be set without the buf lock. */ if ((tbp->b_vflags & BV_BKGRDINPROG) || (tbp->b_flags & B_CACHE) || (tbp->b_flags & B_VMIO) == 0) { bqrelse(tbp); break; } /* * The buffer must be completely invalid in order to * take part in the cluster. If it is partially valid * then we stop. */ off = tbp->b_offset; tsize = size; VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); for (j = 0; tsize > 0; j++) { toff = off & PAGE_MASK; tinc = tsize; if (toff + tinc > PAGE_SIZE) tinc = PAGE_SIZE - toff; VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object); if ((tbp->b_pages[j]->valid & vm_page_bits(toff, tinc)) != 0) break; if (vm_page_xbusied(tbp->b_pages[j])) break; vm_object_pip_add(tbp->b_bufobj->bo_object, 1); vm_page_sbusy(tbp->b_pages[j]); off += tinc; tsize -= tinc; } if (tsize > 0) { clean_sbusy: vm_object_pip_wakeupn(tbp->b_bufobj->bo_object, j); for (k = 0; k < j; k++) vm_page_sunbusy(tbp->b_pages[k]); VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); bqrelse(tbp); break; } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); /* * Set a read-ahead mark as appropriate */ if ((fbp && (i == 1)) || (i == (run - 1))) tbp->b_flags |= B_RAM; /* * Set the buffer up for an async read (XXX should * we do this only if we do not wind up brelse()ing?). * Set the block number if it isn't set, otherwise * if it is make sure it matches the block number we * expect. */ tbp->b_flags |= B_ASYNC; tbp->b_iocmd = BIO_READ; if (tbp->b_blkno == tbp->b_lblkno) { tbp->b_blkno = bn; } else if (tbp->b_blkno != bn) { VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); goto clean_sbusy; } } /* * XXX fbp from caller may not be B_ASYNC, but we are going * to biodone() it in cluster_callback() anyway */ BUF_KERNPROC(tbp); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); for (j = 0; j < tbp->b_npages; j += 1) { vm_page_t m; m = tbp->b_pages[j]; if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages-1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } if (m->valid == VM_PAGE_BITS_ALL) tbp->b_pages[j] = bogus_page; } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); /* * Don't inherit tbp->b_bufsize as it may be larger due to * a non-page-aligned size. Instead just aggregate using * 'size'. */ if (tbp->b_bcount != size) printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); if (tbp->b_bufsize != size) printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); bp->b_bcount += size; bp->b_bufsize += size; } /* * Fully valid pages in the cluster are already good and do not need * to be re-read from disk. Replace the page with bogus_page */ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (j = 0; j < bp->b_npages; j++) { VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object); if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL) bp->b_pages[j] = bogus_page; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); if (bp->b_bufsize > bp->b_kvasize) panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); if (buf_mapped(bp)) { pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); } return (bp); } /* * Cleanup after a clustered read or write. 
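 * cluster_callback is installed as b_iodone on the pbuf synthesized by
 * cluster_rbuild()/cluster_wbuild(), so it runs when the cluster I/O
 * completes.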
* This is complicated by the fact that any of the buffers might have * extra memory (if there were no empty buffer headers at allocbuf time) * that we will need to shift around. */ static void cluster_callback(struct buf *bp) { struct buf *nbp, *tbp; int error = 0; /* * Must propagate errors to all the components. */ if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; if (buf_mapped(bp)) { pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); } /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); tbp; tbp = nbp) { nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); if (error) { tbp->b_ioflags |= BIO_ERROR; tbp->b_error = error; } else { tbp->b_dirtyoff = tbp->b_dirtyend = 0; tbp->b_flags &= ~B_INVAL; tbp->b_ioflags &= ~BIO_ERROR; /* * XXX the bdwrite()/bqrelse() issued during * cluster building clears B_RELBUF (see bqrelse() * comment). If direct I/O was specified, we have * to restore it here to allow the buffer and VM * to be freed. */ if (tbp->b_flags & B_DIRECT) tbp->b_flags |= B_RELBUF; } bufdone(tbp); } pbrelvp(bp); uma_zfree(cluster_pbuf_zone, bp); } /* * cluster_wbuild_wb: * * Implement modified write build for cluster. * * write_behind = 0 write behind disabled * write_behind = 1 write behind normal (default) * write_behind = 2 write behind backed-off */ static __inline int cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, int gbflags) { int r = 0; switch (write_behind) { case 2: if (start_lbn < len) break; start_lbn -= len; /* FALLTHROUGH */ case 1: r = cluster_wbuild(vp, size, start_lbn, len, gbflags); /* FALLTHROUGH */ default: /* FALLTHROUGH */ break; } return(r); } /* * Do clustered write for FFS. * * Three cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, int gbflags) { daddr_t lbn; int maxclen, cursize; int lblocksize; int async; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; if (vp->v_type == VREG) { async = DOINGASYNC(vp); lblocksize = vp->v_mount->mnt_stat.f_iosize; } else { async = 0; lblocksize = bp->b_bufsize; } lbn = bp->b_lblkno; KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. * * If we are not writing at end of file, the process * seeked to another point in the file since its last * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. * * Change to algorithm: only push previous cluster if * it was sequential from the point of view of the * seqcount heuristic, otherwise leave the buffer * intact so we can potentially optimize the I/O * later on in the buf_daemon or update daemon * flush. 
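 * cursize, computed below, is the length in logical blocks of the
 * cluster accumulated so far (v_cstart through v_lastw).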
*/ cursize = vp->v_lastw - vp->v_cstart + 1; if (((u_quad_t) bp->b_offset + lblocksize) != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize, gbflags); } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; buflist = cluster_collectbufs(vp, bp, gbflags); + if (buflist == NULL) { + /* + * Cluster build failed so just write + * it now. + */ + bawrite(bp); + return; + } endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* * Failed, push the previous cluster * if *really* writing sequentially * in the logical file (seqcount > 1), * otherwise delay it in the hopes that * the low level disk driver can * optimize the write ordering. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize, gbflags); } } else { /* * Succeeded, keep building cluster. */ for (bpp = buflist->bs_children; bpp <= endbp; bpp++) bdwrite(*bpp); free(buflist, M_SEGMENT); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; return; } } } /* * Consider beginning a cluster. If at end of file, make * cluster as large as possible, otherwise find size of * existing cluster. */ if ((vp->v_type == VREG) && ((u_quad_t) bp->b_offset + lblocksize) != filesize && (bp->b_blkno == bp->b_lblkno) && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; vp->v_lasta = bp->b_blkno; vp->v_cstart = lbn + 1; vp->v_lastw = lbn; return; } vp->v_clen = maxclen; if (!async && maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; bawrite(bp); } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out if seqcount tells us we * are operating sequentially, otherwise let the buf or * update daemon handle it. */ bdwrite(bp); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1, gbflags); } vp->v_clen = 0; vp->v_cstart = lbn + 1; } else if (vm_page_count_severe()) { /* * We are low on memory, get it going NOW */ bawrite(bp); } else { /* * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); } vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; } /* * This is an awful lot like cluster_rbuild...wish they could be combined. * The last lbn argument is the current block on which I/O is being * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). */ int cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, int gbflags) { struct buf *bp, *tbp; struct bufobj *bo; int i, j; int totalwritten = 0; int dbsize = btodb(size); if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; bo = &vp->v_bufobj; while (len > 0) { /* * If the buffer is not delayed-write (i.e. dirty), or it * is delayed-write but either locked or inval, it cannot * partake in the clustered write. */ BO_LOCK(bo); if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { BO_UNLOCK(bo); ++start_lbn; --len; continue; } if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) { ++start_lbn; --len; continue; } if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { BUF_UNLOCK(tbp); ++start_lbn; --len; continue; } bremfree(tbp); tbp->b_flags &= ~B_DONE; /* * Extra memory in the buffer, punt on this buffer. 
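 * ("Extra memory" means b_bufsize differs from b_bcount; that mismatch
 * is one of the checks below.)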
* XXX we could handle this in most cases, but we would * have to push the extra memory down to after our max * possible cluster size and then potentially pull it back * up if the cluster was terminated prematurely--too much * hassle. */ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != (B_CLUSTEROK | B_VMIO)) || (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || ((bp = uma_zalloc(cluster_pbuf_zone, (vp->v_vflag & VV_MD) != 0 ? M_NOWAIT : M_WAITOK)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } /* * We got a pbuf to make the cluster in. * so initialise it. */ TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; if (tbp->b_wcred != NOCRED) bp->b_wcred = crhold(tbp->b_wcred); bp->b_blkno = tbp->b_blkno; bp->b_lblkno = tbp->b_lblkno; bp->b_offset = tbp->b_offset; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. */ if ((gbflags & GB_UNMAPPED) == 0 || (tbp->b_flags & B_VMIO) == 0) { bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); } else { bp->b_data = unmapped_buf; } bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); /* * From this location in the file, scan forward to see * if there are buffers with adjacent data that need to * be written as well. */ for (i = 0; i < len; ++i, ++start_lbn) { if (i != 0) { /* If not the first buffer */ /* * If the adjacent data is not even in core it * can't need to be written. */ BO_LOCK(bo); if ((tbp = gbincore(bo, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { BO_UNLOCK(bo); break; } /* * If it IS in core, but has different * characteristics, or is locked (which * means it could be undergoing a background * I/O or be in a weird state), then don't * cluster with it. */ if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) break; if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | B_INVAL | B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_CLUSTEROK | (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || tbp->b_wcred != bp->b_wcred) { BUF_UNLOCK(tbp); break; } /* * Check that the combined cluster * would make sense with regard to pages * and would not be too large */ if ((tbp->b_bcount != size) || ((bp->b_blkno + (dbsize * i)) != tbp->b_blkno) || ((tbp->b_npages + bp->b_npages) > (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { BUF_UNLOCK(tbp); break; } /* * Ok, it's passed all the tests, * so remove it from the free list * and mark it busy. We will use it. */ bremfree(tbp); tbp->b_flags &= ~B_DONE; } /* end of code for non-first buffers only */ /* * If the IO is via the VM then we do some * special VM hackery (yuck). Since the buffer's * block size may not be page-aligned it is possible * for a page to be shared between two buffers. We * have to get rid of the duplication when building * the cluster. 
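 * Duplicates are caught below by comparing each page against the last
 * page already appended to bp->b_pages.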
*/ if (tbp->b_flags & B_VMIO) { vm_page_t m; VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); if (i == 0) { vfs_drain_busy_pages(tbp); } else { /* if not first buffer */ for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; if (vm_page_xbusied(m)) { VM_OBJECT_WUNLOCK( tbp->b_object); bqrelse(tbp); goto finishcluster; } } } for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; vm_page_sbusy(m); vm_object_pip_add(m->object, 1); if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages - 1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); } bp->b_bcount += size; bp->b_bufsize += size; /* * If any of the clustered buffers have their * B_BARRIER flag set, transfer that request to * the cluster. */ bp->b_flags |= (tbp->b_flags & B_BARRIER); tbp->b_flags &= ~(B_DONE | B_BARRIER); tbp->b_flags |= B_ASYNC; tbp->b_ioflags &= ~BIO_ERROR; tbp->b_iocmd = BIO_WRITE; bundirty(tbp); reassignbuf(tbp); /* put on clean list */ bufobj_wref(tbp->b_bufobj); BUF_KERNPROC(tbp); buf_track(tbp, __func__); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); } finishcluster: if (buf_mapped(bp)) { pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); } if (bp->b_bufsize > bp->b_kvasize) panic( "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); totalwritten += bp->b_bufsize; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bufsize; bawrite(bp); len -= i; } return totalwritten; } /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. */ static struct cluster_save * cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags) { struct cluster_save *buflist; struct buf *bp; daddr_t lbn; - int i, len; + int i, j, len, error; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { - (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, + error = bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, gbflags, &bp); + if (error != 0) { + /* + * If read fails, release collected buffers + * and return failure. + */ + for (j = 0; j < i; j++) + brelse(buflist->bs_children[j]); + free(buflist, M_SEGMENT); + return (NULL); + } buflist->bs_children[i] = bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buflist->bs_children[i] = bp = last_bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); buflist->bs_nchildren = i + 1; return (buflist); } Index: projects/clang900-import/sys/kern/vfs_default.c =================================================================== --- projects/clang900-import/sys/kern/vfs_default.c (revision 352536) +++ projects/clang900-import/sys/kern/vfs_default.c (revision 352537) @@ -1,1404 +1,1408 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed * to Berkeley by John Heidemann of the UCLA Ficus project. * * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vop_nolookup(struct vop_lookup_args *); static int vop_norename(struct vop_rename_args *); static int vop_nostrategy(struct vop_strategy_args *); static int get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf, int dirbuflen, off_t *off, char **cpos, int *len, int *eofflag, struct thread *td); static int dirent_exists(struct vnode *vp, const char *dirname, struct thread *td); #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) static int vop_stdis_text(struct vop_is_text_args *ap); static int vop_stdunset_text(struct vop_unset_text_args *ap); static int vop_stdadd_writecount(struct vop_add_writecount_args *ap); static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap); static int vop_stdfdatasync(struct vop_fdatasync_args *ap); static int vop_stdgetpages_async(struct vop_getpages_async_args *ap); static int vop_stdioctl(struct vop_ioctl_args *ap); /* * This vnode table stores what we want to do if the filesystem doesn't * implement a particular VOP. * * If there is no specific entry here, we will return EOPNOTSUPP. * * Note that every filesystem has to implement either vop_access * or vop_accessx; failing to do so will result in immediate crash * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(), * which calls vop_stdaccess() etc. 
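 *
 * A filesystem normally reaches this table by chaining through the
 * vop_default pointer of its own vop_vector, e.g. (illustrative sketch
 * only; the "foofs" names are hypothetical):
 *
 *	struct vop_vector foofs_vnodeops = {
 *		.vop_default =	&default_vnodeops,
 *		.vop_lookup =	foofs_lookup,
 *	};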
*/ struct vop_vector default_vnodeops = { .vop_default = NULL, .vop_bypass = VOP_EOPNOTSUPP, .vop_access = vop_stdaccess, .vop_accessx = vop_stdaccessx, .vop_advise = vop_stdadvise, .vop_advlock = vop_stdadvlock, .vop_advlockasync = vop_stdadvlockasync, .vop_advlockpurge = vop_stdadvlockpurge, .vop_allocate = vop_stdallocate, .vop_bmap = vop_stdbmap, .vop_close = VOP_NULL, .vop_fsync = VOP_NULL, .vop_fdatasync = vop_stdfdatasync, .vop_getpages = vop_stdgetpages, .vop_getpages_async = vop_stdgetpages_async, .vop_getwritemount = vop_stdgetwritemount, .vop_inactive = VOP_NULL, .vop_need_inactive = vop_stdneed_inactive, .vop_ioctl = vop_stdioctl, .vop_kqfilter = vop_stdkqfilter, .vop_islocked = vop_stdislocked, .vop_lock1 = vop_stdlock, .vop_lookup = vop_nolookup, .vop_open = VOP_NULL, .vop_pathconf = VOP_EINVAL, .vop_poll = vop_nopoll, .vop_putpages = vop_stdputpages, .vop_readlink = VOP_EINVAL, .vop_rename = vop_norename, .vop_revoke = VOP_PANIC, .vop_strategy = vop_nostrategy, .vop_unlock = vop_stdunlock, .vop_vptocnp = vop_stdvptocnp, .vop_vptofh = vop_stdvptofh, .vop_unp_bind = vop_stdunp_bind, .vop_unp_connect = vop_stdunp_connect, .vop_unp_detach = vop_stdunp_detach, .vop_is_text = vop_stdis_text, .vop_set_text = vop_stdset_text, .vop_unset_text = vop_stdunset_text, .vop_add_writecount = vop_stdadd_writecount, .vop_copy_file_range = vop_stdcopy_file_range, }; /* * Series of placeholder functions for various error returns for * VOPs. */ int vop_eopnotsupp(struct vop_generic_args *ap) { /* printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); */ return (EOPNOTSUPP); } int vop_ebadf(struct vop_generic_args *ap) { return (EBADF); } int vop_enotty(struct vop_generic_args *ap) { return (ENOTTY); } int vop_einval(struct vop_generic_args *ap) { return (EINVAL); } int vop_enoent(struct vop_generic_args *ap) { return (ENOENT); } int vop_null(struct vop_generic_args *ap) { return (0); } /* * Helper function to panic on some bad VOPs in some filesystems. */ int vop_panic(struct vop_generic_args *ap) { panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); } /* * vop_std and vop_no are default functions for use by * filesystems that need the "default reasonable" implementation for a * particular operation. * * The documentation for the operations they implement exists (if it exists) * in the VOP_(9) manpage (all uppercase). */ /* * Default vop for filesystems that do not support name lookup */ static int vop_nolookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { *ap->a_vpp = NULL; return (ENOTDIR); } /* * vop_norename: * * Handle unlock and reference counting for arguments of vop_rename * for filesystems that do not implement rename operation. */ static int vop_norename(struct vop_rename_args *ap) { vop_rename_fail(ap); return (EOPNOTSUPP); } /* * vop_nostrategy: * * Strategy routine for VFS devices that have none. * * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy * routine. Typically this is done for a BIO_READ strategy call. * Typically B_INVAL is assumed to already be clear prior to a write * and should not be cleared manually unless you just made the buffer * invalid. BIO_ERROR should be cleared either way. 
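 * (vop_nostrategy() below simply marks the buffer with BIO_ERROR and
 * b_error = EOPNOTSUPP and completes it via bufdone().)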
*/ static int vop_nostrategy (struct vop_strategy_args *ap) { printf("No strategy for buffer at %p\n", ap->a_bp); vn_printf(ap->a_vp, "vnode "); ap->a_bp->b_ioflags |= BIO_ERROR; ap->a_bp->b_error = EOPNOTSUPP; bufdone(ap->a_bp); return (EOPNOTSUPP); } static int get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf, int dirbuflen, off_t *off, char **cpos, int *len, int *eofflag, struct thread *td) { int error, reclen; struct uio uio; struct iovec iov; struct dirent *dp; KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); if (*len == 0) { iov.iov_base = dirbuf; iov.iov_len = dirbuflen; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = *off; uio.uio_resid = dirbuflen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; *eofflag = 0; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag, NULL, NULL); if (error) return (error); *off = uio.uio_offset; *cpos = dirbuf; *len = (dirbuflen - uio.uio_resid); if (*len == 0) return (ENOENT); } dp = (struct dirent *)(*cpos); reclen = dp->d_reclen; *dpp = dp; /* check for malformed directory.. */ if (reclen < DIRENT_MINSIZE) return (EINVAL); *cpos += reclen; *len -= reclen; return (0); } /* * Check if a named file exists in a given directory vnode. */ static int dirent_exists(struct vnode *vp, const char *dirname, struct thread *td) { char *dirbuf, *cpos; int error, eofflag, dirbuflen, len, found; off_t off; struct dirent *dp; struct vattr va; KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); found = 0; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error) return (found); dirbuflen = DEV_BSIZE; if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); off = 0; len = 0; do { error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off, &cpos, &len, &eofflag, td); if (error) goto out; if (dp->d_type != DT_WHT && dp->d_fileno != 0 && strcmp(dp->d_name, dirname) == 0) { found = 1; goto out; } } while (len > 0 || !eofflag); out: free(dirbuf, M_TEMP); return (found); } int vop_stdaccess(struct vop_access_args *ap) { KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td)); } int vop_stdaccessx(struct vop_accessx_args *ap) { int error; accmode_t accmode = ap->a_accmode; error = vfs_unixify_accmode(&accmode); if (error != 0) return (error); if (accmode == 0) return (0); return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td)); } /* * Advisory record locking support */ int vop_stdadvlock(struct vop_advlock_args *ap) { struct vnode *vp; struct vattr vattr; int error; vp = ap->a_vp; if (ap->a_fl->l_whence == SEEK_END) { /* * The NFSv4 server must avoid doing a vn_lock() here, since it * can deadlock the nfsd threads, due to a LOR. Fortunately * the NFSv4 server always uses SEEK_SET and this code is * only required for the SEEK_END case. 
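 * The file size is only needed to resolve SEEK_END relative offsets,
 * hence the VOP_GETATTR() under a shared vnode lock below; for other
 * l_whence values va_size is left at 0.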
*/ vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, curthread->td_ucred); VOP_UNLOCK(vp, 0); if (error) return (error); } else vattr.va_size = 0; return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size)); } int vop_stdadvlockasync(struct vop_advlockasync_args *ap) { struct vnode *vp; struct vattr vattr; int error; vp = ap->a_vp; if (ap->a_fl->l_whence == SEEK_END) { /* The size argument is only needed for SEEK_END. */ vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, curthread->td_ucred); VOP_UNLOCK(vp, 0); if (error) return (error); } else vattr.va_size = 0; return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size)); } int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap) { struct vnode *vp; vp = ap->a_vp; lf_purgelocks(vp, &vp->v_lockf); return (0); } /* * vop_stdpathconf: * * Standard implementation of POSIX pathconf, to get information about limits * for a filesystem. * Override per filesystem for the case where the filesystem has smaller * limits. */ int vop_stdpathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_ASYNC_IO: *ap->a_retval = _POSIX_ASYNCHRONOUS_IO; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: case _PC_CAP_PRESENT: case _PC_INF_PRESENT: case _PC_MAC_PRESENT: *ap->a_retval = 0; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Standard lock, unlock and islocked functions. */ int vop_stdlock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { struct vnode *vp = ap->a_vp; struct mtx *ilk; ilk = VI_MTX(vp); return (lockmgr_lock_fast_path(vp->v_vnlock, ap->a_flags, &ilk->lock_object, ap->a_file, ap->a_line)); } /* See above. */ int vop_stdunlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; int a_flags; } */ *ap; { struct vnode *vp = ap->a_vp; struct mtx *ilk; ilk = VI_MTX(vp); return (lockmgr_unlock_fast_path(vp->v_vnlock, ap->a_flags, &ilk->lock_object)); } /* See above. */ int vop_stdislocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap; { return (lockstatus(ap->a_vp->v_vnlock)); } /* * Return true for select/poll. */ int vop_nopoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return (poll_no_poll(ap->a_events)); } /* * Implement poll for local filesystems that support it. */ int vop_stdpoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { if (ap->a_events & ~POLLSTANDARD) return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Return our mount point, as we will take charge of the writes. */ int vop_stdgetwritemount(ap) struct vop_getwritemount_args /* { struct vnode *a_vp; struct mount **a_mpp; } */ *ap; { struct mount *mp; struct vnode *vp; /* * Note that having a reference does not prevent forced unmount from * setting ->v_mount to NULL after the lock gets released. This is of * no consequence for typical consumers (most notably vn_start_write) * since in this case the vnode is VI_DOOMED. Unmount might have * progressed far enough that its completion is only delayed by the * reference obtained here. The consumer only needs to concern itself * with releasing it. 
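 * If vfs_op_thread_enter() succeeds, the reference is taken with
 * vfs_mp_count_add_pcpu(); otherwise it is taken the slow way, under
 * MNT_ILOCK() with MNT_REF().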
*/ vp = ap->a_vp; mp = vp->v_mount; if (mp == NULL) { *(ap->a_mpp) = NULL; return (0); } if (vfs_op_thread_enter(mp)) { - if (mp == vp->v_mount) + if (mp == vp->v_mount) { vfs_mp_count_add_pcpu(mp, ref, 1); - else + vfs_op_thread_exit(mp); + } else { + vfs_op_thread_exit(mp); mp = NULL; - vfs_op_thread_exit(mp); + } } else { MNT_ILOCK(mp); - if (mp == vp->v_mount) + if (mp == vp->v_mount) { MNT_REF(mp); - else + MNT_IUNLOCK(mp); + } else { + MNT_IUNLOCK(mp); mp = NULL; - MNT_IUNLOCK(mp); + } } *(ap->a_mpp) = mp; return (0); } /* * If the file system doesn't implement VOP_BMAP, then return sensible defaults: * - Return the vnode's bufobj instead of any underlying device's bufobj * - Calculate the physical block number as if there were equal size * consecutive blocks, but * - Report no contiguous runs of blocks. */ int vop_stdbmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } int vop_stdfsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { return (vn_fsync_buf(ap->a_vp, ap->a_waitfor)); } static int vop_stdfdatasync(struct vop_fdatasync_args *ap) { return (VOP_FSYNC(ap->a_vp, MNT_WAIT, ap->a_td)); } int vop_stdfdatasync_buf(struct vop_fdatasync_args *ap) { return (vn_fsync_buf(ap->a_vp, MNT_WAIT)); } /* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ int vop_stdgetpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; } */ *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL); } static int vop_stdgetpages_async(struct vop_getpages_async_args *ap) { int error; error = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead); ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); return (error); } int vop_stdkqfilter(struct vop_kqfilter_args *ap) { return vfs_kqfilter(ap); } /* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). 
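 * (The default below just forwards to vnode_pager_generic_putpages().)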
*/ int vop_stdputpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; } */ *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } int vop_stdvptofh(struct vop_vptofh_args *ap) { return (EOPNOTSUPP); } int vop_stdvptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct ucred *cred = ap->a_cred; char *buf = ap->a_buf; int *buflen = ap->a_buflen; char *dirbuf, *cpos; int i, error, eofflag, dirbuflen, flags, locked, len, covered; off_t off; ino_t fileno; struct vattr va; struct nameidata nd; struct thread *td; struct dirent *dp; struct vnode *mvp; i = *buflen; error = 0; covered = 0; td = curthread; if (vp->v_type != VDIR) return (ENOENT); error = VOP_GETATTR(vp, &va, cred); if (error) return (error); VREF(vp); locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, "..", vp, td); flags = FREAD; error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL); if (error) { vn_lock(vp, locked | LK_RETRY); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); mvp = *dvp = nd.ni_vp; if (vp->v_mount != (*dvp)->v_mount && ((*dvp)->v_vflag & VV_ROOT) && ((*dvp)->v_mount->mnt_flag & MNT_UNION)) { *dvp = (*dvp)->v_mount->mnt_vnodecovered; VREF(mvp); VOP_UNLOCK(mvp, 0); vn_close(mvp, FREAD, cred, td); VREF(*dvp); vn_lock(*dvp, LK_SHARED | LK_RETRY); covered = 1; } fileno = va.va_fileid; dirbuflen = DEV_BSIZE; if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); if ((*dvp)->v_type != VDIR) { error = ENOENT; goto out; } off = 0; len = 0; do { /* call VOP_READDIR of parent */ error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off, &cpos, &len, &eofflag, td); if (error) goto out; if ((dp->d_type != DT_WHT) && (dp->d_fileno == fileno)) { if (covered) { VOP_UNLOCK(*dvp, 0); vn_lock(mvp, LK_SHARED | LK_RETRY); if (dirent_exists(mvp, dp->d_name, td)) { error = ENOENT; VOP_UNLOCK(mvp, 0); vn_lock(*dvp, LK_SHARED | LK_RETRY); goto out; } VOP_UNLOCK(mvp, 0); vn_lock(*dvp, LK_SHARED | LK_RETRY); } i -= dp->d_namlen; if (i < 0) { error = ENOMEM; goto out; } if (dp->d_namlen == 1 && dp->d_name[0] == '.') { error = ENOENT; } else { bcopy(dp->d_name, buf + i, dp->d_namlen); error = 0; } goto out; } } while (len > 0 || !eofflag); error = ENOENT; out: free(dirbuf, M_TEMP); if (!error) { *buflen = i; vref(*dvp); } if (covered) { vput(*dvp); vrele(mvp); } else { VOP_UNLOCK(mvp, 0); vn_close(mvp, FREAD, cred, td); } vn_lock(vp, locked | LK_RETRY); return (error); } int vop_stdallocate(struct vop_allocate_args *ap) { #ifdef __notyet__ struct statfs *sfs; off_t maxfilesize = 0; #endif struct iovec aiov; struct vattr vattr, *vap; struct uio auio; off_t fsize, len, cur, offset; uint8_t *buf; struct thread *td; struct vnode *vp; size_t iosize; int error; buf = NULL; error = 0; td = curthread; vap = &vattr; vp = ap->a_vp; len = *ap->a_len; offset = *ap->a_offset; error = VOP_GETATTR(vp, vap, td->td_ucred); if (error != 0) goto out; fsize = vap->va_size; iosize = vap->va_blocksize; if (iosize == 0) iosize = BLKDEV_IOSIZE; if (iosize > MAXPHYS) iosize = MAXPHYS; buf = malloc(iosize, M_TEMP, M_WAITOK); #ifdef __notyet__ /* * Check if the filesystem sets f_maxfilesize; if not use * VOP_SETATTR to perform the check. 
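 * The VOP_SETATTR() fallback grows the file to offset + len and then
 * restores the original size, relying on the filesystem to reject a
 * size beyond its limit.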
*/ sfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = VFS_STATFS(vp->v_mount, sfs, td); if (error == 0) maxfilesize = sfs->f_maxfilesize; free(sfs, M_STATFS); if (error != 0) goto out; if (maxfilesize) { if (offset > maxfilesize || len > maxfilesize || offset + len > maxfilesize) { error = EFBIG; goto out; } } else #endif if (offset + len > vap->va_size) { /* * Test offset + len against the filesystem's maxfilesize. */ VATTR_NULL(vap); vap->va_size = offset + len; error = VOP_SETATTR(vp, vap, td->td_ucred); if (error != 0) goto out; VATTR_NULL(vap); vap->va_size = fsize; error = VOP_SETATTR(vp, vap, td->td_ucred); if (error != 0) goto out; } for (;;) { /* * Read and write back anything below the nominal file * size. There's currently no way outside the filesystem * to know whether this area is sparse or not. */ cur = iosize; if ((offset % iosize) != 0) cur -= (offset % iosize); if (cur > len) cur = len; if (offset < fsize) { aiov.iov_base = buf; aiov.iov_len = cur; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; auio.uio_resid = cur; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; error = VOP_READ(vp, &auio, 0, td->td_ucred); if (error != 0) break; if (auio.uio_resid > 0) { bzero(buf + cur - auio.uio_resid, auio.uio_resid); } } else { bzero(buf, cur); } aiov.iov_base = buf; aiov.iov_len = cur; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; auio.uio_resid = cur; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; error = VOP_WRITE(vp, &auio, 0, td->td_ucred); if (error != 0) break; len -= cur; offset += cur; if (len == 0) break; if (should_yield()) break; } out: *ap->a_len = len; *ap->a_offset = offset; free(buf, M_TEMP); return (error); } int vop_stdadvise(struct vop_advise_args *ap) { struct vnode *vp; struct bufobj *bo; daddr_t startn, endn; off_t bstart, bend, start, end; int bsize, error; vp = ap->a_vp; switch (ap->a_advice) { case POSIX_FADV_WILLNEED: /* * Do nothing for now. Filesystems should provide a * custom method which starts an asynchronous read of * the requested region. */ error = 0; break; case POSIX_FADV_DONTNEED: error = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); break; } /* * Round to block boundaries (and later possibly further to * page boundaries). Applications cannot reasonably be aware * of the boundaries, and the rounding must be to expand at * both extremities to cover enough. It still doesn't cover * read-ahead. For partial blocks, this gives unnecessary * discarding of buffers but is efficient enough since the * pages usually remain in VMIO for some time. */ bsize = vp->v_bufobj.bo_bsize; bstart = rounddown(ap->a_start, bsize); bend = roundup(ap->a_end, bsize); /* * Deactivate pages in the specified range from the backing VM * object. Pages that are resident in the buffer cache will * remain wired until their corresponding buffers are released * below. 
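 * The clean and dirty buffer lists are then walked over the same block
 * range with bnoreuselist() to release those buffers.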
*/ if (vp->v_object != NULL) { start = trunc_page(bstart); end = round_page(bend); VM_OBJECT_RLOCK(vp->v_object); vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start), OFF_TO_IDX(end)); VM_OBJECT_RUNLOCK(vp->v_object); } bo = &vp->v_bufobj; BO_RLOCK(bo); startn = bstart / bsize; endn = bend / bsize; error = bnoreuselist(&bo->bo_clean, bo, startn, endn); if (error == 0) error = bnoreuselist(&bo->bo_dirty, bo, startn, endn); BO_RUNLOCK(bo); VOP_UNLOCK(vp, 0); break; default: error = EINVAL; break; } return (error); } int vop_stdunp_bind(struct vop_unp_bind_args *ap) { ap->a_vp->v_unpcb = ap->a_unpcb; return (0); } int vop_stdunp_connect(struct vop_unp_connect_args *ap) { *ap->a_unpcb = ap->a_vp->v_unpcb; return (0); } int vop_stdunp_detach(struct vop_unp_detach_args *ap) { ap->a_vp->v_unpcb = NULL; return (0); } static int vop_stdis_text(struct vop_is_text_args *ap) { return (ap->a_vp->v_writecount < 0); } int vop_stdset_text(struct vop_set_text_args *ap) { struct vnode *vp; struct mount *mp; int error; vp = ap->a_vp; VI_LOCK(vp); if (vp->v_writecount > 0) { error = ETXTBSY; } else { /* * If requested by fs, keep a use reference to the * vnode until the last text reference is released. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_kern_flag & MNTK_TEXT_REFS) != 0 && vp->v_writecount == 0) { vp->v_iflag |= VI_TEXT_REF; vrefl(vp); } vp->v_writecount--; error = 0; } VI_UNLOCK(vp); return (error); } static int vop_stdunset_text(struct vop_unset_text_args *ap) { struct vnode *vp; int error; bool last; vp = ap->a_vp; last = false; VI_LOCK(vp); if (vp->v_writecount < 0) { if ((vp->v_iflag & VI_TEXT_REF) != 0 && vp->v_writecount == -1) { last = true; vp->v_iflag &= ~VI_TEXT_REF; } vp->v_writecount++; error = 0; } else { error = EINVAL; } VI_UNLOCK(vp); if (last) vunref(vp); return (error); } static int vop_stdadd_writecount(struct vop_add_writecount_args *ap) { struct vnode *vp; int error; vp = ap->a_vp; VI_LOCK_FLAGS(vp, MTX_DUPOK); if (vp->v_writecount < 0) { error = ETXTBSY; } else { VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp, ("neg writecount increment %d", ap->a_inc)); vp->v_writecount += ap->a_inc; error = 0; } VI_UNLOCK(vp); return (error); } int vop_stdneed_inactive(struct vop_need_inactive_args *ap) { return (1); } static int vop_stdioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct vattr va; off_t *offp; int error; switch (ap->a_command) { case FIOSEEKDATA: case FIOSEEKHOLE: vp = ap->a_vp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (EBADF); if (vp->v_type == VREG) error = VOP_GETATTR(vp, &va, ap->a_cred); else error = ENOTTY; if (error == 0) { offp = ap->a_data; if (*offp < 0 || *offp >= va.va_size) error = ENXIO; else if (ap->a_command == FIOSEEKHOLE) *offp = va.va_size; } VOP_UNLOCK(vp, 0); break; default: error = ENOTTY; break; } return (error); } /* * vfs default ops * used to fill the vfs function table to get reasonable default return values. */ int vfs_stdroot (mp, flags, vpp) struct mount *mp; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdstatfs (mp, sbp) struct mount *mp; struct statfs *sbp; { return (EOPNOTSUPP); } int vfs_stdquotactl (mp, cmds, uid, arg) struct mount *mp; int cmds; uid_t uid; void *arg; { return (EOPNOTSUPP); } int vfs_stdsync(mp, waitfor) struct mount *mp; int waitfor; { struct vnode *vp, *mvp; struct thread *td; int error, lockreq, allerror = 0; td = curthread; lockreq = LK_EXCLUSIVE | LK_INTERLOCK; if (waitfor != MNT_WAIT) lockreq |= LK_NOWAIT; /* * Force stale buffer cache information to be flushed. 
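 * Each vnode with dirty buffers is vget()'ed and VOP_FSYNC()'ed; if
 * vget() fails with ENOENT the vnode list changed underneath us and the
 * scan restarts.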
*/ loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, lockreq, td)) != 0) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; vput(vp); } return (allerror); } int vfs_stdnosync (mp, waitfor) struct mount *mp; int waitfor; { return (0); } static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap) { int error; error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp, ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred, ap->a_outcred, ap->a_fsizetd); return (error); } int vfs_stdvget (mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdfhtovp (mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdinit (vfsp) struct vfsconf *vfsp; { return (0); } int vfs_stduninit (vfsp) struct vfsconf *vfsp; { return(0); } int vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname) struct mount *mp; int cmd; struct vnode *filename_vp; int attrnamespace; const char *attrname; { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0); return (EOPNOTSUPP); } int vfs_stdsysctl(mp, op, req) struct mount *mp; fsctlop_t op; struct sysctl_req *req; { return (EOPNOTSUPP); } static vop_bypass_t * bp_by_off(struct vop_vector *vop, struct vop_generic_args *a) { return (*(vop_bypass_t **)((char *)vop + a->a_desc->vdesc_vop_offset)); } int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a) { vop_bypass_t *bp; int prev_stops, rc; for (; vop != NULL; vop = vop->vop_default) { bp = bp_by_off(vop, a); if (bp != NULL) break; /* * Bypass is not really supported. It is done for * fallback to unimplemented vops in the default * vector. */ bp = vop->vop_bypass; if (bp != NULL) break; } MPASS(bp != NULL); prev_stops = sigdeferstop(SIGDEFERSTOP_SILENT); rc = bp(a); sigallowstop(prev_stops); return (rc); } Index: projects/clang900-import/sys/kern/vfs_vnops.c =================================================================== --- projects/clang900-import/sys/kern/vfs_vnops.c (revision 352536) +++ projects/clang900-import/sys/kern/vfs_vnops.c (revision 352537) @@ -1,3128 +1,3132 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2012 Konstantin Belousov * Copyright (c) 2013, 2014 The FreeBSD Foundation * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_rdwr_t vn_io_fault; static fo_truncate_t vn_truncate; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; static fo_stat_t vn_statfile; static fo_close_t vn_closefile; static fo_mmap_t vn_mmap; struct fileops vnops = { .fo_read = vn_io_fault, .fo_write = vn_io_fault, .fo_truncate = vn_truncate, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = vn_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; static const int io_hold_cnt = 16; static int vn_io_fault_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW, &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance"); static int vn_io_fault_prefault = 0; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW, &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting"); static u_long vn_io_faults_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD, &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers"); /* * Returns true if vn_io_fault mode of handling the i/o request should * be used. */ static bool do_vn_io_fault(struct vnode *vp, struct uio *uio) { struct mount *mp; return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG && (mp = vp->v_mount) != NULL && (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable); } /* * Structure used to pass arguments to vn_io_fault1(), to do either * file- or vnode-based I/O calls. 
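 * The kind field selects the active union member: VN_IO_FAULT_FOP uses
 * fop_args (a file and its fo_rdwr_t), VN_IO_FAULT_VOP uses vop_args
 * (a vnode).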
*/ struct vn_io_fault_args { enum { VN_IO_FAULT_FOP, VN_IO_FAULT_VOP } kind; struct ucred *cred; int flags; union { struct fop_args_tag { struct file *fp; fo_rdwr_t *doio; } fop_args; struct vop_args_tag { struct vnode *vp; } vop_args; } args; }; static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td); int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp) { struct thread *td = ndp->ni_cnd.cn_thread; return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp)); } /* * Common code for vnode open operations via a name lookup. * Lookup the vnode and invoke VOP_CREATE if needed. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. */ int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp) { struct vnode *vp; struct mount *mp; struct thread *td = ndp->ni_cnd.cn_thread; struct vattr vat; struct vattr *vap = &vat; int fmode, error; restart: fmode = *flagp; if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT | O_EXCL | O_DIRECTORY)) return (EINVAL); else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; /* * Set NOCACHE to avoid flushing the cache when * rolling in many files at once. */ ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; if ((fmode & O_BENEATH) != 0) ndp->ni_cnd.cn_flags |= BENEATH; if (!(vn_open_flags & VN_OPEN_NOAUDIT)) ndp->ni_cnd.cn_flags |= AUDITVNODE1; if (vn_open_flags & VN_OPEN_NOCAPCHECK) ndp->ni_cnd.cn_flags |= NOCAPCHECK; bwillwrite(); if ((error = namei(ndp)) != 0) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(ndp->ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) ndp->ni_cnd.cn_flags |= MAKEENTRY; #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, &ndp->ni_cnd, vap); if (error == 0) #endif error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); vput(ndp->ni_dvp); vn_finished_write(mp); if (error) { NDFREE(ndp, NDF_ONLY_PNBUF); return (error); } fmode &= ~O_TRUNC; vp = ndp->ni_vp; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = ISOPEN | ((fmode & O_NOFOLLOW) ? 
NOFOLLOW : FOLLOW) | LOCKLEAF; if (!(fmode & FWRITE)) ndp->ni_cnd.cn_flags |= LOCKSHARED; if ((fmode & O_BENEATH) != 0) ndp->ni_cnd.cn_flags |= BENEATH; if (!(vn_open_flags & VN_OPEN_NOAUDIT)) ndp->ni_cnd.cn_flags |= AUDITVNODE1; if (vn_open_flags & VN_OPEN_NOCAPCHECK) ndp->ni_cnd.cn_flags |= NOCAPCHECK; if ((error = namei(ndp)) != 0) return (error); vp = ndp->ni_vp; } error = vn_open_vnode(vp, fmode, cred, td, fp); if (error) goto bad; *flagp = fmode; return (0); bad: NDFREE(ndp, NDF_ONLY_PNBUF); vput(vp); *flagp = fmode; ndp->ni_vp = NULL; return (error); } static int vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp) { struct flock lf; int error, lock_flags, type; ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock"); if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0) return (0); KASSERT(fp != NULL, ("open with flock requires fp")); if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) return (EOPNOTSUPP); lock_flags = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type); if (error == 0) fp->f_flag |= FHASLOCK; vn_lock(vp, lock_flags | LK_RETRY); if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) error = ENOENT; return (error); } /* * Common code for vnode open operations once a vnode is located. * Check permissions, and call the VOP_OPEN routine. */ int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp) { accmode_t accmode; int error; if (vp->v_type == VLNK) return (EMLINK); if (vp->v_type == VSOCK) return (EOPNOTSUPP); if (vp->v_type != VDIR && fmode & O_DIRECTORY) return (ENOTDIR); accmode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) return (EISDIR); accmode |= VWRITE; } if (fmode & FREAD) accmode |= VREAD; if (fmode & FEXEC) accmode |= VEXEC; if ((fmode & O_APPEND) && (fmode & FWRITE)) accmode |= VAPPEND; #ifdef MAC if (fmode & O_CREAT) accmode |= VCREAT; if (fmode & O_VERIFY) accmode |= VVERIFY; error = mac_vnode_check_open(cred, vp, accmode); if (error) return (error); accmode &= ~(VCREAT | VVERIFY); #endif if ((fmode & O_CREAT) == 0 && accmode != 0) { error = VOP_ACCESS(vp, accmode, cred, td); if (error != 0) return (error); } if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); error = VOP_OPEN(vp, fmode, cred, td, fp); if (error != 0) return (error); error = vn_open_vnode_advlock(vp, fmode, fp); if (error == 0 && (fmode & FWRITE) != 0) { error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } } /* * Error from advlock or VOP_ADD_WRITECOUNT() still requires * calling VOP_CLOSE() to pair with earlier VOP_OPEN(). * Arrange for that by having fdrop() to use vn_closefile(). */ if (error != 0) { fp->f_flag |= FOPENFAILED; fp->f_vnode = vp; if (fp->f_ops == &badfileops) { fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; } vref(vp); } ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. * It is racy. */ int vn_writechk(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, "vn_writechk"); /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. 
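 * (As implemented below, this reduces to returning ETXTBSY whenever the
 * vnode is an active text image.)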
*/ if (VOP_IS_TEXT(vp)) return (ETXTBSY); return (0); } /* * Vnode close call */ static int vn_close1(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td, bool keep_ref) { struct mount *mp; int error, lock_flags; if (vp->v_type != VFIFO && (flags & FWRITE) == 0 && MNT_EXTENDED_SHARED(vp->v_mount)) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, lock_flags | LK_RETRY); AUDIT_ARG_VNODE1(vp); if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) { VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } error = VOP_CLOSE(vp, flags, file_cred, td); if (keep_ref) VOP_UNLOCK(vp, 0); else vput(vp); vn_finished_write(mp); return (error); } int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td) { return (vn_close1(vp, flags, file_cred, td, false)); } /* * Heuristic to detect sequential operation. */ static int sequential_heuristic(struct uio *uio, struct file *fp) { ASSERT_VOP_LOCKED(fp->f_vnode, __func__); if (fp->f_flag & FRDAHEAD) return (fp->f_seqcount << IO_SEQSHIFT); /* * Offset 0 is handled specially. open() sets f_seqcount to 1 so * that the first I/O is normally considered to be slightly * sequential. Seeking to offset 0 doesn't change sequentiality * unless previous seeks have reduced f_seqcount to 0, in which * case offset 0 is not special. */ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || uio->uio_offset == fp->f_nextoff) { /* * f_seqcount is in units of fixed-size blocks so that it * depends mainly on the amount of sequential I/O and not * much on the number of sequential I/O's. The fixed size * of 16384 is hard-coded here since it is (not quite) just * a magic size that works well here. This size is more * closely related to the best I/O size for real disks than * to any block size used by software. */ if (uio->uio_resid >= IO_SEQMAX * 16384) fp->f_seqcount = IO_SEQMAX; else { fp->f_seqcount += howmany(uio->uio_resid, 16384); if (fp->f_seqcount > IO_SEQMAX) fp->f_seqcount = IO_SEQMAX; } return (fp->f_seqcount << IO_SEQSHIFT); } /* Not sequential. Quickly draw-down sequentiality. */ if (fp->f_seqcount > 1) fp->f_seqcount = 1; else fp->f_seqcount = 0; return (0); } /* * Package up an I/O request on a vnode into a uio and do it. 
*/ int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; void *rl_cookie; struct vn_io_fault_args args; int error, lock_flags; if (offset < 0 && vp->v_type != VCHR) return (EINVAL); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; error = 0; if ((ioflg & IO_NODELOCKED) == 0) { if ((ioflg & IO_RANGELOCKED) == 0) { if (rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, offset, offset + len); } else { rl_cookie = vn_rangelock_wlock(vp, offset, offset + len); } } else rl_cookie = NULL; mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (MNT_SHARED_WRITES(mp) || ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; } else lock_flags = LK_SHARED; vn_lock(vp, lock_flags | LK_RETRY); } else rl_cookie = NULL; ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) error = mac_vnode_check_read(active_cred, file_cred, vp); else error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { if (file_cred != NULL) cred = file_cred; else cred = active_cred; if (do_vn_io_fault(vp, &auio)) { args.kind = VN_IO_FAULT_VOP; args.cred = cred; args.flags = ioflg; args.args.vop_args.vp = vp; error = vn_io_fault1(vp, &auio, &args, td); } else if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else /* if (rw == UIO_WRITE) */ { error = VOP_WRITE(vp, &auio, ioflg, cred); } } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp, 0); if (mp != NULL) vn_finished_write(mp); } out: if (rl_cookie != NULL) vn_rangelock_unlock(vp, rl_cookie); return (error); } /* * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call kern_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td) { int error = 0; ssize_t iaresid; do { int chunk; /* * Force `offset' to a multiple of MAXBSIZE except possibly * for the first chunk, so that filesystems only need to * write full blocks except possibly for the first and last * chunks. 
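The chunking arithmetic that follows forces every chunk after the first to start on a MAXBSIZE boundary, so only the first and last chunks can be partial blocks. A standalone sketch of the same calculation, assuming a 64 KiB MAXBSIZE purely for illustration (the real value comes from <sys/param.h>):

/* Illustrative only: chunk layout used by vn_rdwr_inchunks(). */
#include <stdio.h>

#define MAXBSIZE	65536UL		/* assumption for the example */

int
main(void)
{
	unsigned long offset = 1000;	/* hypothetical starting offset */
	unsigned long len = 200000;	/* hypothetical transfer length */

	while (len > 0) {
		unsigned long chunk = MAXBSIZE - offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		printf("write %7lu bytes at offset %7lu\n", chunk, offset);
		offset += chunk;
		len -= chunk;
	}
	return (0);
}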
*/ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; if (chunk > len) chunk = len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); iaresid = 0; error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, active_cred, file_cred, &iaresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base = (char *)base + chunk; kern_yield(PRI_USER); } while (len); if (aresid) *aresid = len + iaresid; return (error); } off_t foffset_lock(struct file *fp, int flags) { struct mtx *mtxp; off_t res; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); #if OFF_MAX <= LONG_MAX /* * Caller only wants the current f_offset value. Assume that * the long and shorter integer types reads are atomic. */ if ((flags & FOF_NOLOCK) != 0) return (fp->f_offset); #endif /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOLOCK) == 0) { while (fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; msleep(&fp->f_vnread_flags, mtxp, PUSER -1, "vofflock", 0); } fp->f_vnread_flags |= FOFFSET_LOCKED; } res = fp->f_offset; mtx_unlock(mtxp); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { struct mtx *mtxp; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); #if OFF_MAX <= LONG_MAX if ((flags & FOF_NOLOCK) != 0) { if ((flags & FOF_NOUPDATE) == 0) fp->f_offset = val; if ((flags & FOF_NEXTOFF) != 0) fp->f_nextoff = val; return; } #endif mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOUPDATE) == 0) fp->f_offset = val; if ((flags & FOF_NEXTOFF) != 0) fp->f_nextoff = val; if ((flags & FOF_NOLOCK) == 0) { KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, ("Lost FOFFSET_LOCKED")); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; } mtx_unlock(mtxp); } void foffset_lock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) uio->uio_offset = foffset_lock(fp, flags); } void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) foffset_unlock(fp, uio->uio_offset, flags); } static int get_advice(struct file *fp, struct uio *uio) { struct mtx *mtxp; int ret; ret = POSIX_FADV_NORMAL; if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG) return (ret); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if (fp->f_advice != NULL && uio->uio_offset >= fp->f_advice->fa_start && uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) ret = fp->f_advice->fa_advice; mtx_unlock(mtxp); return (ret); } /* * File table vnode read routine. */ static int vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct vnode *vp; off_t orig_offset; int error, ioflag; int advice; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; advice = get_advice(fp, uio); vn_lock(vp, LK_SHARED | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* Disable read-ahead for random I/O. 
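The advice consulted by vn_read() here is whatever posix_fadvise(2) recorded for the byte range on the struct file: POSIX_FADV_RANDOM suppresses the sequential read-ahead heuristic, while POSIX_FADV_NOREUSE additionally triggers the VOP_ADVISE(POSIX_FADV_DONTNEED) call after the read completes. A hedged userland sketch, illustrative only and not part of this change; the file name is hypothetical:

/* Illustrative only: how the advice checked above is set from userland. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char buf[4096];
	int error;
	int fd = open("/var/log/messages", O_RDONLY);	/* hypothetical file */

	if (fd < 0)
		return (1);
	/* Record advice for the whole file; vn_read() then skips the
	 * read-ahead heuristic for reads that fall inside this range. */
	error = posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
	if (error != 0)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(error));
	(void)read(fd, buf, sizeof(buf));
	close(fd);
	return (0);
}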
*/ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_READ(vp, uio, ioflag, fp->f_cred); fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * read(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); return (error); } /* * File table vnode write routine. */ static int vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct vnode *vp; struct mount *mp; off_t orig_offset; int error, ioflag, lock_flags; int advice; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; if (vp->v_type == VREG) bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; mp = NULL; if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto unlock; advice = get_advice(fp, uio); if (MNT_SHARED_WRITES(mp) || (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) { lock_flags = LK_SHARED; } else { lock_flags = LK_EXCLUSIVE; } vn_lock(vp, lock_flags | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* XXX: Is this correct? */ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); if (vp->v_type != VCHR) vn_finished_write(mp); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * write(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); unlock: return (error); } /* * The vn_io_fault() is a wrapper around vn_read() and vn_write() to * prevent the following deadlock: * * Assume that the thread A reads from the vnode vp1 into userspace * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is * currently not resident, then system ends up with the call chain * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) * which establishes lock order vp1->vn_lock, then vp2->vn_lock. * If, at the same time, thread B reads from vnode vp2 into buffer buf2 * backed by the pages of vnode vp1, and some page in buf2 is not * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. * * To prevent the lock order reversal and deadlock, vn_io_fault() does * not allow page faults to happen during VOP_READ() or VOP_WRITE(). * Instead, it first tries to do the whole range i/o with pagefaults * disabled. If all pages in the i/o buffer are resident and mapped, * VOP will succeed (ignoring the genuine filesystem errors). 
* Otherwise, we get back EFAULT, and vn_io_fault() falls back to do * i/o in chunks, with all pages in the chunk prefaulted and held * using vm_fault_quick_hold_pages(). * * Filesystems using this deadlock avoidance scheme should use the * array of the held pages from uio, saved in the curthread->td_ma, * instead of doing uiomove(). A helper function * vn_io_fault_uiomove() converts uiomove request into * uiomove_fromphys() over td_ma array. * * Since vnode locks do not cover the whole i/o anymore, rangelocks * make the current i/o request atomic with respect to other i/os and * truncations. */ /* * Decode vn_io_fault_args and perform the corresponding i/o. */ static int vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, struct thread *td) { int error, save; error = 0; save = vm_fault_disable_pagefaults(); switch (args->kind) { case VN_IO_FAULT_FOP: error = (args->args.fop_args.doio)(args->args.fop_args.fp, uio, args->cred, args->flags, td); break; case VN_IO_FAULT_VOP: if (uio->uio_rw == UIO_READ) { error = VOP_READ(args->args.vop_args.vp, uio, args->flags, args->cred); } else if (uio->uio_rw == UIO_WRITE) { error = VOP_WRITE(args->args.vop_args.vp, uio, args->flags, args->cred); } break; default: panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind, uio->uio_rw); } vm_fault_enable_pagefaults(save); return (error); } static int vn_io_fault_touch(char *base, const struct uio *uio) { int r; r = fubyte(base); if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1)) return (EFAULT); return (0); } static int vn_io_fault_prefault_user(const struct uio *uio) { char *base; const struct iovec *iov; size_t len; ssize_t resid; int error, i; KASSERT(uio->uio_segflg == UIO_USERSPACE, ("vn_io_fault_prefault userspace")); error = i = 0; iov = uio->uio_iov; resid = uio->uio_resid; base = iov->iov_base; len = iov->iov_len; while (resid > 0) { error = vn_io_fault_touch(base, uio); if (error != 0) break; if (len < PAGE_SIZE) { if (len != 0) { error = vn_io_fault_touch(base + len - 1, uio); if (error != 0) break; resid -= len; } if (++i >= uio->uio_iovcnt) break; iov = uio->uio_iov + i; base = iov->iov_base; len = iov->iov_len; } else { len -= PAGE_SIZE; base += PAGE_SIZE; resid -= PAGE_SIZE; } } return (error); } /* * Common code for vn_io_fault(), agnostic to the kind of i/o request. * Uses vn_io_fault_doio() to make the call to an actual i/o function. * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request * into args and call vn_io_fault1() to handle faults during the user * mode buffer accesses. */ static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td) { vm_page_t ma[io_hold_cnt + 2]; struct uio *uio_clone, short_uio; struct iovec short_iovec[1]; vm_page_t *prev_td_ma; vm_prot_t prot; vm_offset_t addr, end; size_t len, resid; ssize_t adv; int error, cnt, saveheld, prev_td_ma_cnt; if (vn_io_fault_prefault) { error = vn_io_fault_prefault_user(uio); if (error != 0) return (error); /* Or ignore ? */ } prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; /* * The UFS follows IO_UNIT directive and replays back both * uio_offset and uio_resid if an error is encountered during the * operation. But, since the iovec may be already advanced, * uio is still in an inconsistent state. * * Cache a copy of the original uio, which is advanced to the redo * point using UIO_NOCOPY below. 
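One concrete way to produce the access pattern described above from userland is to read(2) into a buffer that is itself a not-yet-resident file mapping, so the copyout inside VOP_READ() faults on a second vnode. A minimal single-threaded sketch, illustrative only and not part of this change; the file names are hypothetical and it completes normally precisely because vn_io_fault() pre-holds the destination pages:

/* Illustrative only: a read(2) whose destination is an unfaulted mapping
 * of a second file, the situation vn_io_fault() is built to handle. */
#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Hypothetical file names; both files are assumed to be >= 1 MiB. */
	int fd1 = open("/tmp/file1", O_RDONLY);
	int fd2 = open("/tmp/file2", O_RDWR);
	size_t len = 1024 * 1024;
	char *buf;
	ssize_t n;

	if (fd1 < 0 || fd2 < 0)
		return (1);
	/* buf is backed by the pages of fd2's vnode and is not resident yet. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd2, 0);
	if (buf == MAP_FAILED)
		return (1);
	/* The copyout inside VOP_READ() of fd1's vnode faults on fd2's pages;
	 * vn_io_fault() holds them up front instead of faulting under the lock. */
	n = read(fd1, buf, len);
	printf("read %zd bytes\n", n);
	return (0);
}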
*/ uio_clone = cloneuio(uio); resid = uio->uio_resid; short_uio.uio_segflg = UIO_USERSPACE; short_uio.uio_rw = uio->uio_rw; short_uio.uio_td = uio->uio_td; error = vn_io_fault_doio(args, uio, td); if (error != EFAULT) goto out; atomic_add_long(&vn_io_faults_cnt, 1); uio_clone->uio_segflg = UIO_NOCOPY; uiomove(NULL, resid - uio->uio_resid, uio_clone); uio_clone->uio_segflg = uio->uio_segflg; saveheld = curthread_pflags_set(TDP_UIOHELD); prev_td_ma = td->td_ma; prev_td_ma_cnt = td->td_ma_cnt; while (uio_clone->uio_resid != 0) { len = uio_clone->uio_iov->iov_len; if (len == 0) { KASSERT(uio_clone->uio_iovcnt >= 1, ("iovcnt underflow")); uio_clone->uio_iov++; uio_clone->uio_iovcnt--; continue; } if (len > io_hold_cnt * PAGE_SIZE) len = io_hold_cnt * PAGE_SIZE; addr = (uintptr_t)uio_clone->uio_iov->iov_base; end = round_page(addr + len); if (end < addr) { error = EFAULT; break; } cnt = atop(end - trunc_page(addr)); /* * A perfectly misaligned address and length could cause * both the start and the end of the chunk to use partial * page. +2 accounts for such a situation. */ cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, addr, len, prot, ma, io_hold_cnt + 2); if (cnt == -1) { error = EFAULT; break; } short_uio.uio_iov = &short_iovec[0]; short_iovec[0].iov_base = (void *)addr; short_uio.uio_iovcnt = 1; short_uio.uio_resid = short_iovec[0].iov_len = len; short_uio.uio_offset = uio_clone->uio_offset; td->td_ma = ma; td->td_ma_cnt = cnt; error = vn_io_fault_doio(args, &short_uio, td); vm_page_unhold_pages(ma, cnt); adv = len - short_uio.uio_resid; uio_clone->uio_iov->iov_base = (char *)uio_clone->uio_iov->iov_base + adv; uio_clone->uio_iov->iov_len -= adv; uio_clone->uio_resid -= adv; uio_clone->uio_offset += adv; uio->uio_resid -= adv; uio->uio_offset += adv; if (error != 0 || adv == 0) break; } td->td_ma = prev_td_ma; td->td_ma_cnt = prev_td_ma_cnt; curthread_pflags_restore(saveheld); out: free(uio_clone, M_IOV); return (error); } static int vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { fo_rdwr_t *doio; struct vnode *vp; void *rl_cookie; struct vn_io_fault_args args; int error; doio = uio->uio_rw == UIO_READ ? vn_read : vn_write; vp = fp->f_vnode; foffset_lock_uio(fp, uio, flags); if (do_vn_io_fault(vp, uio)) { args.kind = VN_IO_FAULT_FOP; args.args.fop_args.fp = fp; args.args.fop_args.doio = doio; args.cred = active_cred; args.flags = flags | FOF_OFFSET; if (uio->uio_rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } else if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0) { /* For appenders, punt and lock the whole range. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } error = vn_io_fault1(vp, uio, &args, td); vn_rangelock_unlock(vp, rl_cookie); } else { error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td); } foffset_unlock_uio(fp, uio, flags); return (error); } /* * Helper function to perform the requested uiomove operation using * the held pages for io->uio_iov[0].iov_base buffer instead of * copyin/copyout. Access to the pages with uiomove_fromphys() * instead of iov_base prevents page faults that could occur due to * pmap_collect() invalidating the mapping created by * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or * object cleanup revoking the write access from page mappings. 
* * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() * instead of plain uiomove(). */ int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) { struct uio transp_uio; struct iovec transp_iov[1]; struct thread *td; size_t adv; int error, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove(data, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); transp_iov[0].iov_base = data; transp_uio.uio_iov = &transp_iov[0]; transp_uio.uio_iovcnt = 1; if (xfersize > uio->uio_resid) xfersize = uio->uio_resid; transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; transp_uio.uio_offset = 0; transp_uio.uio_segflg = UIO_SYSSPACE; /* * Since transp_iov points to data, and td_ma page array * corresponds to original uio->uio_iov, we need to invert the * direction of the i/o operation as passed to * uiomove_fromphys(). */ switch (uio->uio_rw) { case UIO_WRITE: transp_uio.uio_rw = UIO_READ; break; case UIO_READ: transp_uio.uio_rw = UIO_WRITE; break; } transp_uio.uio_td = uio->uio_td; error = uiomove_fromphys(td->td_ma, ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, xfersize, &transp_uio); adv = xfersize - transp_uio.uio_resid; pgadv = (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; uio->uio_iov->iov_len -= adv; uio->uio_resid -= adv; uio->uio_offset += adv; return (error); } int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio) { struct thread *td; vm_offset_t iov_base; int cnt, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove_fromphys(ma, offset, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize; iov_base = (vm_offset_t)uio->uio_iov->iov_base; switch (uio->uio_rw) { case UIO_WRITE: pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, offset, cnt); break; case UIO_READ: pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, cnt); break; } pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)(iov_base + cnt); uio->uio_iov->iov_len -= cnt; uio->uio_resid -= cnt; uio->uio_offset += cnt; return (0); } /* * File table truncate routine. */ static int vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct mount *mp; struct vnode *vp; void *rl_cookie; int error; vp = fp->f_vnode; /* * Lock the whole range for truncation. Otherwise split i/o * might happen partly before and partly after the truncation. 
*/ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error) goto out; #endif error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0, fp->f_cred); out: VOP_UNLOCK(vp, 0); vn_finished_write(mp); out1: vn_rangelock_unlock(vp, rl_cookie); return (error); } /* * Truncate a file that is already locked. */ int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, struct ucred *cred) { struct vattr vattr; int error; error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { VATTR_NULL(&vattr); vattr.va_size = length; if (sync) vattr.va_vaflags |= VA_SYNC; error = VOP_SETATTR(vp, &vattr, cred); VOP_ADD_WRITECOUNT_CHECKED(vp, -1); } return (error); } /* * File table vnode stat routine. */ static int vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct vnode *vp = fp->f_vnode; int error; vn_lock(vp, LK_SHARED | LK_RETRY); error = vn_stat(vp, sb, active_cred, fp->f_cred, td); VOP_UNLOCK(vp, 0); return (error); } /* * Stat a vnode; implementation for the stat syscall */ int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td) { struct vattr vattr; struct vattr *vap; int error; u_short mode; AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_stat(active_cred, file_cred, vp); if (error) return (error); #endif vap = &vattr; /* * Initialize defaults for new and unusual fields, so that file * systems which don't support these fields don't need to know * about them. */ vap->va_birthtime.tv_sec = -1; vap->va_birthtime.tv_nsec = 0; vap->va_fsid = VNOVAL; vap->va_rdev = NODEV; error = VOP_GETATTR(vp, vap, active_cred); if (error) return (error); /* * Zero the spare stat fields */ bzero(sb, sizeof *sb); /* * Copy from vattr table */ if (vap->va_fsid != VNOVAL) sb->st_dev = vap->va_fsid; else sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vap->va_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return (EBADF); } sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; if (vap->va_size > OFF_MAX) return (EOVERFLOW); sb->st_size = vap->va_size; sb->st_atim = vap->va_atime; sb->st_mtim = vap->va_mtime; sb->st_ctim = vap->va_ctime; sb->st_birthtim = vap->va_birthtime; /* * According to www.opengroup.org, the meaning of st_blksize is * "a filesystem-specific preferred I/O block size for this * object. In some filesystem types, this may vary from file * to file" * Use miminum/default of PAGE_SIZE (e.g. for VCHR). */ sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize); sb->st_flags = vap->va_flags; if (priv_check(td, PRIV_VFS_GENERATION)) sb->st_gen = 0; else sb->st_gen = vap->va_gen; sb->st_blocks = vap->va_bytes / S_BLKSIZE; return (0); } /* * File table vnode ioctl routine. 
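vn_ioctl(), whose body follows, implements FIONREAD for regular files and directories by subtracting the current file offset from va_size, answers FIONBIO/FIOASYNC trivially, and hands everything else to VOP_IOCTL(). A small userland sketch of the FIONREAD case, illustrative only; the file name is hypothetical:

/* Illustrative only: FIONREAD on a vnode, as computed by vn_ioctl() below. */
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[100];
	int fd = open("/etc/services", O_RDONLY);	/* hypothetical file */
	int nread;

	if (fd < 0)
		return (1);
	(void)read(fd, buf, sizeof(buf));		/* advance f_offset */
	if (ioctl(fd, FIONREAD, &nread) == 0)
		printf("%d bytes between the current offset and EOF\n", nread);
	close(fd);
	return (0);
}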
*/ static int vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { struct vattr vattr; struct vnode *vp; struct fiobmap2_arg *bmarg; int error; vp = fp->f_vnode; switch (vp->v_type) { case VDIR: case VREG: switch (com) { case FIONREAD: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, active_cred); VOP_UNLOCK(vp, 0); if (error == 0) *(int *)data = vattr.va_size - fp->f_offset; return (error); case FIOBMAP2: bmarg = (struct fiobmap2_arg *)data; vn_lock(vp, LK_SHARED | LK_RETRY); #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_BMAP(vp, bmarg->bn, NULL, &bmarg->bn, &bmarg->runp, &bmarg->runb); VOP_UNLOCK(vp, 0); return (error); case FIONBIO: case FIOASYNC: return (0); default: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); } break; case VCHR: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); default: return (ENOTTY); } } /* * File table vnode poll routine. */ static int vn_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct vnode *vp; int error; vp = fp->f_vnode; #ifdef MAC vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); VOP_UNLOCK(vp, 0); if (!error) #endif error = VOP_POLL(vp, events, fp->f_cred, td); return (error); } /* * Acquire the requested lock and then check for validity. LK_RETRY * permits vn_lock to return doomed vnodes. */ int _vn_lock(struct vnode *vp, int flags, char *file, int line) { int error; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vn_lock: no locktype")); VNASSERT(vp->v_holdcnt != 0, vp, ("vn_lock: zero hold count")); retry: error = VOP_LOCK1(vp, flags, file, line); flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */ KASSERT((flags & LK_RETRY) == 0 || error == 0, ("vn_lock: error %d incompatible with flags %#x", error, flags)); if ((flags & LK_RETRY) == 0) { if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) { VOP_UNLOCK(vp, 0); error = ENOENT; } } else if (error != 0) goto retry; return (error); } /* * File table vnode close routine. */ static int vn_closefile(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; int error; bool ref; vp = fp->f_vnode; fp->f_ops = &badfileops; ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE; error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); if (__predict_false(ref)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); vrele(vp); } return (error); } static bool vn_suspendable(struct mount *mp) { return (mp->mnt_op->vfs_susp_clean != NULL); } /* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. */ static int vn_start_write_refed(struct mount *mp, int flags, bool mplocked) { int error, mflags; if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 && vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); vfs_mp_count_add_pcpu(mp, writeopcount, 1); vfs_op_thread_exit(mp); return (0); } if (mplocked) mtx_assert(MNT_MTX(mp), MA_OWNED); else MNT_ILOCK(mp); error = 0; /* * Check on status of suspension. */ if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || mp->mnt_susp_owner != curthread) { mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? 
(flags & PCATCH) : 0) | (PUSER - 1); while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { if (flags & V_NOWAIT) { error = EWOULDBLOCK; goto unlock; } error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); if (error) goto unlock; } } if (flags & V_XSLEEP) goto unlock; mp->mnt_writeopcount++; unlock: if (error != 0 || (flags & V_XSLEEP) != 0) MNT_REL(mp); MNT_IUNLOCK(mp); return (error); } int vn_start_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), ("V_MNTREF requires mp")); error = 0; /* * If a vnode is provided, get and return the mount point that * to which it will write. */ if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } if ((mp = *mpp) == NULL) return (0); if (!vn_suspendable(mp)) { if (vp != NULL || (flags & V_MNTREF) != 0) vfs_rel(mp); return (0); } /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ if (vp == NULL && (flags & V_MNTREF) == 0) vfs_ref(mp); return (vn_start_write_refed(mp, flags, false)); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), ("V_MNTREF requires mp")); retry: if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if ((mp = *mpp) == NULL) return (0); if (!vn_suspendable(mp)) { if (vp != NULL || (flags & V_MNTREF) != 0) vfs_rel(mp); return (0); } /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ MNT_ILOCK(mp); if (vp == NULL && (flags & V_MNTREF) == 0) MNT_REF(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { mp->mnt_secondary_writes++; mp->mnt_secondary_accwrites++; MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP | ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0), "suspfs", 0); vfs_rel(mp); if (error == 0) goto retry; return (error); } /* * Filesystem write operation has completed. If we are suspending and this * operation is the last one, notify the suspender that the suspension is * now in effect. 
*/ void vn_finished_write(struct mount *mp) { int c; if (mp == NULL || !vn_suspendable(mp)) return; if (vfs_op_thread_enter(mp)) { vfs_mp_count_sub_pcpu(mp, writeopcount, 1); vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_writeopcount; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); } /* * Filesystem secondary write operation has completed. If we are * suspending and this operation is the last one, notify the suspender * that the suspension is now in effect. */ void vn_finished_secondary_write(struct mount *mp) { if (mp == NULL || !vn_suspendable(mp)) return; MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_secondary_writes--; if (mp->mnt_secondary_writes < 0) panic("vn_finished_secondary_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_secondary_writes <= 0) wakeup(&mp->mnt_secondary_writes); MNT_IUNLOCK(mp); } /* * Request a filesystem to suspend write operations. */ int vfs_write_suspend(struct mount *mp, int flags) { int error; MPASS(vn_suspendable(mp)); vfs_op_enter(mp); MNT_ILOCK(mp); vfs_assert_mount_counters(mp); if (mp->mnt_susp_owner == curthread) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); return (EALREADY); } while (mp->mnt_kern_flag & MNTK_SUSPEND) msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); /* * Unmount holds a write reference on the mount point. If we * own busy reference and drain for writers, we deadlock with * the reference draining in the unmount path. Callers of * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if * vfs_busy() reference is owned and caller is not in the * unmount context. */ if ((flags & VS_SKIP_UNMOUNT) != 0 && (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = curthread; if (mp->mnt_writeopcount > 0) (void) msleep(&mp->mnt_writeopcount, MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); else MNT_IUNLOCK(mp); if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) { vfs_write_resume(mp, 0); vfs_op_exit(mp); } return (error); } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(struct mount *mp, int flags) { MPASS(vn_suspendable(mp)); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED); mp->mnt_susp_owner = NULL; wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); curthread->td_pflags &= ~TDP_IGNSUSP; if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); mp->mnt_writeopcount++; } MNT_IUNLOCK(mp); if ((flags & VR_NO_SUSPCLR) == 0) VFS_SUSP_CLEAN(mp); vfs_op_exit(mp); } else if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); vn_start_write_refed(mp, 0, true); } else { MNT_IUNLOCK(mp); } } /* * Helper loop around vfs_write_suspend() for filesystem unmount VFS * methods. */ int vfs_write_suspend_umnt(struct mount *mp) { int error; MPASS(vn_suspendable(mp)); KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, ("vfs_write_suspend_umnt: recursed")); /* dounmount() already called vn_start_write(). 
*/ for (;;) { vn_finished_write(mp); error = vfs_write_suspend(mp, 0); if (error != 0) { vn_start_write(NULL, &mp, V_WAIT); return (error); } MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) break; MNT_IUNLOCK(mp); vn_start_write(NULL, &mp, V_WAIT); } mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); curthread->td_pflags |= TDP_IGNSUSP; return (0); } /* * Implement kqueues for files by translating it to vnode operation. */ static int vn_kqfilter(struct file *fp, struct knote *kn) { return (VOP_KQFILTER(fp->f_vnode, kn)); } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp, 0); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? */ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute removal as kernel */ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0); } return (error); } static int vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); } int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) { return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, lkflags, rvp)); } int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp) { struct mount *mp; int ltype, error; ASSERT_VOP_LOCKED(vp, 
"vn_vget_ino_get"); mp = vp->v_mount; ltype = VOP_ISLOCKED(vp); KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, ("vn_vget_ino: vp not locked")); error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp, 0); error = vfs_busy(mp, 0); vn_lock(vp, ltype | LK_RETRY); vfs_rel(mp); if (error != 0) return (ENOENT); if (vp->v_iflag & VI_DOOMED) { vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp, 0); error = alloc(mp, alloc_arg, lkflags, rvp); vfs_unbusy(mp); if (error != 0 || *rvp != vp) vn_lock(vp, ltype | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { if (error == 0) { if (*rvp == vp) vunref(vp); else vput(*rvp); } error = ENOENT; } return (error); } int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, struct thread *td) { if (vp->v_type != VREG || td == NULL) return (0); if ((uoff_t)uio->uio_offset + uio->uio_resid > lim_cur(td, RLIMIT_FSIZE)) { PROC_LOCK(td->td_proc); kern_psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } return (0); } int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp, 0); #endif return (setfmode(td, active_cred, vp, mode)); } int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp, 0); #endif return (setfown(td, active_cred, vp, uid, gid)); } void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) { vm_object_t object; if ((object = vp->v_object) == NULL) return; VM_OBJECT_WLOCK(object); vm_object_page_remove(object, start, end, 0); VM_OBJECT_WUNLOCK(object); } int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) { struct vattr va; daddr_t bn, bnp; uint64_t bsize; off_t noff; int error; KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, ("Wrong command %lu", cmd)); if (vn_lock(vp, LK_SHARED) != 0) return (EBADF); if (vp->v_type != VREG) { error = ENOTTY; goto unlock; } error = VOP_GETATTR(vp, &va, cred); if (error != 0) goto unlock; noff = *off; if (noff >= va.va_size) { error = ENXIO; goto unlock; } bsize = vp->v_mount->mnt_stat.f_iosize; for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize - noff % bsize) { error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); if (error == EOPNOTSUPP) { error = ENOTTY; goto unlock; } if ((bnp == -1 && cmd == FIOSEEKHOLE) || (bnp != -1 && cmd == FIOSEEKDATA)) { noff = bn * bsize; if (noff < *off) noff = *off; goto unlock; } } if (noff > va.va_size) noff = va.va_size; /* noff == va.va_size. There is an implicit hole at the end of file. 
*/ if (cmd == FIOSEEKDATA) error = ENXIO; unlock: VOP_UNLOCK(vp, 0); if (error == 0) *off = noff; return (error); } int vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct ucred *cred; struct vnode *vp; struct vattr vattr; off_t foffset, size; int error, noneg; cred = td->td_ucred; vp = fp->f_vnode; foffset = foffset_lock(fp, 0); noneg = (vp->v_type != VCHR); error = 0; switch (whence) { case L_INCR: if (noneg && (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); if (error) break; /* * If the file references a disk device, then fetch * the media size and use that to determine the ending * offset. */ if (vattr.va_size == 0 && vp->v_type == VCHR && fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) vattr.va_size = size; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; case SEEK_DATA: error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); if (error == ENOTTY) error = EINVAL; break; case SEEK_HOLE: error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); if (error == ENOTTY) error = EINVAL; break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; VFS_KNOTE_UNLOCKED(vp, 0); td->td_uretoff.tdu_off = offset; drop: foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { int error; /* * Grant permission if the caller is the owner of the file, or * the super-user, or has ACL_WRITE_ATTRIBUTES permission on * on the file. If the time pointer is null, then write * permission on the file is also sufficient. * * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES * will be allowed to set the times [..] to the current * server time. */ error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) error = VOP_ACCESS(vp, VWRITE, cred, td); return (error); } int vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct vnode *vp; int error; if (fp->f_type == DTYPE_FIFO) kif->kf_type = KF_TYPE_FIFO; else kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); FILEDESC_SLOCK(fdp); return (error); } static inline void vn_fill_junk(struct kinfo_file *kif) { size_t len, olen; /* * Simulate vn_fullpath returning changing values for a given * vp during e.g. coredump. */ len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; olen = strlen(kif->kf_path); if (len < olen) strcpy(&kif->kf_path[len - 1], "$"); else for (; olen < len; olen++) strcpy(&kif->kf_path[olen], "A"); } int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) { struct vattr va; char *fullpath, *freepath; int error; kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); freepath = NULL; fullpath = "-"; error = vn_fullpath(curthread, vp, &fullpath, &freepath); if (error == 0) { strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); } if (freepath != NULL) free(freepath, M_TEMP); KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, vn_fill_junk(kif); ); /* * Retrieve vnode attributes. 
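vn_bmap_seekhole() and the SEEK_DATA/SEEK_HOLE cases of vn_seek() above are what give lseek(2) its hole-probing behavior, including the implicit hole at end of file. A small userland sketch, illustrative only and not part of this change; the file name is hypothetical:

/* Illustrative only: SEEK_HOLE/SEEK_DATA as implemented by vn_bmap_seekhole(). */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tmp/sparse.example", O_RDONLY);	/* hypothetical sparse file */
	off_t data, hole;

	if (fd < 0)
		return (1);
	data = lseek(fd, 0, SEEK_DATA);	/* ENXIO if no data at or after offset 0 */
	hole = lseek(fd, 0, SEEK_HOLE);	/* EOF counts as an implicit hole */
	printf("first data at %jd, first hole at %jd\n",
	    (intmax_t)data, (intmax_t)hole);
	close(fd);
	return (0);
}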
*/ va.va_fsid = VNOVAL; va.va_rdev = NODEV; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, curthread->td_ucred); VOP_UNLOCK(vp, 0); if (error != 0) return (error); if (va.va_fsid != VNOVAL) kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; else kif->kf_un.kf_file.kf_file_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; kif->kf_un.kf_file.kf_file_fsid_freebsd11 = kif->kf_un.kf_file.kf_file_fsid; /* truncate */ kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); kif->kf_un.kf_file.kf_file_size = va.va_size; kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; kif->kf_un.kf_file.kf_file_rdev_freebsd11 = kif->kf_un.kf_file.kf_file_rdev; /* truncate */ return (0); } int vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { #ifdef HWPMC_HOOKS struct pmckern_map_in pkm; #endif struct mount *mp; struct vnode *vp; vm_object_t object; vm_prot_t maxprot; boolean_t writecounted; int error; #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if ((fp->f_flag & FPOSIXSHM) != 0) flags |= MAP_NOSYNC; #endif vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } else { maxprot |= VM_PROT_WRITE; cap_maxprot |= VM_PROT_WRITE; } maxprot &= cap_maxprot; /* * For regular files and shared memory, POSIX requires that * the value of foff be a legitimate offset within the data * object. In particular, negative offsets are invalid. * Blocking negative offsets and overflows here avoids * possible wraparound or user-level access into reserved * ranges of the data object later. In contrast, POSIX does * not dictate how offsets are used by device drivers, so in * the case of a device mapping a negative offset is passed * on. */ if ( #ifdef _LP64 size > OFF_MAX || #endif foff < 0 || foff > OFF_MAX - size) return (EINVAL); writecounted = FALSE; error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, &foff, &object, &writecounted); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, writecounted, td); if (error != 0) { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. 
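The maxprot computation in vn_mmap() above is what makes a MAP_SHARED, PROT_WRITE mapping of a read-only descriptor fail with EACCES, while the same request with MAP_PRIVATE is allowed because writes go to anonymous copy-on-write pages. A hedged userland sketch, illustrative only; the file name is hypothetical:

/* Illustrative only: the EACCES case enforced by vn_mmap()'s maxprot logic. */
#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/etc/motd", O_RDONLY);	/* hypothetical read-only open */
	void *p;

	if (fd < 0)
		return (1);
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		printf("MAP_SHARED rw on O_RDONLY fd: %s\n", strerror(errno)); /* EACCES */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p != MAP_FAILED)
		printf("MAP_PRIVATE rw on O_RDONLY fd: allowed (copy-on-write)\n");
	close(fd);
	return (0);
}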
*/ if (writecounted) vm_pager_release_writecount(object, 0, size); vm_object_deallocate(object); } #ifdef HWPMC_HOOKS /* Inform hwpmc(4) if an executable is being mapped. */ if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { pkm.pm_file = vp; pkm.pm_address = (uintptr_t) *addr; PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); } } #endif return (error); } void vn_fsid(struct vnode *vp, struct vattr *va) { fsid_t *f; f = &vp->v_mount->mnt_stat.f_fsid; va->va_fsid = (uint32_t)f->val[1]; va->va_fsid <<= sizeof(f->val[1]) * NBBY; va->va_fsid += (uint32_t)f->val[0]; } int vn_fsync_buf(struct vnode *vp, int waitfor) { struct buf *bp, *nbp; struct bufobj *bo; struct mount *mp; int error, maxretry; error = 0; maxretry = 10000; /* large, arbitrarily chosen */ mp = NULL; if (vp->v_type == VCHR) { VI_LOCK(vp); mp = vp->v_rdev->si_mountpt; VI_UNLOCK(vp); } bo = &vp->v_bufobj; BO_LOCK(bo); loop1: /* * MARK/SCAN initialization to avoid infinite loops. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { bp->b_vflags &= ~BV_SCANNED; bp->b_error = 0; } /* * Flush all dirty buffers associated with a vnode. */ loop2: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { if (waitfor != MNT_WAIT) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL, BO_LOCKPTR(bo)) != 0) { BO_LOCK(bo); goto loop1; } BO_LOCK(bo); } BO_UNLOCK(bo); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if ((bp->b_flags & B_DELWRI) == 0) panic("fsync: not dirty"); if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { vfs_bio_awrite(bp); } else { bremfree(bp); bawrite(bp); } if (maxretry < 1000) pause("dirty", hz < 1000 ? 1 : hz / 1000); BO_LOCK(bo); goto loop2; } /* * If synchronous the caller expects us to completely resolve all * dirty buffers in the system. Wait for in-progress I/O to * complete (which could include background bitmap writes), then * retry if dirty blocks still exist. */ if (waitfor == MNT_WAIT) { bufobj_wwait(bo, 0, 0); if (bo->bo_dirty.bv_cnt > 0) { /* * If we are unable to write any of these buffers * then we fail now rather than trying endlessly * to write them out. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) if ((error = bp->b_error) != 0) break; if ((mp != NULL && mp->mnt_secondary_writes > 0) || (error == 0 && --maxretry >= 0)) goto loop1; if (error == 0) error = EAGAIN; } } BO_UNLOCK(bo); if (error != 0) vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error); return (error); } /* * Copies a byte range from invp to outvp. Calls VOP_COPY_FILE_RANGE() * or vn_generic_copy_file_range() after rangelocking the byte ranges, * to do the actual copy. * vn_generic_copy_file_range() is factored out, so it can be called * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from * different file systems. */ int vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) { struct vattr va; int error; size_t len; uint64_t uvalin, uvalout; len = *lenp; *lenp = 0; /* For error returns. */ error = 0; /* Do some sanity checks on the arguments. 
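vn_copy_file_range() above backs the copy_file_range(2) system call; the sanity checks that follow reject directories (EISDIR), negative or overflowing offsets and non-regular files (EINVAL), and copying a file onto itself (EBADF). A minimal userland sketch of the call, illustrative only and not part of this change; the file names are hypothetical:

/* Illustrative only: driving vn_copy_file_range() via copy_file_range(2). */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int infd = open("/tmp/in.example", O_RDONLY);	/* hypothetical names */
	int outfd = open("/tmp/out.example", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	off_t inoff = 0, outoff = 0;
	ssize_t n;

	if (infd < 0 || outfd < 0)
		return (1);
	/* Copy up to 1 MiB starting at the given offsets; the offsets are
	 * advanced by the number of bytes actually copied. */
	n = copy_file_range(infd, &inoff, outfd, &outoff, 1024 * 1024, 0);
	printf("copied %zd bytes, inoff=%jd outoff=%jd\n",
	    n, (intmax_t)inoff, (intmax_t)outoff);
	return (0);
}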
*/ uvalin = *inoffp; uvalin += len; uvalout = *outoffp; uvalout += len; if (invp->v_type == VDIR || outvp->v_type == VDIR) error = EISDIR; else if (*inoffp < 0 || uvalin > INT64_MAX || uvalin < (uint64_t)*inoffp || *outoffp < 0 || uvalout > INT64_MAX || uvalout < (uint64_t)*outoffp || invp->v_type != VREG || outvp->v_type != VREG) error = EINVAL; else if (invp == outvp) error = EBADF; if (error != 0) goto out; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; /* Check that the offset + len does not go past EOF of invp. */ error = VOP_GETATTR(invp, &va, incred); if (error == 0 && va.va_size < *inoffp + len) error = EINVAL; VOP_UNLOCK(invp, 0); if (error != 0) goto out; /* * If the two vnode are for the same file system, call * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range() * which can handle copies across multiple file systems. */ *lenp = len; if (invp->v_mount == outvp->v_mount) error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp, lenp, flags, incred, outcred, fsize_td); else error = vn_generic_copy_file_range(invp, inoffp, outvp, outoffp, lenp, flags, incred, outcred, fsize_td); out: return (error); } /* * Test len bytes of data starting at dat for all bytes == 0. * Return true if all bytes are zero, false otherwise. * Expects dat to be well aligned. */ static bool mem_iszero(void *dat, int len) { int i; const u_int *p; const char *cp; for (p = dat; len > 0; len -= sizeof(*p), p++) { if (len >= sizeof(*p)) { if (*p != 0) return (false); } else { cp = (const char *)p; for (i = 0; i < len; i++, cp++) if (*cp != '\0') return (false); } } return (true); } /* * Look for a hole in the output file and, if found, adjust *outoffp * and *xferp to skip past the hole. * *xferp is the entire hole length to be written and xfer2 is how many bytes * to be written as 0's upon return. */ static off_t vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp, off_t *dataoffp, off_t *holeoffp, struct ucred *cred) { int error; off_t delta; if (*holeoffp == 0 || *holeoffp <= *outoffp) { *dataoffp = *outoffp; error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred, curthread); if (error == 0) { *holeoffp = *dataoffp; error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred, curthread); } if (error != 0 || *holeoffp == *dataoffp) { /* * Since outvp is unlocked, it may be possible for * another thread to do a truncate(), lseek(), write() * creating a hole at startoff between the above * VOP_IOCTL() calls, if the other thread does not do * rangelocking. * If that happens, *holeoffp == *dataoffp and finding * the hole has failed, so disable vn_skip_hole(). */ *holeoffp = -1; /* Disable use of vn_skip_hole(). */ return (xfer2); } KASSERT(*dataoffp >= *outoffp, ("vn_skip_hole: dataoff=%jd < outoff=%jd", (intmax_t)*dataoffp, (intmax_t)*outoffp)); KASSERT(*holeoffp > *dataoffp, ("vn_skip_hole: holeoff=%jd <= dataoff=%jd", (intmax_t)*holeoffp, (intmax_t)*dataoffp)); } /* * If there is a hole before the data starts, advance *outoffp and * *xferp past the hole. */ if (*dataoffp > *outoffp) { delta = *dataoffp - *outoffp; if (delta >= *xferp) { /* Entire *xferp is a hole. */ *outoffp += *xferp; *xferp = 0; return (0); } *xferp -= delta; *outoffp += delta; xfer2 = MIN(xfer2, *xferp); } /* * If a hole starts before the end of this xfer2, reduce this xfer2 so * that the write ends at the start of the hole. * *holeoffp should always be greater than *outoffp, but for the * non-INVARIANTS case, check this to make sure xfer2 remains a sane * value. 
*/ if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2) xfer2 = *holeoffp - *outoffp; return (xfer2); } /* * Write an xfer sized chunk to outvp in blksize blocks from dat. * dat is a maximum of blksize in length and can be written repeatedly in * the chunk. * If growfile == true, just grow the file via vn_truncate_locked() instead * of doing actual writes. * If checkhole == true, a hole is being punched, so skip over any hole * already in the output file. */ static int vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer, u_long blksize, bool growfile, bool checkhole, struct ucred *cred) { struct mount *mp; off_t dataoff, holeoff, xfer2; int error, lckf; /* * Loop around doing writes of blksize until write has been completed. * Lock/unlock on each loop iteration so that a bwillwrite() can be * done for each iteration, since the xfer argument can be very * large if there is a large hole to punch in the output file. */ error = 0; holeoff = 0; do { xfer2 = MIN(xfer, blksize); if (checkhole) { /* * Punching a hole. Skip writing if there is * already a hole in the output file. */ xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer, &dataoff, &holeoff, cred); if (xfer == 0) break; if (holeoff < 0) checkhole = false; KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd", (intmax_t)xfer2)); } bwillwrite(); mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error == 0) { if (MNT_SHARED_WRITES(mp)) lckf = LK_SHARED; else lckf = LK_EXCLUSIVE; error = vn_lock(outvp, lckf); } if (error == 0) { if (growfile) error = vn_truncate_locked(outvp, outoff + xfer, false, cred); else { error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2, outoff, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, cred, NULL, curthread); outoff += xfer2; xfer -= xfer2; } VOP_UNLOCK(outvp, 0); } if (mp != NULL) vn_finished_write(mp); } while (!growfile && xfer > 0 && error == 0); return (error); } /* * Copy a byte range of one file to another. This function can handle the * case where invp and outvp are on different file systems. * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there * is no better file system specific way to do it. */ int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) { struct vattr va; struct mount *mp; struct uio io; off_t startoff, endoff, xfer, xfer2; u_long blksize; int error; bool cantseek, readzeros; ssize_t aresid; size_t copylen, len, savlen; char *dat; long holein, holeout; holein = holeout = 0; savlen = len = *lenp; error = 0; dat = NULL; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) holein = 0; VOP_UNLOCK(invp, 0); mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error == 0) error = vn_lock(outvp, LK_EXCLUSIVE); if (error == 0) { /* * If fsize_td != NULL, do a vn_rlimit_fsize() call, * now that outvp is locked. */ if (fsize_td != NULL) { io.uio_offset = *outoffp; io.uio_resid = len; error = vn_rlimit_fsize(outvp, &io, fsize_td); if (error != 0) error = EFBIG; } if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) holeout = 0; /* * Holes that are past EOF do not need to be written as a block * of zero bytes. So, truncate the output file as far as * possible and then use va.va_size to decide if writing 0 * bytes is necessary in the loop below. 
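The VOP_PATHCONF(_PC_MIN_HOLE_SIZE) probes above decide whether the copy loop can use FIOSEEKDATA/FIOSEEKHOLE at all, and the larger of the two values later feeds the blksize selection (clamped between 4 KiB and 1 MiB). The same value is visible from userland; a small sketch, illustrative only, with a hypothetical path:

/* Illustrative only: querying the hole granularity consulted above. */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	long hole;

	errno = 0;
	hole = pathconf("/tmp/in.example", _PC_MIN_HOLE_SIZE);	/* hypothetical path */
	if (hole > 0)
		printf("holes reported at %ld-byte granularity\n", hole);
	else if (hole == 0 || errno == EINVAL)
		printf("file system does not report holes\n");
	else
		perror("pathconf");
	return (0);
}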
*/ if (error == 0) error = VOP_GETATTR(outvp, &va, outcred); if (error == 0 && va.va_size > *outoffp && va.va_size <= *outoffp + len) { #ifdef MAC error = mac_vnode_check_write(curthread->td_ucred, outcred, outvp); if (error == 0) #endif error = vn_truncate_locked(outvp, *outoffp, false, outcred); if (error == 0) va.va_size = *outoffp; } VOP_UNLOCK(outvp, 0); } if (mp != NULL) vn_finished_write(mp); if (error != 0) goto out; /* * Set the blksize to the larger of the hole sizes for invp and outvp. * If hole sizes aren't available, set the blksize to the larger * f_iosize of invp and outvp. * This code expects the hole sizes and f_iosizes to be powers of 2. * This value is clipped at 4Kbytes and 1Mbyte. */ blksize = MAX(holein, holeout); if (blksize == 0) blksize = MAX(invp->v_mount->mnt_stat.f_iosize, outvp->v_mount->mnt_stat.f_iosize); if (blksize < 4096) blksize = 4096; else if (blksize > 1024 * 1024) blksize = 1024 * 1024; dat = malloc(blksize, M_TEMP, M_WAITOK); /* * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA * to find holes. Otherwise, just scan the read block for all 0s * in the inner loop where the data copying is done. * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may * support holes on the server, but do not support FIOSEEKHOLE. */ while (len > 0 && error == 0) { endoff = 0; /* To shut up compilers. */ cantseek = true; startoff = *inoffp; copylen = len; /* * Find the next data area. If there is just a hole to EOF, * FIOSEEKDATA should fail and then we drop down into the * inner loop and create the hole on the outvp file. * (I do not know if any file system will report a hole to * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA * will fail for those file systems.) * * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, * the code just falls through to the inner copy loop. */ error = EINVAL; if (holein > 0) error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, incred, curthread); if (error == 0) { endoff = startoff; error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, incred, curthread); /* * Since invp is unlocked, it may be possible for * another thread to do a truncate(), lseek(), write() * creating a hole at startoff between the above * VOP_IOCTL() calls, if the other thread does not do * rangelocking. * If that happens, startoff == endoff and finding * the hole has failed, so set an error. */ if (error == 0 && startoff == endoff) error = EINVAL; /* Any error. Reset to 0. */ } if (error == 0) { if (startoff > *inoffp) { /* Found hole before data block. */ xfer = MIN(startoff - *inoffp, len); if (*outoffp < va.va_size) { /* Must write 0s to punch hole. */ xfer2 = MIN(va.va_size - *outoffp, xfer); memset(dat, 0, MIN(xfer2, blksize)); error = vn_write_outvp(outvp, dat, *outoffp, xfer2, blksize, false, holeout > 0, outcred); } if (error == 0 && *outoffp + xfer > va.va_size && xfer == len) /* Grow last block. */ error = vn_write_outvp(outvp, dat, *outoffp, xfer, blksize, true, false, outcred); if (error == 0) { *inoffp += xfer; *outoffp += xfer; len -= xfer; } } copylen = MIN(len, endoff - startoff); cantseek = false; } else { cantseek = true; startoff = *inoffp; copylen = len; error = 0; } xfer = blksize; if (cantseek) { /* * Set first xfer to end at a block boundary, so that * holes are more likely detected in the loop below via * the for all bytes 0 method. */ xfer -= (*inoffp % blksize); } /* Loop copying the data block. 
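 * When the input file system cannot report holes (cantseek), the first
 * read is trimmed so that later reads start on a blksize boundary.
 * For example, with *inoffp = 5000 and blksize = 4096 the first xfer is
 * 4096 - (5000 % 4096) = 3192 bytes and the next read starts at offset
 * 8192; an aligned read that comes back all zero can then be treated
 * as a hole by mem_iszero().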
*/ while (copylen > 0 && error == 0) { if (copylen < xfer) xfer = copylen; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; error = vn_rdwr(UIO_READ, invp, dat, xfer, startoff, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, incred, &aresid, curthread); VOP_UNLOCK(invp, 0); /* * Linux considers a range that exceeds EOF to * be an error, so we will too. */ if (error == 0 && aresid > 0) error = EINVAL; if (error == 0) { /* * Skip the write for holes past the initial EOF * of the output file, unless this is the last * write of the output file at EOF. */ readzeros = cantseek ? mem_iszero(dat, xfer) : false; if (!cantseek || *outoffp < va.va_size || xfer == len || !readzeros) error = vn_write_outvp(outvp, dat, *outoffp, xfer, blksize, readzeros && xfer == len && *outoffp >= va.va_size, false, outcred); if (error == 0) { *inoffp += xfer; startoff += xfer; *outoffp += xfer; copylen -= xfer; len -= xfer; } } xfer = blksize; } } out: *lenp = savlen - len; free(dat, M_TEMP); return (error); } Index: projects/clang900-import/sys/net/if.c =================================================================== --- projects/clang900-import/sys/net/if.c (revision 352536) +++ projects/clang900-import/sys/net/if.c (revision 352537) @@ -1,4611 +1,4612 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); __read_mostly epoch_t net_epoch_preempt; __read_mostly epoch_t net_epoch; #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ _IOC_NEWTYPE((cmd), struct ifgroupreq32): case #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ (cmd) union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; union ifgroupreq_union { struct ifgroupreq ifgr; #ifdef COMPAT_FREEBSD32 struct ifgroupreq32 ifgr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; 
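/*
 * The knobs above are exported through sysctl(3); the link-state
 * logging switch, for instance, shows up as
 * net.link.log_link_state_change.  A minimal userland sketch for
 * reading it, assuming only the standard sysctlbyname(3) interface
 * (illustration only, not part of this file):
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int v;
 *		size_t len = sizeof(v);
 *
 *		if (sysctlbyname("net.link.log_link_state_change",
 *		    &v, &len, NULL, 0) == 0)
 *			printf("log_link_state_change: %d\n", v);
 *		return (0);
 *	}
 */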
SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void *if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); static int if_detach_internal(struct ifnet *, int, struct if_clone **); #ifdef VIMAGE static void if_vmove(struct ifnet *, struct vnet *); #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); VNET_DEFINE_STATIC(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. 
This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx]); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; ifp = ifnet_byindex_locked(idx); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { NET_EPOCH_EXIT(et); return (NULL); } if_ref(ifp); NET_EPOCH_EXIT(et); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static u_short ifindex_alloc(void **old) { u_short idx; IFNET_WLOCK_ASSERT(); /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { *old = if_grow(); return (USHRT_MAX); } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { V_ifindex_table[idx] = ifp; } struct ifaddr * ifaddr_byindex(u_short idx) { struct epoch_tracker et; struct ifnet *ifp; struct ifaddr *ifa = NULL; NET_EPOCH_ENTER(et); ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); NET_EPOCH_EXIT(et); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. 
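 * For example, code that only has an interface index and needs the
 * ifnet to stay around uses the _ref variant and drops the reference
 * when done (a sketch of the usual pattern, not a specific caller):
 *
 *	struct ifnet *ifp;
 *
 *	ifp = ifnet_byindex_ref(idx);
 *	if (ifp == NULL)
 *		return (ENXIO);
 *	... use ifp ...
 *	if_rele(ifp);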
*/ static void vnet_if_init(const void *unused __unused) { void *old; CK_STAILQ_INIT(&V_ifnet); CK_STAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); old = if_grow(); /* create initial table */ IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; /* Return all inherited interfaces to their parent vnets. */ CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) if_vmove(ifp, ifp->if_home_vnet); } } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_if_return, NULL); #endif static void * if_grow(void) { int oldlim; u_int n; struct ifnet **e; void *old; old = NULL; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return (NULL); } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); old = V_ifindex_table; } V_if_indexlim <<= 1; V_ifindex_table = e; return (old); } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc_domain(u_char type, int numa_domain) { struct ifnet *ifp; u_short idx; void *old; KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large")); if (numa_domain == IF_NODOM) ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK | M_ZERO); else ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET, DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO); restart: IFNET_WLOCK(); idx = ifindex_alloc(&old); if (__predict_false(idx == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; ifp->if_numa_domain = numa_domain; #ifdef VIMAGE ifp->if_vnet = curvnet; #endif if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); CK_STAILQ_INIT(&ifp->if_addrhead); CK_STAILQ_INIT(&ifp->if_multiaddrs); CK_STAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. 
*/ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifp->if_pcp = IFNET_PCP_NONE; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } struct ifnet * if_alloc_dev(u_char type, device_t dev) { int numa_domain; if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0) return (if_alloc_domain(type, IF_NODOM)); return (if_alloc_domain(type, numa_domain)); } struct ifnet * if_alloc(u_char type) { return (if_alloc_domain(type, IF_NODOM)); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp->if_description, M_IFDESCR); free(ifp->if_hw_addr, M_IFADDR); if (ifp->if_numa_domain == IF_NODOM) free(ifp, M_IFNET); else free_domain(ifp, M_IFNET); } static void if_destroy(epoch_context_t ctx) { struct ifnet *ifp; ifp = __containerof(ctx, struct ifnet, if_epoch_ctx); if_free_internal(ifp); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. 
If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. */ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. 
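 * As a rough worked example on a 64-bit build with an Ethernet
 * interface (6-byte link-level address): offsetof(struct sockaddr_dl,
 * sdl_data) is 8, so masklen is 8 + IFNAMSIZ (16) = 24; 24 + 6 is
 * smaller than sizeof(struct sockaddr_dl), so socksize becomes that
 * structure size (54) rounded up to a multiple of sizeof(long), i.e.
 * 56, and ifasize is sizeof(struct ifaddr) plus room for two such
 * sockaddrs.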
*/ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; if (ifp->if_type == IFT_ETHER) { ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR, M_WAITOK | M_ZERO); } #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = CK_STAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_epochalloc(void *dummy __unused) { net_epoch_preempt = epoch_alloc(EPOCH_PREEMPT); net_epoch = epoch_alloc(0); } SYSINIT(ifepochalloc, SI_SUB_TASKQ + 1, SI_ORDER_ANY, if_epochalloc, NULL); static void if_attachdomain(void *dummy) { struct ifnet *ifp; CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. 
*/ IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa; while (1) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) break; } NET_EPOCH_EXIT(et); if (ifa == NULL) break; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; IF_ADDR_WLOCK(ifp); while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) { ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs); CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); if_delmulti_locked(ifp, ifma, 1); } IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); } /* * The vmove flag, if set, indicates that we are called from a callpath * that is moving an interface to a different vnet instance. * * The shutdown flag, if set, indicates that we are called in the * process of shutting down a vnet instance. Currently only the * vnet_if_return SYSUNINIT function sets it. Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destoyed as first thing of teardown. */ static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; #ifdef VIMAGE int shutdown; shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; #endif IFNET_WLOCK(); CK_STAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link); if (!vmove) ifp->if_flags |= IFF_DYING; found = 1; break; } IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. 
*/ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } /* * At this point we know the interface still was on the ifnet list * and we removed it so we are in a stable state. */ #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif epoch_wait_preempt(net_epoch_preempt); /* * Ensure all pending EPOCH(9) callbacks have been executed. This * fixes issues about late destruction of multicast options * which lead to leave group calls, which in turn access the * belonging ifnet structure: */ epoch_drain_callbacks(net_epoch_preempt); /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change? */ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Check if this is a cloned interface or not. Must do even if * shutting down as a if_vmove_reclaim() would move the ifp and * the if_clone_addgroup() will have a corrupted string overwise * from a gibberish pointer. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* Give interface users the chance to clean up. */ EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); /* * In case of a vmove we are done here without error. * If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to cleanup * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Clean up all addresses. */ IF_ADDR_WLOCK(ifp); if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) { ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. 
*/ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; u_int bif_dlt, bif_hdrlen; void *old; int rc; /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. */ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); restart: IFNET_WLOCK(); ifp->if_index = ifindex_alloc(&old); if (__predict_false(ifp->if_index == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland. 
*/ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; CK_STAILQ_INIT(&ifg->ifg_members); CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; int freeifgl; IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } freeifgl = 0; IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); if 
(--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); freeifgl = 1; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); if (freeifgl) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } free(ifgm, M_TEMP); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; int ifglfree; IFNET_WLOCK(); while (!CK_STAILQ_EMPTY(&ifp->if_groups)) { ifgl = CK_STAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); ifglfree = 0; if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); ifglfree = 1; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(ifgm, M_TEMP); if (ifglfree) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } static char * ifgr_group_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]); #endif return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]); } static struct ifg_req * ifgr_groups_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((struct ifg_req *)(uintptr_t) ifgrup->ifgr32.ifgr_ifgru.ifgru_groups); #endif return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups); } /* * Stores all groups from an interface in memory pointed to by ifgr. 
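 * Userland drives this through the SIOCGIFGROUP ioctl in two passes:
 * first with ifgr_len == 0 to learn the required size, then again with
 * a buffer.  A rough sketch of such a caller ("em0" and the socket s
 * are placeholders; ifconfig(8) is the canonical consumer):
 *
 *	struct ifgroupreq ifgr;
 *	struct ifg_req *groups;
 *
 *	memset(&ifgr, 0, sizeof(ifgr));
 *	strlcpy(ifgr.ifgr_name, "em0", sizeof(ifgr.ifgr_name));
 *	if (ioctl(s, SIOCGIFGROUP, &ifgr) == 0 && ifgr.ifgr_len > 0) {
 *		groups = malloc(ifgr.ifgr_len);
 *		ifgr.ifgr_groups = groups;
 *		ioctl(s, SIOCGIFGROUP, &ifgr);
 *	}
 *
 * The first call returns the required length in ifgr_len; the second
 * copies one struct ifg_req per group into the supplied buffer.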
*/ static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { struct epoch_tracker et; int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; if (ifgr->ifgr_len == 0) { NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); NET_EPOCH_EXIT(et); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { NET_EPOCH_EXIT(et); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { NET_EPOCH_EXIT(et); return (error); } len -= sizeof(ifgrq); ifgp++; } NET_EPOCH_EXIT(et); return (0); } /* * Stores all members of a group in memory pointed to by igfr */ static int if_getgroupmembers(struct ifgroupreq *ifgr) { struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. 
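 * This structure is what userland ultimately sees: for AF_LINK entries
 * returned by getifaddrs(3), ifa_data points at a struct if_data.  A
 * small sketch of such a consumer (illustration only):
 *
 *	struct ifaddrs *ifap, *ifa;
 *	struct if_data *ifd;
 *
 *	getifaddrs(&ifap);
 *	for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) {
 *		if (ifa->ifa_addr == NULL ||
 *		    ifa->ifa_addr->sa_family != AF_LINK)
 *			continue;
 *		ifd = ifa->ifa_data;
 *		printf("%s: %ju in, %ju out\n", ifa->ifa_name,
 *		    (uintmax_t)ifd->ifi_ipackets,
 *		    (uintmax_t)ifd->ifi_opackets);
 *	}
 *	freeifaddrs(ifap);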
*/ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { epoch_enter_preempt(net_epoch_preempt, curthread->td_et); } void if_addr_runlock(struct ifnet *ifp) { epoch_exit_preempt(net_epoch_preempt, curthread->td_et); } void if_maddr_rlock(if_t ifp) { epoch_enter_preempt(net_epoch_preempt, curthread->td_et); } void if_maddr_runlock(if_t ifp) { epoch_exit_preempt(net_epoch_preempt, curthread->td_et); } /* * Initialization, destruction and refcounting functions for ifaddrs. 
*/ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } static void ifa_destroy(epoch_context_t ctx) { struct ifaddr *ifa; ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx); counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) epoch_call(net_epoch_preempt, &ifa->ifa_epoch_ctx, ifa_destroy); } static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { struct epoch_tracker et; int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; struct ifnet *ifp; ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; if (cmd == RTM_ADD) { /* explicitly specify (loopback) ifa */ if (info.rti_ifp != NULL) { NET_EPOCH_ENTER(et); info.rti_ifa = ifaof_ifpforaddr(ifa->ifa_addr, info.rti_ifp); if (info.rti_ifa != NULL) ifa_ref(info.rti_ifa); NET_EPOCH_EXIT(et); } } info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); if (error != 0 && !(cmd == RTM_ADD && error == EEXIST) && !(cmd == RTM_DELETE && error == ENOENT)) if_printf(ifp, "%s failed: %d\n", otype, error); return (error); } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. 
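 * Callers are expected to be inside the network epoch; the
 * ifa_ifwithaddr_check() wrapper below shows the usual pattern:
 *
 *	struct epoch_tracker et;
 *
 *	NET_EPOCH_ENTER(et);
 *	ifa = ifa_ifwithaddr(addr);
 *	if (ifa != NULL)
 *		ifa_ref(ifa);
 *	NET_EPOCH_EXIT(et);
 *
 * where the ifa_ref() is only needed if the ifaddr is used after the
 * epoch section ends.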
*/ /*ARGSUSED*/ struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { struct epoch_tracker et; int rc; NET_EPOCH_ENTER(et); rc = (ifa_ifwithaddr(addr) != NULL); NET_EPOCH_EXIT(et); return (rc); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; MPASS(in_epoch(net_epoch_preempt)); /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. */ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } else { /* * Scan all the bits in the ifa's address. 
* If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { ifa_maybe = ifa; } } } } ifa = ifa_maybe; ifa_maybe = NULL; done: return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. 
*/ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_pcp_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) ifp->if_bridge_linkstate(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) if_printf(ifp, "link state changed to %s\n", (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. 
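 *
 * Illustrative sketch of a hypothetical caller ("em0" is an assumption):
 * ifunit_ref() hands back the ifnet with a reference held, which must be
 * dropped with if_rele() when the caller is done; ifunit() returns an
 * unreferenced pointer and relies on the caller to otherwise keep the
 * interface from going away.
 *
 *	struct ifnet *ifp;
 *
 *	ifp = ifunit_ref("em0");
 *	if (ifp != NULL) {
 *		... use ifp ...
 *		if_rele(ifp);
 *	}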
*/ struct ifnet * ifunit_ref(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); NET_EPOCH_EXIT(et); return (ifp); } struct ifnet * ifunit(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } NET_EPOCH_EXIT(et); return (ifp); } static void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } static size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } /* * Hardware specific interface ioctls. */ int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr_buffer_get_length(ifr) < descrlen) ifr_buffer_set_buffer_null(ifr); else error = copyout(ifp->if_description, ifr_buffer_get_buffer(ifr), descrlen); ifr_buffer_set_length(ifr, descrlen); } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. 
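 *
 * Illustrative userland sketch (interface name and text are assumptions,
 * s is any datagram socket): the description travels in ifr_buffer and
 * the length counts the terminating nul:
 *
 *	struct ifreq ifr = { 0 };
 *
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ifr.ifr_buffer.buffer = "uplink to core switch";
 *	ifr.ifr_buffer.length = strlen("uplink to core switch") + 1;
 *	ioctl(s, SIOCSIFDESCR, &ifr);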
*/ if (ifr_buffer_get_length(ifr) > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr_buffer_get_length(ifr) == 0) descrbuf = NULL; else { descrbuf = malloc(ifr_buffer_get_length(ifr), M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr_buffer_get_buffer(ifr), descrbuf, ifr_buffer_get_length(ifr) - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { do_ifup = 1; } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; if (log_promisc_mode_change) if_printf(ifp, "permanently promiscuous mode %s\n", ((new_flags & IFF_PPROMISC) ? "enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } if (do_ifup) if_up(ifp); getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (new_name[IFNAMSIZ-1] != '\0') { new_name[IFNAMSIZ-1] = '\0'; if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } if (strcmp(new_name, ifp->if_xname) == 0) break; if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if_printf(ifp, "changing name to '%s'\n", new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). 
*/ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); #ifdef INET NETDUMP_REINIT(ifp); #endif } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct epoch_tracker et; struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. 
*/ NET_EPOCH_ENTER(et); ifma = if_findmulti(ifp, &ifr->ifr_addr); NET_EPOCH_EXIT(et); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: + case SIOCGIFDOWNREASON: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCGHWADDR: error = if_gethwaddr(ifp, ifr); break; case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): if ((error = if_getgroup((struct ifgroupreq *)data, ifp))) return (error); break; case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif #ifdef COMPAT_FREEBSD32 static void ifmr_init(struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; memcpy(ifmr->ifm_name, ifmr32->ifm_name, sizeof(ifmr->ifm_name)); ifmr->ifm_current = ifmr32->ifm_current; ifmr->ifm_mask = ifmr32->ifm_mask; ifmr->ifm_status = ifmr32->ifm_status; ifmr->ifm_active = ifmr32->ifm_active; ifmr->ifm_count = ifmr32->ifm_count; ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist; } static void ifmr_update(const struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; ifmr32->ifm_current = ifmr->ifm_current; ifmr32->ifm_mask = ifmr->ifm_mask; ifmr32->ifm_status = ifmr->ifm_status; ifmr32->ifm_active = ifmr->ifm_active; ifmr32->ifm_count = ifmr->ifm_count; } #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { #ifdef COMPAT_FREEBSD32 caddr_t saved_data = NULL; struct ifmediareq ifmr; struct ifmediareq *ifmrp; #endif struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; #ifdef VIMAGE int shutdown; #endif CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. */ shutdown = (so->so_vnet->vnet_state > SI_SUB_VNET && so->so_vnet->vnet_state < SI_SUB_VNET_DONE) ? 
1 : 0; if (shutdown) { CURVNET_RESTORE(); return (EBUSY); } #endif switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } #ifdef COMPAT_FREEBSD32 ifmrp = NULL; switch (cmd) { case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmrp = &ifmr; ifmr_init(ifmrp, data); cmd = _IOC_NEWTYPE(cmd, struct ifmediareq); saved_data = data; data = (caddr_t)ifmrp; } #endif ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); goto out_noref; #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr_data_get_ptr(ifr) : NULL); goto out_noref; case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); goto out_noref; case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); goto out_noref; case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); goto out_noref; #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; oif_flags = ifp->if_flags; if (so->so_proto == NULL) { error = EOPNOTSUPP; goto out_ref; } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } out_ref: if_rele(ifp); out_noref: #ifdef COMPAT_FREEBSD32 if (ifmrp != NULL) { KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA), ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx", cmd)); data = saved_data; ifmr_update(ifmrp, data); } #endif CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. 
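 *
 * Illustrative sketch of a hypothetical consumer (the pattern used by
 * packet-capture code): promiscuous mode is switched through matched,
 * reference-counted calls rather than by writing if_flags directly, so
 * that only the first "on" and the last "off" reach the driver:
 *
 *	error = ifpromisc(ifp, 1);
 *	... capture traffic ...
 *	error = ifpromisc(ifp, 0);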
*/ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && log_promisc_mode_change) if_printf(ifp, "promiscuous mode %s\n", (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { struct epoch_tracker et; int addrs; /* * Zero the ifr to make sure we don't disclose the contents * of the stack. 
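 *
 * Illustrative userland sketch (buffer size is an assumption, s is any
 * datagram socket): SIOCGIFCONF copies out packed struct ifreq records
 * and reports the number of valid bytes back through ifc_len; a record
 * grows past sizeof(struct ifreq) when an address is longer than a
 * struct sockaddr:
 *
 *	char buf[8192];
 *	struct ifconf ifc = { .ifc_len = sizeof(buf), .ifc_buf = buf };
 *
 *	ioctl(s, SIOCGIFCONF, &ifc);
 *	... walk the first ifc.ifc_len bytes of buf ...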
*/ memset(&ifr, 0, sizeof(ifr)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { if (sa->sa_len < sizeof(*sa)) { memset(&ifr.ifr_ifru.ifru_addr, 0, sizeof(ifr.ifr_ifru.ifru_addr)); memcpy(&ifr.ifr_ifru.ifru_addr, sa, sa->sa_len); } else ifr.ifr_ifru.ifru_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } NET_EPOCH_EXIT(et); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. 
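 *
 * Illustrative sketch of the release pattern (mirroring what
 * if_delmulti_locked() does below; locking elided): the last reference
 * unlinks the entry and only then hands it to if_freemulti():
 *
 *	if (--ifma->ifma_refcount == 0) {
 *		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr,
 *		    ifma_link);
 *		if_freemulti(ifma);
 *	}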
*/ #ifdef MCAST_VERBOSE extern void kdb_backtrace(void); #endif static void if_freemulti_internal(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); #ifdef MCAST_VERBOSE kdb_backtrace(); printf("%s freeing ifma: %p\n", __func__, ifma); #endif free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } static void if_destroymulti(epoch_context_t ctx) { struct ifmultiaddr *ifma; ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx); if_freemulti_internal(ifma); } void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d", ifma->ifma_refcount)); epoch_call(net_epoch_preempt, &ifma->ifma_epoch_ctx, if_destroymulti); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif #ifdef INET6 IN6_MULTI_LIST_UNLOCK_ASSERT(); #endif /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. 
*/ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } ll_ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct epoch_tracker et; struct ifnet *oifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; NET_EPOCH_EXIT(et); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } void if_delmulti_ifma(struct ifmultiaddr *ifma) { if_delmulti_ifma_flags(ifma, 0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) { struct ifnet *ifp; int lastref; MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma); #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct epoch_tracker et; struct ifnet *oifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; NET_EPOCH_EXIT(et); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. 
*/ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, flags); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : ""); /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } } if_freemulti(ll_ifma); } } #ifdef INVARIANTS if (ifp) { struct ifmultiaddr *ifmatmp; CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link) MPASS(ifma != ifmatmp); } #endif if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. 
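 *
 * Illustrative sketch of a hypothetical caller (Ethernet assumed, so the
 * length must be ETHER_ADDR_LEN from net/ethernet.h; the address shown is
 * a made-up locally administered MAC):
 *
 *	u_char lla[ETHER_ADDR_LEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *
 *	error = if_setlladdr(ifp, lla, ETHER_ADDR_LEN);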
* * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; struct epoch_tracker et; int rc; rc = 0; NET_EPOCH_ENTER(et); ifa = ifp->if_addr; if (ifa == NULL) { rc = EINVAL; goto out; } sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { rc = EINVAL; goto out; } if (len != sdl->sdl_alen) { /* don't allow length to change */ rc = EINVAL; goto out; } switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); break; default: rc = ENODEV; goto out; } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ NET_EPOCH_EXIT(et); if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); out: NET_EPOCH_EXIT(et); return (rc); } /* * Compat function for handling basic encapsulation requests. * Not converted stacks (FDDI, IB, ..) supports traditional * output model: ARP (and other similar L2 protocols) are handled * inside output routine, arpresolve/nd6_resolve() returns MAC * address instead of full prepend. * * This function creates calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Tunnel interfaces can nest, also they may cause infinite recursion * calls when misconfigured. We'll prevent this by detecting loops. * High nesting level may cause stack exhaustion. We'll prevent this * by introducing upper limit. * * Return 0, if tunnel nesting count is equal or less than limit. */ int if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie, int limit) { struct m_tag *mtag; int count; count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp)); return (EIO); } count++; } if (count > limit) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", if_name(ifp), count); return (EIO); } mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. 
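 *
 * Illustrative userland sketch (interface name is an assumption, s is any
 * datagram socket): SIOCGHWADDR reports the address read at attach, even
 * if a different one was set later with SIOCSIFLLADDR:
 *
 *	struct ifreq ifr = { 0 };
 *
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(s, SIOCGHWADDR, &ifr);
 *	... hardware address is in ifr.ifr_addr.sa_data ...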
*/ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char *fmt, ...) { char if_fmt[256]; va_list ap; snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt); va_start(ap, fmt); vlog(LOG_INFO, if_fmt, ap); va_end(ap); return (0); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int 
if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt 
== max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg) { struct ifmultiaddr *ifma; int cnt = 0; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) cnt += filter(arg, ifma, cnt); if_maddr_runlock(ifp); return (cnt); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount; return (0); } int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize) { ((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize; return (0); } u_int if_gethwtsomax(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomax); } u_int if_gethwtsomaxsegcount(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount); } u_int if_gethwtsomaxsegsize(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. 
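 *
 * Illustrative sketch of a hypothetical multiqueue transmit path (txq and
 * its buf_ring are assumptions): a driver built against these wrappers
 * enqueues packets and drains the ring toward the hardware:
 *
 *	error = drbr_enqueue_drv(ifp, txq->br, m);
 *	while ((m = drbr_dequeue_drv(ifp, txq->br)) != NULL)
 *		... program the descriptor ring with m ...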
*/ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: projects/clang900-import/sys/net/if.h =================================================================== --- projects/clang900-import/sys/net/if.h (revision 352536) +++ projects/clang900-import/sys/net/if.h (revision 352537) @@ -1,610 +1,620 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NET_IF_H_ #define _NET_IF_H_ #include #if __BSD_VISIBLE /* * does not depend on on most other systems. This * helps userland compatibility. (struct timeval ifi_lastchange) * The same holds for . (struct sockaddr ifru_addr) */ #ifndef _KERNEL #include #include #endif #endif /* * Length of interface external name, including terminating '\0'. * Note: this is the same size as a generic device's external name. */ #define IF_NAMESIZE 16 #if __BSD_VISIBLE #define IFNAMSIZ IF_NAMESIZE #define IF_MAXUNIT 0x7fff /* historical value */ #endif #if __BSD_VISIBLE /* * Structure used to query names of interface cloners. */ struct if_clonereq { int ifcr_total; /* total cloners (out) */ int ifcr_count; /* room for this many in user buffer */ char *ifcr_buffer; /* buffer for cloner names */ }; /* * Structure describing information about an interface * which may be of interest to management entities. 
*/ struct if_data { /* generic interface information */ uint8_t ifi_type; /* ethernet, tokenring, etc */ uint8_t ifi_physical; /* e.g., AUI, Thinnet, 10base-T, etc */ uint8_t ifi_addrlen; /* media address length */ uint8_t ifi_hdrlen; /* media header length */ uint8_t ifi_link_state; /* current link state */ uint8_t ifi_vhid; /* carp vhid */ uint16_t ifi_datalen; /* length of this data struct */ uint32_t ifi_mtu; /* maximum transmission unit */ uint32_t ifi_metric; /* routing metric (external only) */ uint64_t ifi_baudrate; /* linespeed */ /* volatile statistics */ uint64_t ifi_ipackets; /* packets received on interface */ uint64_t ifi_ierrors; /* input errors on interface */ uint64_t ifi_opackets; /* packets sent on interface */ uint64_t ifi_oerrors; /* output errors on interface */ uint64_t ifi_collisions; /* collisions on csma interfaces */ uint64_t ifi_ibytes; /* total number of octets received */ uint64_t ifi_obytes; /* total number of octets sent */ uint64_t ifi_imcasts; /* packets received via multicast */ uint64_t ifi_omcasts; /* packets sent via multicast */ uint64_t ifi_iqdrops; /* dropped on input */ uint64_t ifi_oqdrops; /* dropped on output */ uint64_t ifi_noproto; /* destined for unsupported protocol */ uint64_t ifi_hwassist; /* HW offload capabilities, see IFCAP */ /* Unions are here to make sizes MI. */ union { /* uptime at attach or stat reset */ time_t tt; uint64_t ph; } __ifi_epoch; #define ifi_epoch __ifi_epoch.tt union { /* time of last administrative change */ struct timeval tv; struct { uint64_t ph1; uint64_t ph2; } ph; } __ifi_lastchange; #define ifi_lastchange __ifi_lastchange.tv }; /*- * Interface flags are of two types: network stack owned flags, and driver * owned flags. Historically, these values were stored in the same ifnet * flags field, but with the advent of fine-grained locking, they have been * broken out such that the network stack is responsible for synchronizing * the stack-owned fields, and the device driver the device-owned fields. * Both halves can perform lockless reads of the other half's field, subject * to accepting the involved races. * * Both sets of flags come from the same number space, and should not be * permitted to conflict, as they are exposed to user space via a single * field. * * The following symbols identify read and write requirements for fields: * * (i) if_flags field set by device driver before attach, read-only there * after. * (n) if_flags field written only by the network stack, read by either the * stack or driver. * (d) if_drv_flags field written only by the device driver, read by either * the stack or driver. 
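 *
 * Illustrative sketch (hypothetical driver init and stop paths): a driver
 * confines itself to the driver-owned half and leaves if_flags to the
 * stack:
 *
 *	ifp->if_drv_flags |= IFF_DRV_RUNNING;	(init: resources allocated)
 *	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 *	...
 *	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;	(stop)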
*/ #define IFF_UP 0x1 /* (n) interface is up */ #define IFF_BROADCAST 0x2 /* (i) broadcast address valid */ #define IFF_DEBUG 0x4 /* (n) turn on debugging */ #define IFF_LOOPBACK 0x8 /* (i) is a loopback net */ #define IFF_POINTOPOINT 0x10 /* (i) is a point-to-point link */ /* 0x20 was IFF_SMART */ #define IFF_DRV_RUNNING 0x40 /* (d) resources allocated */ #define IFF_NOARP 0x80 /* (n) no address resolution protocol */ #define IFF_PROMISC 0x100 /* (n) receive all packets */ #define IFF_ALLMULTI 0x200 /* (n) receive all multicast packets */ #define IFF_DRV_OACTIVE 0x400 /* (d) tx hardware queue is full */ #define IFF_SIMPLEX 0x800 /* (i) can't hear own transmissions */ #define IFF_LINK0 0x1000 /* per link layer defined bit */ #define IFF_LINK1 0x2000 /* per link layer defined bit */ #define IFF_LINK2 0x4000 /* per link layer defined bit */ #define IFF_ALTPHYS IFF_LINK2 /* use alternate physical connection */ #define IFF_MULTICAST 0x8000 /* (i) supports multicast */ #define IFF_CANTCONFIG 0x10000 /* (i) unconfigurable using ioctl(2) */ #define IFF_PPROMISC 0x20000 /* (n) user-requested promisc mode */ #define IFF_MONITOR 0x40000 /* (n) user-requested monitor mode */ #define IFF_STATICARP 0x80000 /* (n) static ARP */ #define IFF_DYING 0x200000 /* (n) interface is winding down */ #define IFF_RENAMING 0x400000 /* (n) interface is being renamed */ #define IFF_NOGROUP 0x800000 /* (n) interface is not part of any groups */ /* * Old names for driver flags so that user space tools can continue to use * the old (portable) names. */ #ifndef _KERNEL #define IFF_RUNNING IFF_DRV_RUNNING #define IFF_OACTIVE IFF_DRV_OACTIVE #endif /* flags set internally only: */ #define IFF_CANTCHANGE \ (IFF_BROADCAST|IFF_POINTOPOINT|IFF_DRV_RUNNING|IFF_DRV_OACTIVE|\ IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_PROMISC|\ IFF_DYING|IFF_CANTCONFIG) /* * Values for if_link_state. */ #define LINK_STATE_UNKNOWN 0 /* link invalid/unknown */ #define LINK_STATE_DOWN 1 /* link is down */ #define LINK_STATE_UP 2 /* link is up */ /* * Some convenience macros used for setting ifi_baudrate. * XXX 1000 vs. 1024? --thorpej@netbsd.org */ #define IF_Kbps(x) ((uintmax_t)(x) * 1000) /* kilobits/sec. */ #define IF_Mbps(x) (IF_Kbps((x) * 1000)) /* megabits/sec. */ #define IF_Gbps(x) (IF_Mbps((x) * 1000)) /* gigabits/sec. */ /* * Capabilities that interfaces can advertise. * * struct ifnet.if_capabilities * contains the optional features & capabilities a particular interface * supports (not only the driver but also the detected hw revision). * Capabilities are defined by IFCAP_* below. * struct ifnet.if_capenable * contains the enabled (either by default or through ifconfig) optional * features & capabilities on this interface. * Capabilities are defined by IFCAP_* below. * struct if_data.ifi_hwassist in mbuf CSUM_ flag form, controlled by above * contains the enabled optional feature & capabilites that can be used * individually per packet and are specified in the mbuf pkthdr.csum_flags * field. IFCAP_* and CSUM_* do not match one to one and CSUM_* may be * more detailed or differenciated than IFCAP_*. * Hwassist features are defined CSUM_* in sys/mbuf.h * * Capabilities that cannot be arbitrarily changed with ifconfig/ioctl * are listed in IFCAP_CANTCHANGE, similar to IFF_CANTCHANGE. * This is not strictly necessary because the common code never * changes capabilities, and it is left to the individual driver * to do the right thing. However, having the filter here * avoids replication of the same code in all individual drivers. 
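 *
 * Illustrative sketch (hypothetical Ethernet driver at attach time; the
 * flag choice is an assumption): advertise what the hardware can do,
 * enable a default subset, and keep if_hwassist in step with the enabled
 * checksum capabilities:
 *
 *	ifp->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_MTU | IFCAP_JUMBO_MTU;
 *	ifp->if_capenable = ifp->if_capabilities;
 *	ifp->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP;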
*/ #define IFCAP_RXCSUM 0x00001 /* can offload checksum on RX */ #define IFCAP_TXCSUM 0x00002 /* can offload checksum on TX */ #define IFCAP_NETCONS 0x00004 /* can be a network console */ #define IFCAP_VLAN_MTU 0x00008 /* VLAN-compatible MTU */ #define IFCAP_VLAN_HWTAGGING 0x00010 /* hardware VLAN tag support */ #define IFCAP_JUMBO_MTU 0x00020 /* 9000 byte MTU supported */ #define IFCAP_POLLING 0x00040 /* driver supports polling */ #define IFCAP_VLAN_HWCSUM 0x00080 /* can do IFCAP_HWCSUM on VLANs */ #define IFCAP_TSO4 0x00100 /* can do TCP Segmentation Offload */ #define IFCAP_TSO6 0x00200 /* can do TCP6 Segmentation Offload */ #define IFCAP_LRO 0x00400 /* can do Large Receive Offload */ #define IFCAP_WOL_UCAST 0x00800 /* wake on any unicast frame */ #define IFCAP_WOL_MCAST 0x01000 /* wake on any multicast frame */ #define IFCAP_WOL_MAGIC 0x02000 /* wake on any Magic Packet */ #define IFCAP_TOE4 0x04000 /* interface can offload TCP */ #define IFCAP_TOE6 0x08000 /* interface can offload TCP6 */ #define IFCAP_VLAN_HWFILTER 0x10000 /* interface hw can filter vlan tag */ /* available 0x20000 */ #define IFCAP_VLAN_HWTSO 0x40000 /* can do IFCAP_TSO on VLANs */ #define IFCAP_LINKSTATE 0x80000 /* the runtime link state is dynamic */ #define IFCAP_NETMAP 0x100000 /* netmap mode supported/enabled */ #define IFCAP_RXCSUM_IPV6 0x200000 /* can offload checksum on IPv6 RX */ #define IFCAP_TXCSUM_IPV6 0x400000 /* can offload checksum on IPv6 TX */ #define IFCAP_HWSTATS 0x800000 /* manages counters internally */ #define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */ #define IFCAP_HWRXTSTMP 0x2000000 /* hardware rx timestamping */ #define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */ #define IFCAP_TXTLS4 0x8000000 /* can do TLS encryption and segmentation for TCP */ #define IFCAP_TXTLS6 0x10000000 /* can do TLS encryption and segmentation for TCP6 */ #define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6) #define IFCAP_HWCSUM (IFCAP_RXCSUM | IFCAP_TXCSUM) #define IFCAP_TSO (IFCAP_TSO4 | IFCAP_TSO6) #define IFCAP_WOL (IFCAP_WOL_UCAST | IFCAP_WOL_MCAST | IFCAP_WOL_MAGIC) #define IFCAP_TOE (IFCAP_TOE4 | IFCAP_TOE6) #define IFCAP_TXTLS (IFCAP_TXTLS4 | IFCAP_TXTLS6) #define IFCAP_CANTCHANGE (IFCAP_NETMAP) #define IFQ_MAXLEN 50 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ /* * Message format for use in obtaining information about interfaces * from getkerninfo and the routing socket * For the new, extensible interface see struct if_msghdrl below. */ struct if_msghdr { u_short ifm_msglen; /* to skip over non-understood messages */ u_char ifm_version; /* future binary compatibility */ u_char ifm_type; /* message type */ int ifm_addrs; /* like rtm_addrs */ int ifm_flags; /* value of if_flags */ u_short ifm_index; /* index for associated ifp */ u_short _ifm_spare1; struct if_data ifm_data;/* statistics and other data about if */ }; /* * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL. It is * extensible after ifm_data_off or within ifm_data. Both the if_msghdr and * if_data now have a member field detailing the struct length in addition to * the routing message length. Macros are provided to find the start of * ifm_data and the start of the socket address strucutres immediately following * struct if_msghdrl given a pointer to struct if_msghdrl. 
*/ #define IF_MSGHDRL_IFM_DATA(_l) \ (struct if_data *)((char *)(_l) + (_l)->ifm_data_off) #define IF_MSGHDRL_RTA(_l) \ (void *)((uintptr_t)(_l) + (_l)->ifm_len) struct if_msghdrl { u_short ifm_msglen; /* to skip over non-understood messages */ u_char ifm_version; /* future binary compatibility */ u_char ifm_type; /* message type */ int ifm_addrs; /* like rtm_addrs */ int ifm_flags; /* value of if_flags */ u_short ifm_index; /* index for associated ifp */ u_short _ifm_spare1; /* spare space to grow if_index, see if_var.h */ u_short ifm_len; /* length of if_msghdrl incl. if_data */ u_short ifm_data_off; /* offset of if_data from beginning */ int _ifm_spare2; struct if_data ifm_data;/* statistics and other data about if */ }; /* * Message format for use in obtaining information about interface addresses * from getkerninfo and the routing socket * For the new, extensible interface see struct ifa_msghdrl below. */ struct ifa_msghdr { u_short ifam_msglen; /* to skip over non-understood messages */ u_char ifam_version; /* future binary compatibility */ u_char ifam_type; /* message type */ int ifam_addrs; /* like rtm_addrs */ int ifam_flags; /* value of ifa_flags */ u_short ifam_index; /* index for associated ifp */ u_short _ifam_spare1; int ifam_metric; /* value of ifa_ifp->if_metric */ }; /* * The 'l' version shall be used by new interfaces, like NET_RT_IFLISTL. It is * extensible after ifam_metric or within ifam_data. Both the ifa_msghdrl and * if_data now have a member field detailing the struct length in addition to * the routing message length. Macros are provided to find the start of * ifm_data and the start of the socket address strucutres immediately following * struct ifa_msghdrl given a pointer to struct ifa_msghdrl. */ #define IFA_MSGHDRL_IFAM_DATA(_l) \ (struct if_data *)((char *)(_l) + (_l)->ifam_data_off) #define IFA_MSGHDRL_RTA(_l) \ (void *)((uintptr_t)(_l) + (_l)->ifam_len) struct ifa_msghdrl { u_short ifam_msglen; /* to skip over non-understood messages */ u_char ifam_version; /* future binary compatibility */ u_char ifam_type; /* message type */ int ifam_addrs; /* like rtm_addrs */ int ifam_flags; /* value of ifa_flags */ u_short ifam_index; /* index for associated ifp */ u_short _ifam_spare1; /* spare space to grow if_index, see if_var.h */ u_short ifam_len; /* length of ifa_msghdrl incl. if_data */ u_short ifam_data_off; /* offset of if_data from beginning */ int ifam_metric; /* value of ifa_ifp->if_metric */ struct if_data ifam_data;/* statistics and other data about if or * address */ }; /* * Message format for use in obtaining information about multicast addresses * from the routing socket */ struct ifma_msghdr { u_short ifmam_msglen; /* to skip over non-understood messages */ u_char ifmam_version; /* future binary compatibility */ u_char ifmam_type; /* message type */ int ifmam_addrs; /* like rtm_addrs */ int ifmam_flags; /* value of ifa_flags */ u_short ifmam_index; /* index for associated ifp */ u_short _ifmam_spare1; }; /* * Message format announcing the arrival or departure of a network interface. */ struct if_announcemsghdr { u_short ifan_msglen; /* to skip over non-understood messages */ u_char ifan_version; /* future binary compatibility */ u_char ifan_type; /* message type */ u_short ifan_index; /* index for associated ifp */ char ifan_name[IFNAMSIZ]; /* if name, e.g. 
"en0" */ u_short ifan_what; /* what type of announcement */ }; #define IFAN_ARRIVAL 0 /* interface arrival */ #define IFAN_DEPARTURE 1 /* interface departure */ /* * Buffer with length to be used in SIOCGIFDESCR/SIOCSIFDESCR requests */ struct ifreq_buffer { size_t length; void *buffer; }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; caddr_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ #define ifr_broadaddr ifr_ifru.ifru_broadaddr /* broadcast address */ #ifndef _KERNEL #define ifr_buffer ifr_ifru.ifru_buffer /* user supplied buffer with its length */ #endif #define ifr_flags ifr_ifru.ifru_flags[0] /* flags (low 16 bits) */ #define ifr_flagshigh ifr_ifru.ifru_flags[1] /* flags (high 16 bits) */ #define ifr_jid ifr_ifru.ifru_jid /* jail/vnet */ #define ifr_metric ifr_ifru.ifru_metric /* metric */ #define ifr_mtu ifr_ifru.ifru_mtu /* mtu */ #define ifr_phys ifr_ifru.ifru_phys /* physical wire */ #define ifr_media ifr_ifru.ifru_media /* physical media */ #ifndef _KERNEL #define ifr_data ifr_ifru.ifru_data /* for use by interface */ #endif #define ifr_reqcap ifr_ifru.ifru_cap[0] /* requested capabilities */ #define ifr_curcap ifr_ifru.ifru_cap[1] /* current capabilities */ #define ifr_index ifr_ifru.ifru_index /* interface index */ #define ifr_fib ifr_ifru.ifru_fib /* interface fib */ #define ifr_vlan_pcp ifr_ifru.ifru_vlan_pcp /* VLAN priority */ #define ifr_lan_pcp ifr_ifru.ifru_vlan_pcp /* VLAN priority */ }; #define _SIZEOF_ADDR_IFREQ(ifr) \ ((ifr).ifr_addr.sa_len > sizeof(struct sockaddr) ? \ (sizeof(struct ifreq) - sizeof(struct sockaddr) + \ (ifr).ifr_addr.sa_len) : sizeof(struct ifreq)) struct ifaliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr ifra_addr; struct sockaddr ifra_broadaddr; struct sockaddr ifra_mask; int ifra_vhid; }; /* 9.x compat */ struct oifaliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr ifra_addr; struct sockaddr ifra_broadaddr; struct sockaddr ifra_mask; }; struct ifmediareq { char ifm_name[IFNAMSIZ]; /* if name, e.g. "en0" */ int ifm_current; /* current media options */ int ifm_mask; /* don't care mask */ int ifm_status; /* media status */ int ifm_active; /* active options */ int ifm_count; /* # entries in ifm_ulist array */ int *ifm_ulist; /* media words */ }; struct ifdrv { char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */ unsigned long ifd_cmd; size_t ifd_len; void *ifd_data; }; /* * Structure used to retrieve aux status data from interfaces. * Kernel suppliers to this interface should respect the formatting * needed by ifconfig(8): each line starts with a TAB and ends with * a newline. The canonical example to copy and paste is in if_tun.c. */ #define IFSTATMAX 800 /* 10 lines of text */ struct ifstat { char ifs_name[IFNAMSIZ]; /* if name, e.g. "en0" */ char ascii[IFSTATMAX + 1]; }; /* * Structure used in SIOCGIFCONF request. 
* Used to retrieve interface configuration * for machine (useful for programs which * must know all networks accessible). */ struct ifconf { int ifc_len; /* size of associated buffer */ union { caddr_t ifcu_buf; struct ifreq *ifcu_req; } ifc_ifcu; #define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */ #define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */ }; /* * interface groups */ #define IFG_ALL "all" /* group contains all interfaces */ /* XXX: will we implement this? */ #define IFG_EGRESS "egress" /* if(s) default route(s) point to */ struct ifg_req { union { char ifgrqu_group[IFNAMSIZ]; char ifgrqu_member[IFNAMSIZ]; } ifgrq_ifgrqu; #define ifgrq_group ifgrq_ifgrqu.ifgrqu_group #define ifgrq_member ifgrq_ifgrqu.ifgrqu_member }; /* * Used to lookup groups for an interface */ struct ifgroupreq { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; struct ifg_req *ifgru_groups; } ifgr_ifgru; #ifndef _KERNEL #define ifgr_group ifgr_ifgru.ifgru_group #define ifgr_groups ifgr_ifgru.ifgru_groups #endif }; /* * Structure used to request i2c data * from interface transceivers. */ struct ifi2creq { uint8_t dev_addr; /* i2c address (0xA0, 0xA2) */ uint8_t offset; /* read offset */ uint8_t len; /* read length */ uint8_t spare0; uint32_t spare1; uint8_t data[8]; /* read buffer */ }; /* * RSS hash. */ #define RSS_FUNC_NONE 0 /* RSS disabled */ #define RSS_FUNC_PRIVATE 1 /* non-standard */ #define RSS_FUNC_TOEPLITZ 2 #define RSS_TYPE_IPV4 0x00000001 #define RSS_TYPE_TCP_IPV4 0x00000002 #define RSS_TYPE_IPV6 0x00000004 #define RSS_TYPE_IPV6_EX 0x00000008 #define RSS_TYPE_TCP_IPV6 0x00000010 #define RSS_TYPE_TCP_IPV6_EX 0x00000020 #define RSS_TYPE_UDP_IPV4 0x00000040 #define RSS_TYPE_UDP_IPV6 0x00000080 #define RSS_TYPE_UDP_IPV6_EX 0x00000100 #define RSS_KEYLEN 128 struct ifrsskey { char ifrk_name[IFNAMSIZ]; /* if name, e.g. "en0" */ uint8_t ifrk_func; /* RSS_FUNC_ */ uint8_t ifrk_spare0; uint16_t ifrk_keylen; uint8_t ifrk_key[RSS_KEYLEN]; }; struct ifrsshash { char ifrh_name[IFNAMSIZ]; /* if name, e.g. "en0" */ uint8_t ifrh_func; /* RSS_FUNC_ */ uint8_t ifrh_spare0; uint16_t ifrh_spare1; uint32_t ifrh_types; /* RSS_TYPE_ */ }; #define IFNET_PCP_NONE 0xff /* PCP disabled */ +#define IFDR_MSG_SIZE 64 +#define IFDR_REASON_MSG 1 +#define IFDR_REASON_VENDOR 2 +struct ifdownreason { + char ifdr_name[IFNAMSIZ]; + uint32_t ifdr_reason; + uint32_t ifdr_vendor; + char ifdr_msg[IFDR_MSG_SIZE]; +}; + #endif /* __BSD_VISIBLE */ #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_IFADDR); MALLOC_DECLARE(M_IFMADDR); #endif #endif #ifndef _KERNEL struct if_nameindex { unsigned int if_index; /* 1, 2, ... */ char *if_name; /* null terminated name: "le0", ... */ }; __BEGIN_DECLS void if_freenameindex(struct if_nameindex *); char *if_indextoname(unsigned int, char *); struct if_nameindex *if_nameindex(void); unsigned int if_nametoindex(const char *); __END_DECLS #endif #endif /* !_NET_IF_H_ */ Index: projects/clang900-import/sys/netinet/sctp_auth.c =================================================================== --- projects/clang900-import/sys/netinet/sctp_auth.c (revision 352536) +++ projects/clang900-import/sys/netinet/sctp_auth.c (revision 352537) @@ -1,2011 +1,2011 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #ifdef SCTP_DEBUG #define SCTP_AUTH_DEBUG (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_AUTH1) #define SCTP_AUTH_DEBUG2 (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_AUTH2) #endif /* SCTP_DEBUG */ void sctp_clear_chunklist(sctp_auth_chklist_t *chklist) { memset(chklist, 0, sizeof(*chklist)); /* chklist->num_chunks = 0; */ } sctp_auth_chklist_t * sctp_alloc_chunklist(void) { sctp_auth_chklist_t *chklist; SCTP_MALLOC(chklist, sctp_auth_chklist_t *, sizeof(*chklist), SCTP_M_AUTH_CL); if (chklist == NULL) { SCTPDBG(SCTP_DEBUG_AUTH1, "sctp_alloc_chunklist: failed to get memory!\n"); } else { sctp_clear_chunklist(chklist); } return (chklist); } void sctp_free_chunklist(sctp_auth_chklist_t *list) { if (list != NULL) SCTP_FREE(list, SCTP_M_AUTH_CL); } sctp_auth_chklist_t * sctp_copy_chunklist(sctp_auth_chklist_t *list) { sctp_auth_chklist_t *new_list; if (list == NULL) return (NULL); /* get a new list */ new_list = sctp_alloc_chunklist(); if (new_list == NULL) return (NULL); /* copy it */ memcpy(new_list, list, sizeof(*new_list)); return (new_list); } /* * add a chunk to the required chunks list */ int sctp_auth_add_chunk(uint8_t chunk, sctp_auth_chklist_t *list) { if (list == NULL) return (-1); /* is chunk restricted? 
*/ if ((chunk == SCTP_INITIATION) || (chunk == SCTP_INITIATION_ACK) || (chunk == SCTP_SHUTDOWN_COMPLETE) || (chunk == SCTP_AUTHENTICATION)) { return (-1); } if (list->chunks[chunk] == 0) { list->chunks[chunk] = 1; list->num_chunks++; SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: added chunk %u (0x%02x) to Auth list\n", chunk, chunk); } return (0); } /* * delete a chunk from the required chunks list */ int sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t *list) { if (list == NULL) return (-1); if (list->chunks[chunk] == 1) { list->chunks[chunk] = 0; list->num_chunks--; SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: deleted chunk %u (0x%02x) from Auth list\n", chunk, chunk); } return (0); } size_t sctp_auth_get_chklist_size(const sctp_auth_chklist_t *list) { if (list == NULL) return (0); else return (list->num_chunks); } /* * return the current number and list of required chunks caller must * guarantee ptr has space for up to 256 bytes */ int sctp_serialize_auth_chunks(const sctp_auth_chklist_t *list, uint8_t *ptr) { int i, count = 0; if (list == NULL) return (0); for (i = 0; i < 256; i++) { if (list->chunks[i] != 0) { *ptr++ = i; count++; } } return (count); } int sctp_pack_auth_chunks(const sctp_auth_chklist_t *list, uint8_t *ptr) { int i, size = 0; if (list == NULL) return (0); if (list->num_chunks <= 32) { /* just list them, one byte each */ for (i = 0; i < 256; i++) { if (list->chunks[i] != 0) { *ptr++ = i; size++; } } } else { int index, offset; /* pack into a 32 byte bitfield */ for (i = 0; i < 256; i++) { if (list->chunks[i] != 0) { index = i / 8; offset = i % 8; ptr[index] |= (1 << offset); } } size = 32; } return (size); } int sctp_unpack_auth_chunks(const uint8_t *ptr, uint8_t num_chunks, sctp_auth_chklist_t *list) { int i; int size; if (list == NULL) return (0); if (num_chunks <= 32) { /* just pull them, one byte each */ for (i = 0; i < num_chunks; i++) { (void)sctp_auth_add_chunk(*ptr++, list); } size = num_chunks; } else { int index, offset; /* unpack from a 32 byte bitfield */ for (index = 0; index < 32; index++) { for (offset = 0; offset < 8; offset++) { if (ptr[index] & (1 << offset)) { (void)sctp_auth_add_chunk((index * 8) + offset, list); } } } size = 32; } return (size); } /* * allocate structure space for a key of length keylen */ sctp_key_t * sctp_alloc_key(uint32_t keylen) { sctp_key_t *new_key; SCTP_MALLOC(new_key, sctp_key_t *, sizeof(*new_key) + keylen, SCTP_M_AUTH_KY); if (new_key == NULL) { /* out of memory */ return (NULL); } new_key->keylen = keylen; return (new_key); } void sctp_free_key(sctp_key_t *key) { if (key != NULL) SCTP_FREE(key, SCTP_M_AUTH_KY); } void sctp_print_key(sctp_key_t *key, const char *str) { uint32_t i; if (key == NULL) { SCTP_PRINTF("%s: [Null key]\n", str); return; } SCTP_PRINTF("%s: len %u, ", str, key->keylen); if (key->keylen) { for (i = 0; i < key->keylen; i++) SCTP_PRINTF("%02x", key->key[i]); SCTP_PRINTF("\n"); } else { SCTP_PRINTF("[Null key]\n"); } } void sctp_show_key(sctp_key_t *key, const char *str) { uint32_t i; if (key == NULL) { SCTP_PRINTF("%s: [Null key]\n", str); return; } SCTP_PRINTF("%s: len %u, ", str, key->keylen); if (key->keylen) { for (i = 0; i < key->keylen; i++) SCTP_PRINTF("%02x", key->key[i]); SCTP_PRINTF("\n"); } else { SCTP_PRINTF("[Null key]\n"); } } static uint32_t sctp_get_keylen(sctp_key_t *key) { if (key != NULL) return (key->keylen); else return (0); } /* * generate a new random key of length 'keylen' */ sctp_key_t * sctp_generate_random_key(uint32_t keylen) { sctp_key_t *new_key; new_key = sctp_alloc_key(keylen); if 
(new_key == NULL) { /* out of memory */ return (NULL); } SCTP_READ_RANDOM(new_key->key, keylen); new_key->keylen = keylen; return (new_key); } sctp_key_t * sctp_set_key(uint8_t *key, uint32_t keylen) { sctp_key_t *new_key; new_key = sctp_alloc_key(keylen); if (new_key == NULL) { /* out of memory */ return (NULL); } memcpy(new_key->key, key, keylen); return (new_key); } /*- * given two keys of variable size, compute which key is "larger/smaller" * returns: 1 if key1 > key2 * -1 if key1 < key2 * 0 if key1 = key2 */ static int sctp_compare_key(sctp_key_t *key1, sctp_key_t *key2) { uint32_t maxlen; uint32_t i; uint32_t key1len, key2len; uint8_t *key_1, *key_2; uint8_t val1, val2; /* sanity/length check */ key1len = sctp_get_keylen(key1); key2len = sctp_get_keylen(key2); if ((key1len == 0) && (key2len == 0)) return (0); else if (key1len == 0) return (-1); else if (key2len == 0) return (1); if (key1len < key2len) { maxlen = key2len; } else { maxlen = key1len; } key_1 = key1->key; key_2 = key2->key; /* check for numeric equality */ for (i = 0; i < maxlen; i++) { /* left-pad with zeros */ val1 = (i < (maxlen - key1len)) ? 0 : *(key_1++); val2 = (i < (maxlen - key2len)) ? 0 : *(key_2++); if (val1 > val2) { return (1); } else if (val1 < val2) { return (-1); } } /* keys are equal value, so check lengths */ if (key1len == key2len) return (0); else if (key1len < key2len) return (-1); else return (1); } /* * generate the concatenated keying material based on the two keys and the * shared key (if available). draft-ietf-tsvwg-auth specifies the specific * order for concatenation */ sctp_key_t * sctp_compute_hashkey(sctp_key_t *key1, sctp_key_t *key2, sctp_key_t *shared) { uint32_t keylen; sctp_key_t *new_key; uint8_t *key_ptr; keylen = sctp_get_keylen(key1) + sctp_get_keylen(key2) + sctp_get_keylen(shared); if (keylen > 0) { /* get space for the new key */ new_key = sctp_alloc_key(keylen); if (new_key == NULL) { /* out of memory */ return (NULL); } new_key->keylen = keylen; key_ptr = new_key->key; } else { /* all keys empty/null?! 
*/ return (NULL); } /* concatenate the keys */ if (sctp_compare_key(key1, key2) <= 0) { /* key is shared + key1 + key2 */ if (sctp_get_keylen(shared)) { memcpy(key_ptr, shared->key, shared->keylen); key_ptr += shared->keylen; } if (sctp_get_keylen(key1)) { memcpy(key_ptr, key1->key, key1->keylen); key_ptr += key1->keylen; } if (sctp_get_keylen(key2)) { memcpy(key_ptr, key2->key, key2->keylen); } } else { /* key is shared + key2 + key1 */ if (sctp_get_keylen(shared)) { memcpy(key_ptr, shared->key, shared->keylen); key_ptr += shared->keylen; } if (sctp_get_keylen(key2)) { memcpy(key_ptr, key2->key, key2->keylen); key_ptr += key2->keylen; } if (sctp_get_keylen(key1)) { memcpy(key_ptr, key1->key, key1->keylen); } } return (new_key); } sctp_sharedkey_t * sctp_alloc_sharedkey(void) { sctp_sharedkey_t *new_key; SCTP_MALLOC(new_key, sctp_sharedkey_t *, sizeof(*new_key), SCTP_M_AUTH_KY); if (new_key == NULL) { /* out of memory */ return (NULL); } new_key->keyid = 0; new_key->key = NULL; new_key->refcount = 1; new_key->deactivated = 0; return (new_key); } void sctp_free_sharedkey(sctp_sharedkey_t *skey) { if (skey == NULL) return; if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&skey->refcount)) { if (skey->key != NULL) sctp_free_key(skey->key); SCTP_FREE(skey, SCTP_M_AUTH_KY); } } sctp_sharedkey_t * sctp_find_sharedkey(struct sctp_keyhead *shared_keys, uint16_t key_id) { sctp_sharedkey_t *skey; LIST_FOREACH(skey, shared_keys, next) { if (skey->keyid == key_id) return (skey); } return (NULL); } int sctp_insert_sharedkey(struct sctp_keyhead *shared_keys, sctp_sharedkey_t *new_skey) { sctp_sharedkey_t *skey; if ((shared_keys == NULL) || (new_skey == NULL)) return (EINVAL); /* insert into an empty list? */ if (LIST_EMPTY(shared_keys)) { LIST_INSERT_HEAD(shared_keys, new_skey, next); return (0); } /* insert into the existing list, ordered by key id */ LIST_FOREACH(skey, shared_keys, next) { if (new_skey->keyid < skey->keyid) { /* insert it before here */ LIST_INSERT_BEFORE(skey, new_skey, next); return (0); } else if (new_skey->keyid == skey->keyid) { /* replace the existing key */ /* verify this key *can* be replaced */ - if ((skey->deactivated) && (skey->refcount > 1)) { + if ((skey->deactivated) || (skey->refcount > 1)) { SCTPDBG(SCTP_DEBUG_AUTH1, "can't replace shared key id %u\n", new_skey->keyid); return (EBUSY); } SCTPDBG(SCTP_DEBUG_AUTH1, "replacing shared key id %u\n", new_skey->keyid); LIST_INSERT_BEFORE(skey, new_skey, next); LIST_REMOVE(skey, next); sctp_free_sharedkey(skey); return (0); } if (LIST_NEXT(skey, next) == NULL) { /* belongs at the end of the list */ LIST_INSERT_AFTER(skey, new_skey, next); return (0); } } /* shouldn't reach here */ return (EINVAL); } void sctp_auth_key_acquire(struct sctp_tcb *stcb, uint16_t key_id) { sctp_sharedkey_t *skey; /* find the shared key */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, key_id); /* bump the ref count */ if (skey) { atomic_add_int(&skey->refcount, 1); SCTPDBG(SCTP_DEBUG_AUTH2, "%s: stcb %p key %u refcount acquire to %d\n", __func__, (void *)stcb, key_id, skey->refcount); } } void sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t key_id, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { sctp_sharedkey_t *skey; /* find the shared key */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, key_id); /* decrement the ref count */ if (skey) { SCTPDBG(SCTP_DEBUG_AUTH2, "%s: stcb %p key %u refcount release to %d\n", __func__, (void *)stcb, key_id, skey->refcount); /* see if a notification should be 
generated */ if ((skey->refcount <= 2) && (skey->deactivated)) { /* notify ULP that key is no longer used */ sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb, key_id, 0, so_locked); SCTPDBG(SCTP_DEBUG_AUTH2, "%s: stcb %p key %u no longer used, %d\n", __func__, (void *)stcb, key_id, skey->refcount); } sctp_free_sharedkey(skey); } } static sctp_sharedkey_t * sctp_copy_sharedkey(const sctp_sharedkey_t *skey) { sctp_sharedkey_t *new_skey; if (skey == NULL) return (NULL); new_skey = sctp_alloc_sharedkey(); if (new_skey == NULL) return (NULL); if (skey->key != NULL) new_skey->key = sctp_set_key(skey->key->key, skey->key->keylen); else new_skey->key = NULL; new_skey->keyid = skey->keyid; return (new_skey); } int sctp_copy_skeylist(const struct sctp_keyhead *src, struct sctp_keyhead *dest) { sctp_sharedkey_t *skey, *new_skey; int count = 0; if ((src == NULL) || (dest == NULL)) return (0); LIST_FOREACH(skey, src, next) { new_skey = sctp_copy_sharedkey(skey); if (new_skey != NULL) { if (sctp_insert_sharedkey(dest, new_skey)) { sctp_free_sharedkey(new_skey); } else { count++; } } } return (count); } sctp_hmaclist_t * sctp_alloc_hmaclist(uint16_t num_hmacs) { sctp_hmaclist_t *new_list; int alloc_size; alloc_size = sizeof(*new_list) + num_hmacs * sizeof(new_list->hmac[0]); SCTP_MALLOC(new_list, sctp_hmaclist_t *, alloc_size, SCTP_M_AUTH_HL); if (new_list == NULL) { /* out of memory */ return (NULL); } new_list->max_algo = num_hmacs; new_list->num_algo = 0; return (new_list); } void sctp_free_hmaclist(sctp_hmaclist_t *list) { if (list != NULL) { SCTP_FREE(list, SCTP_M_AUTH_HL); list = NULL; } } int sctp_auth_add_hmacid(sctp_hmaclist_t *list, uint16_t hmac_id) { int i; if (list == NULL) return (-1); if (list->num_algo == list->max_algo) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: HMAC id list full, ignoring add %u\n", hmac_id); return (-1); } if ((hmac_id != SCTP_AUTH_HMAC_ID_SHA1) && (hmac_id != SCTP_AUTH_HMAC_ID_SHA256)) { return (-1); } /* Now is it already in the list */ for (i = 0; i < list->num_algo; i++) { if (list->hmac[i] == hmac_id) { /* already in list */ return (-1); } } SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: add HMAC id %u to list\n", hmac_id); list->hmac[list->num_algo++] = hmac_id; return (0); } sctp_hmaclist_t * sctp_copy_hmaclist(sctp_hmaclist_t *list) { sctp_hmaclist_t *new_list; int i; if (list == NULL) return (NULL); /* get a new list */ new_list = sctp_alloc_hmaclist(list->max_algo); if (new_list == NULL) return (NULL); /* copy it */ new_list->max_algo = list->max_algo; new_list->num_algo = list->num_algo; for (i = 0; i < list->num_algo; i++) new_list->hmac[i] = list->hmac[i]; return (new_list); } sctp_hmaclist_t * sctp_default_supported_hmaclist(void) { sctp_hmaclist_t *new_list; new_list = sctp_alloc_hmaclist(2); if (new_list == NULL) return (NULL); /* We prefer SHA256, so list it first */ (void)sctp_auth_add_hmacid(new_list, SCTP_AUTH_HMAC_ID_SHA256); (void)sctp_auth_add_hmacid(new_list, SCTP_AUTH_HMAC_ID_SHA1); return (new_list); } /*- * HMAC algos are listed in priority/preference order * find the best HMAC id to use for the peer based on local support */ uint16_t sctp_negotiate_hmacid(sctp_hmaclist_t *peer, sctp_hmaclist_t *local) { int i, j; if ((local == NULL) || (peer == NULL)) return (SCTP_AUTH_HMAC_ID_RSVD); for (i = 0; i < peer->num_algo; i++) { for (j = 0; j < local->num_algo; j++) { if (peer->hmac[i] == local->hmac[j]) { /* found the "best" one */ SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: negotiated peer HMAC id %u\n", peer->hmac[i]); return (peer->hmac[i]); } } } /* didn't find one! 
*/ return (SCTP_AUTH_HMAC_ID_RSVD); } /*- * serialize the HMAC algo list and return space used * caller must guarantee ptr has appropriate space */ int sctp_serialize_hmaclist(sctp_hmaclist_t *list, uint8_t *ptr) { int i; uint16_t hmac_id; if (list == NULL) return (0); for (i = 0; i < list->num_algo; i++) { hmac_id = htons(list->hmac[i]); memcpy(ptr, &hmac_id, sizeof(hmac_id)); ptr += sizeof(hmac_id); } return (list->num_algo * sizeof(hmac_id)); } int sctp_verify_hmac_param(struct sctp_auth_hmac_algo *hmacs, uint32_t num_hmacs) { uint32_t i; for (i = 0; i < num_hmacs; i++) { if (ntohs(hmacs->hmac_ids[i]) == SCTP_AUTH_HMAC_ID_SHA1) { return (0); } } return (-1); } sctp_authinfo_t * sctp_alloc_authinfo(void) { sctp_authinfo_t *new_authinfo; SCTP_MALLOC(new_authinfo, sctp_authinfo_t *, sizeof(*new_authinfo), SCTP_M_AUTH_IF); if (new_authinfo == NULL) { /* out of memory */ return (NULL); } memset(new_authinfo, 0, sizeof(*new_authinfo)); return (new_authinfo); } void sctp_free_authinfo(sctp_authinfo_t *authinfo) { if (authinfo == NULL) return; if (authinfo->random != NULL) sctp_free_key(authinfo->random); if (authinfo->peer_random != NULL) sctp_free_key(authinfo->peer_random); if (authinfo->assoc_key != NULL) sctp_free_key(authinfo->assoc_key); if (authinfo->recv_key != NULL) sctp_free_key(authinfo->recv_key); /* We are NOT dynamically allocating authinfo's right now... */ /* SCTP_FREE(authinfo, SCTP_M_AUTH_??); */ } uint32_t sctp_get_auth_chunk_len(uint16_t hmac_algo) { int size; size = sizeof(struct sctp_auth_chunk) + sctp_get_hmac_digest_len(hmac_algo); return (SCTP_SIZE32(size)); } uint32_t sctp_get_hmac_digest_len(uint16_t hmac_algo) { switch (hmac_algo) { case SCTP_AUTH_HMAC_ID_SHA1: return (SCTP_AUTH_DIGEST_LEN_SHA1); case SCTP_AUTH_HMAC_ID_SHA256: return (SCTP_AUTH_DIGEST_LEN_SHA256); default: /* unknown HMAC algorithm: can't do anything */ return (0); } /* end switch */ } static inline int sctp_get_hmac_block_len(uint16_t hmac_algo) { switch (hmac_algo) { case SCTP_AUTH_HMAC_ID_SHA1: return (64); case SCTP_AUTH_HMAC_ID_SHA256: return (64); case SCTP_AUTH_HMAC_ID_RSVD: default: /* unknown HMAC algorithm: can't do anything */ return (0); } /* end switch */ } static void sctp_hmac_init(uint16_t hmac_algo, sctp_hash_context_t *ctx) { switch (hmac_algo) { case SCTP_AUTH_HMAC_ID_SHA1: SCTP_SHA1_INIT(&ctx->sha1); break; case SCTP_AUTH_HMAC_ID_SHA256: SCTP_SHA256_INIT(&ctx->sha256); break; case SCTP_AUTH_HMAC_ID_RSVD: default: /* unknown HMAC algorithm: can't do anything */ return; } /* end switch */ } static void sctp_hmac_update(uint16_t hmac_algo, sctp_hash_context_t *ctx, uint8_t *text, uint32_t textlen) { switch (hmac_algo) { case SCTP_AUTH_HMAC_ID_SHA1: SCTP_SHA1_UPDATE(&ctx->sha1, text, textlen); break; case SCTP_AUTH_HMAC_ID_SHA256: SCTP_SHA256_UPDATE(&ctx->sha256, text, textlen); break; case SCTP_AUTH_HMAC_ID_RSVD: default: /* unknown HMAC algorithm: can't do anything */ return; } /* end switch */ } static void sctp_hmac_final(uint16_t hmac_algo, sctp_hash_context_t *ctx, uint8_t *digest) { switch (hmac_algo) { case SCTP_AUTH_HMAC_ID_SHA1: SCTP_SHA1_FINAL(digest, &ctx->sha1); break; case SCTP_AUTH_HMAC_ID_SHA256: SCTP_SHA256_FINAL(digest, &ctx->sha256); break; case SCTP_AUTH_HMAC_ID_RSVD: default: /* unknown HMAC algorithm: can't do anything */ return; } /* end switch */ } /*- * Keyed-Hashing for Message Authentication: FIPS 198 (RFC 2104) * * Compute the HMAC digest using the desired hash key, text, and HMAC * algorithm. 
Resulting digest is placed in 'digest' and digest length * is returned, if the HMAC was performed. * * WARNING: it is up to the caller to supply sufficient space to hold the * resultant digest. */ uint32_t sctp_hmac(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, uint8_t *text, uint32_t textlen, uint8_t *digest) { uint32_t digestlen; uint32_t blocklen; sctp_hash_context_t ctx; uint8_t ipad[128], opad[128]; /* keyed hash inner/outer pads */ uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; uint32_t i; /* sanity check the material and length */ if ((key == NULL) || (keylen == 0) || (text == NULL) || (textlen == 0) || (digest == NULL)) { /* can't do HMAC with empty key or text or digest store */ return (0); } /* validate the hmac algo and get the digest length */ digestlen = sctp_get_hmac_digest_len(hmac_algo); if (digestlen == 0) return (0); /* hash the key if it is longer than the hash block size */ blocklen = sctp_get_hmac_block_len(hmac_algo); if (keylen > blocklen) { sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, key, keylen); sctp_hmac_final(hmac_algo, &ctx, temp); /* set the hashed key as the key */ keylen = digestlen; key = temp; } /* initialize the inner/outer pads with the key and "append" zeroes */ memset(ipad, 0, blocklen); memset(opad, 0, blocklen); memcpy(ipad, key, keylen); memcpy(opad, key, keylen); /* XOR the key with ipad and opad values */ for (i = 0; i < blocklen; i++) { ipad[i] ^= 0x36; opad[i] ^= 0x5c; } /* perform inner hash */ sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, ipad, blocklen); sctp_hmac_update(hmac_algo, &ctx, text, textlen); sctp_hmac_final(hmac_algo, &ctx, temp); /* perform outer hash */ sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, opad, blocklen); sctp_hmac_update(hmac_algo, &ctx, temp, digestlen); sctp_hmac_final(hmac_algo, &ctx, digest); return (digestlen); } /* mbuf version */ uint32_t sctp_hmac_m(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, struct mbuf *m, uint32_t m_offset, uint8_t *digest, uint32_t trailer) { uint32_t digestlen; uint32_t blocklen; sctp_hash_context_t ctx; uint8_t ipad[128], opad[128]; /* keyed hash inner/outer pads */ uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; uint32_t i; struct mbuf *m_tmp; /* sanity check the material and length */ if ((key == NULL) || (keylen == 0) || (m == NULL) || (digest == NULL)) { /* can't do HMAC with empty key or text or digest store */ return (0); } /* validate the hmac algo and get the digest length */ digestlen = sctp_get_hmac_digest_len(hmac_algo); if (digestlen == 0) return (0); /* hash the key if it is longer than the hash block size */ blocklen = sctp_get_hmac_block_len(hmac_algo); if (keylen > blocklen) { sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, key, keylen); sctp_hmac_final(hmac_algo, &ctx, temp); /* set the hashed key as the key */ keylen = digestlen; key = temp; } /* initialize the inner/outer pads with the key and "append" zeroes */ memset(ipad, 0, blocklen); memset(opad, 0, blocklen); memcpy(ipad, key, keylen); memcpy(opad, key, keylen); /* XOR the key with ipad and opad values */ for (i = 0; i < blocklen; i++) { ipad[i] ^= 0x36; opad[i] ^= 0x5c; } /* perform inner hash */ sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, ipad, blocklen); /* find the correct starting mbuf and offset (get start of text) */ m_tmp = m; while ((m_tmp != NULL) && (m_offset >= (uint32_t)SCTP_BUF_LEN(m_tmp))) { m_offset -= SCTP_BUF_LEN(m_tmp); m_tmp = SCTP_BUF_NEXT(m_tmp); } /* now use the rest of the mbuf chain for 
the text */ while (m_tmp != NULL) { if ((SCTP_BUF_NEXT(m_tmp) == NULL) && trailer) { sctp_hmac_update(hmac_algo, &ctx, mtod(m_tmp, uint8_t *)+m_offset, SCTP_BUF_LEN(m_tmp) - (trailer + m_offset)); } else { sctp_hmac_update(hmac_algo, &ctx, mtod(m_tmp, uint8_t *)+m_offset, SCTP_BUF_LEN(m_tmp) - m_offset); } /* clear the offset since it's only for the first mbuf */ m_offset = 0; m_tmp = SCTP_BUF_NEXT(m_tmp); } sctp_hmac_final(hmac_algo, &ctx, temp); /* perform outer hash */ sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, opad, blocklen); sctp_hmac_update(hmac_algo, &ctx, temp, digestlen); sctp_hmac_final(hmac_algo, &ctx, digest); return (digestlen); } /* * computes the requested HMAC using a key struct (which may be modified if * the keylen exceeds the HMAC block len). */ uint32_t sctp_compute_hmac(uint16_t hmac_algo, sctp_key_t *key, uint8_t *text, uint32_t textlen, uint8_t *digest) { uint32_t digestlen; uint32_t blocklen; sctp_hash_context_t ctx; uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; /* sanity check */ if ((key == NULL) || (text == NULL) || (textlen == 0) || (digest == NULL)) { /* can't do HMAC with empty key or text or digest store */ return (0); } /* validate the hmac algo and get the digest length */ digestlen = sctp_get_hmac_digest_len(hmac_algo); if (digestlen == 0) return (0); /* hash the key if it is longer than the hash block size */ blocklen = sctp_get_hmac_block_len(hmac_algo); if (key->keylen > blocklen) { sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, key->key, key->keylen); sctp_hmac_final(hmac_algo, &ctx, temp); /* save the hashed key as the new key */ key->keylen = digestlen; memcpy(key->key, temp, key->keylen); } return (sctp_hmac(hmac_algo, key->key, key->keylen, text, textlen, digest)); } /* mbuf version */ uint32_t sctp_compute_hmac_m(uint16_t hmac_algo, sctp_key_t *key, struct mbuf *m, uint32_t m_offset, uint8_t *digest) { uint32_t digestlen; uint32_t blocklen; sctp_hash_context_t ctx; uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; /* sanity check */ if ((key == NULL) || (m == NULL) || (digest == NULL)) { /* can't do HMAC with empty key or text or digest store */ return (0); } /* validate the hmac algo and get the digest length */ digestlen = sctp_get_hmac_digest_len(hmac_algo); if (digestlen == 0) return (0); /* hash the key if it is longer than the hash block size */ blocklen = sctp_get_hmac_block_len(hmac_algo); if (key->keylen > blocklen) { sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, key->key, key->keylen); sctp_hmac_final(hmac_algo, &ctx, temp); /* save the hashed key as the new key */ key->keylen = digestlen; memcpy(key->key, temp, key->keylen); } return (sctp_hmac_m(hmac_algo, key->key, key->keylen, m, m_offset, digest, 0)); } int sctp_auth_is_supported_hmac(sctp_hmaclist_t *list, uint16_t id) { int i; if ((list == NULL) || (id == SCTP_AUTH_HMAC_ID_RSVD)) return (0); for (i = 0; i < list->num_algo; i++) if (list->hmac[i] == id) return (1); /* not in the list */ return (0); } /*- * clear any cached key(s) if they match the given key id on an association. * the cached key(s) will be recomputed and re-cached at next use. 
* ASSUMES TCB_LOCK is already held */ void sctp_clear_cachedkeys(struct sctp_tcb *stcb, uint16_t keyid) { if (stcb == NULL) return; if (keyid == stcb->asoc.authinfo.assoc_keyid) { sctp_free_key(stcb->asoc.authinfo.assoc_key); stcb->asoc.authinfo.assoc_key = NULL; } if (keyid == stcb->asoc.authinfo.recv_keyid) { sctp_free_key(stcb->asoc.authinfo.recv_key); stcb->asoc.authinfo.recv_key = NULL; } } /*- * clear any cached key(s) if they match the given key id for all assocs on * an endpoint. * ASSUMES INP_WLOCK is already held */ void sctp_clear_cachedkeys_ep(struct sctp_inpcb *inp, uint16_t keyid) { struct sctp_tcb *stcb; if (inp == NULL) return; /* clear the cached keys on all assocs on this instance */ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); sctp_clear_cachedkeys(stcb, keyid); SCTP_TCB_UNLOCK(stcb); } } /*- * delete a shared key from an association * ASSUMES TCB_LOCK is already held */ int sctp_delete_sharedkey(struct sctp_tcb *stcb, uint16_t keyid) { sctp_sharedkey_t *skey; if (stcb == NULL) return (-1); /* is the keyid the assoc active sending key */ if (keyid == stcb->asoc.authinfo.active_keyid) return (-1); /* does the key exist? */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); if (skey == NULL) return (-1); /* are there other refcount holders on the key? */ if (skey->refcount > 1) return (-1); /* remove it */ LIST_REMOVE(skey, next); sctp_free_sharedkey(skey); /* frees skey->key as well */ /* clear any cached keys */ sctp_clear_cachedkeys(stcb, keyid); return (0); } /*- * deletes a shared key from the endpoint * ASSUMES INP_WLOCK is already held */ int sctp_delete_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid) { sctp_sharedkey_t *skey; if (inp == NULL) return (-1); /* is the keyid the active sending key on the endpoint */ if (keyid == inp->sctp_ep.default_keyid) return (-1); /* does the key exist? */ skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid); if (skey == NULL) return (-1); /* endpoint keys are not refcounted */ /* remove it */ LIST_REMOVE(skey, next); sctp_free_sharedkey(skey); /* frees skey->key as well */ /* clear any cached keys */ sctp_clear_cachedkeys_ep(inp, keyid); return (0); } /*- * set the active key on an association * ASSUMES TCB_LOCK is already held */ int sctp_auth_setactivekey(struct sctp_tcb *stcb, uint16_t keyid) { sctp_sharedkey_t *skey = NULL; /* find the key on the assoc */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); if (skey == NULL) { /* that key doesn't exist */ return (-1); } if ((skey->deactivated) && (skey->refcount > 1)) { /* can't reactivate a deactivated key with other refcounts */ return (-1); } /* set the (new) active key */ stcb->asoc.authinfo.active_keyid = keyid; /* reset the deactivated flag */ skey->deactivated = 0; return (0); } /*- * set the active key on an endpoint * ASSUMES INP_WLOCK is already held */ int sctp_auth_setactivekey_ep(struct sctp_inpcb *inp, uint16_t keyid) { sctp_sharedkey_t *skey; /* find the key */ skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid); if (skey == NULL) { /* that key doesn't exist */ return (-1); } inp->sctp_ep.default_keyid = keyid; return (0); } /*- * deactivates a shared key from the association * ASSUMES INP_WLOCK is already held */ int sctp_deact_sharedkey(struct sctp_tcb *stcb, uint16_t keyid) { sctp_sharedkey_t *skey; if (stcb == NULL) return (-1); /* is the keyid the assoc active sending key */ if (keyid == stcb->asoc.authinfo.active_keyid) return (-1); /* does the key exist? 
*/ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); if (skey == NULL) return (-1); /* are there other refcount holders on the key? */ if (skey->refcount == 1) { /* no other users, send a notification for this key */ sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb, keyid, 0, SCTP_SO_LOCKED); } /* mark the key as deactivated */ skey->deactivated = 1; return (0); } /*- * deactivates a shared key from the endpoint * ASSUMES INP_WLOCK is already held */ int sctp_deact_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid) { sctp_sharedkey_t *skey; if (inp == NULL) return (-1); /* is the keyid the active sending key on the endpoint */ if (keyid == inp->sctp_ep.default_keyid) return (-1); /* does the key exist? */ skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid); if (skey == NULL) return (-1); /* endpoint keys are not refcounted */ /* remove it */ LIST_REMOVE(skey, next); sctp_free_sharedkey(skey); /* frees skey->key as well */ return (0); } /* * get local authentication parameters from cookie (from INIT-ACK) */ void sctp_auth_get_cookie_params(struct sctp_tcb *stcb, struct mbuf *m, uint32_t offset, uint32_t length) { struct sctp_paramhdr *phdr, tmp_param; uint16_t plen, ptype; uint8_t random_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_random *p_random = NULL; uint16_t random_len = 0; uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_hmac_algo *hmacs = NULL; uint16_t hmacs_len = 0; uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_chunk_list *chunks = NULL; uint16_t num_chunks = 0; sctp_key_t *new_key; uint32_t keylen; /* convert to upper bound */ length += offset; phdr = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&tmp_param); while (phdr != NULL) { ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); if ((plen == 0) || (offset + plen > length)) break; if (ptype == SCTP_RANDOM) { if (plen > sizeof(random_store)) break; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)random_store, plen); if (phdr == NULL) return; /* save the random and length for the key */ p_random = (struct sctp_auth_random *)phdr; random_len = plen - sizeof(*p_random); } else if (ptype == SCTP_HMAC_LIST) { uint16_t num_hmacs; uint16_t i; if (plen > sizeof(hmacs_store)) break; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)hmacs_store, plen); if (phdr == NULL) return; /* save the hmacs list and num for the key */ hmacs = (struct sctp_auth_hmac_algo *)phdr; hmacs_len = plen - sizeof(*hmacs); num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]); if (stcb->asoc.local_hmacs != NULL) sctp_free_hmaclist(stcb->asoc.local_hmacs); stcb->asoc.local_hmacs = sctp_alloc_hmaclist(num_hmacs); if (stcb->asoc.local_hmacs != NULL) { for (i = 0; i < num_hmacs; i++) { (void)sctp_auth_add_hmacid(stcb->asoc.local_hmacs, ntohs(hmacs->hmac_ids[i])); } } } else if (ptype == SCTP_CHUNK_LIST) { int i; if (plen > sizeof(chunks_store)) break; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)chunks_store, plen); if (phdr == NULL) return; chunks = (struct sctp_auth_chunk_list *)phdr; num_chunks = plen - sizeof(*chunks); /* save chunks list and num for the key */ if (stcb->asoc.local_auth_chunks != NULL) sctp_clear_chunklist(stcb->asoc.local_auth_chunks); else stcb->asoc.local_auth_chunks = sctp_alloc_chunklist(); for (i = 0; i < num_chunks; i++) { (void)sctp_auth_add_chunk(chunks->chunk_types[i], stcb->asoc.local_auth_chunks); } } /* get next parameter */ offset += SCTP_SIZE32(plen); if (offset + sizeof(struct 
sctp_paramhdr) > length) break; phdr = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&tmp_param); } /* concatenate the full random key */ keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len; if (chunks != NULL) { keylen += sizeof(*chunks) + num_chunks; } new_key = sctp_alloc_key(keylen); if (new_key != NULL) { /* copy in the RANDOM */ if (p_random != NULL) { keylen = sizeof(*p_random) + random_len; memcpy(new_key->key, p_random, keylen); } else { keylen = 0; } /* append in the AUTH chunks */ if (chunks != NULL) { memcpy(new_key->key + keylen, chunks, sizeof(*chunks) + num_chunks); keylen += sizeof(*chunks) + num_chunks; } /* append in the HMACs */ if (hmacs != NULL) { memcpy(new_key->key + keylen, hmacs, sizeof(*hmacs) + hmacs_len); } } if (stcb->asoc.authinfo.random != NULL) sctp_free_key(stcb->asoc.authinfo.random); stcb->asoc.authinfo.random = new_key; stcb->asoc.authinfo.random_len = random_len; sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid); sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid); /* negotiate what HMAC to use for the peer */ stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs, stcb->asoc.local_hmacs); /* copy defaults from the endpoint */ /* FIX ME: put in cookie? */ stcb->asoc.authinfo.active_keyid = stcb->sctp_ep->sctp_ep.default_keyid; /* copy out the shared key list (by reference) from the endpoint */ (void)sctp_copy_skeylist(&stcb->sctp_ep->sctp_ep.shared_keys, &stcb->asoc.shared_keys); } /* * compute and fill in the HMAC digest for a packet */ void sctp_fill_hmac_digest_m(struct mbuf *m, uint32_t auth_offset, struct sctp_auth_chunk *auth, struct sctp_tcb *stcb, uint16_t keyid) { uint32_t digestlen; sctp_sharedkey_t *skey; sctp_key_t *key; if ((stcb == NULL) || (auth == NULL)) return; /* zero the digest + chunk padding */ digestlen = sctp_get_hmac_digest_len(stcb->asoc.peer_hmac_id); memset(auth->hmac, 0, SCTP_SIZE32(digestlen)); /* is the desired key cached? 
*/ if ((keyid != stcb->asoc.authinfo.assoc_keyid) || (stcb->asoc.authinfo.assoc_key == NULL)) { if (stcb->asoc.authinfo.assoc_key != NULL) { /* free the old cached key */ sctp_free_key(stcb->asoc.authinfo.assoc_key); } skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); /* the only way skey is NULL is if null key id 0 is used */ if (skey != NULL) key = skey->key; else key = NULL; /* compute a new assoc key and cache it */ stcb->asoc.authinfo.assoc_key = sctp_compute_hashkey(stcb->asoc.authinfo.random, stcb->asoc.authinfo.peer_random, key); stcb->asoc.authinfo.assoc_keyid = keyid; SCTPDBG(SCTP_DEBUG_AUTH1, "caching key id %u\n", stcb->asoc.authinfo.assoc_keyid); #ifdef SCTP_DEBUG if (SCTP_AUTH_DEBUG) sctp_print_key(stcb->asoc.authinfo.assoc_key, "Assoc Key"); #endif } /* set in the active key id */ auth->shared_key_id = htons(keyid); /* compute and fill in the digest */ (void)sctp_compute_hmac_m(stcb->asoc.peer_hmac_id, stcb->asoc.authinfo.assoc_key, m, auth_offset, auth->hmac); } static void sctp_zero_m(struct mbuf *m, uint32_t m_offset, uint32_t size) { struct mbuf *m_tmp; uint8_t *data; /* sanity check */ if (m == NULL) return; /* find the correct starting mbuf and offset (get start position) */ m_tmp = m; while ((m_tmp != NULL) && (m_offset >= (uint32_t)SCTP_BUF_LEN(m_tmp))) { m_offset -= SCTP_BUF_LEN(m_tmp); m_tmp = SCTP_BUF_NEXT(m_tmp); } /* now use the rest of the mbuf chain */ while ((m_tmp != NULL) && (size > 0)) { data = mtod(m_tmp, uint8_t *)+m_offset; if (size > (uint32_t)(SCTP_BUF_LEN(m_tmp) - m_offset)) { memset(data, 0, SCTP_BUF_LEN(m_tmp) - m_offset); size -= SCTP_BUF_LEN(m_tmp) - m_offset; } else { memset(data, 0, size); size = 0; } /* clear the offset since it's only for the first mbuf */ m_offset = 0; m_tmp = SCTP_BUF_NEXT(m_tmp); } } /*- * process the incoming Authentication chunk * return codes: * -1 on any authentication error * 0 on authentication verification */ int sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *auth, struct mbuf *m, uint32_t offset) { uint16_t chunklen; uint16_t shared_key_id; uint16_t hmac_id; sctp_sharedkey_t *skey; uint32_t digestlen; uint8_t digest[SCTP_AUTH_DIGEST_LEN_MAX]; uint8_t computed_digest[SCTP_AUTH_DIGEST_LEN_MAX]; /* auth is checked for NULL by caller */ chunklen = ntohs(auth->ch.chunk_length); if (chunklen < sizeof(*auth)) { SCTP_STAT_INCR(sctps_recvauthfailed); return (-1); } SCTP_STAT_INCR(sctps_recvauth); /* get the auth params */ shared_key_id = ntohs(auth->shared_key_id); hmac_id = ntohs(auth->hmac_id); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP AUTH Chunk: shared key %u, HMAC id %u\n", shared_key_id, hmac_id); /* is the indicated HMAC supported? 
*/ if (!sctp_auth_is_supported_hmac(stcb->asoc.local_hmacs, hmac_id)) { struct mbuf *op_err; struct sctp_error_auth_invalid_hmac *cause; SCTP_STAT_INCR(sctps_recvivalhmacid); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP Auth: unsupported HMAC id %u\n", hmac_id); /* * report this in an Error Chunk: Unsupported HMAC * Identifier */ op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_auth_invalid_hmac), 0, M_NOWAIT, 1, MT_HEADER); if (op_err != NULL) { /* pre-reserve some space */ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); /* fill in the error */ cause = mtod(op_err, struct sctp_error_auth_invalid_hmac *); cause->cause.code = htons(SCTP_CAUSE_UNSUPPORTED_HMACID); cause->cause.length = htons(sizeof(struct sctp_error_auth_invalid_hmac)); cause->hmac_id = ntohs(hmac_id); SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_auth_invalid_hmac); /* queue it */ sctp_queue_op_err(stcb, op_err); } return (-1); } /* get the indicated shared key, if available */ if ((stcb->asoc.authinfo.recv_key == NULL) || (stcb->asoc.authinfo.recv_keyid != shared_key_id)) { /* find the shared key on the assoc first */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, shared_key_id); /* if the shared key isn't found, discard the chunk */ if (skey == NULL) { SCTP_STAT_INCR(sctps_recvivalkeyid); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP Auth: unknown key id %u\n", shared_key_id); return (-1); } /* generate a notification if this is a new key id */ if (stcb->asoc.authinfo.recv_keyid != shared_key_id) /* * sctp_ulp_notify(SCTP_NOTIFY_AUTH_NEW_KEY, stcb, * shared_key_id, (void * *)stcb->asoc.authinfo.recv_keyid); */ sctp_notify_authentication(stcb, SCTP_AUTH_NEW_KEY, shared_key_id, stcb->asoc.authinfo.recv_keyid, SCTP_SO_NOT_LOCKED); /* compute a new recv assoc key and cache it */ if (stcb->asoc.authinfo.recv_key != NULL) sctp_free_key(stcb->asoc.authinfo.recv_key); stcb->asoc.authinfo.recv_key = sctp_compute_hashkey(stcb->asoc.authinfo.random, stcb->asoc.authinfo.peer_random, skey->key); stcb->asoc.authinfo.recv_keyid = shared_key_id; #ifdef SCTP_DEBUG if (SCTP_AUTH_DEBUG) sctp_print_key(stcb->asoc.authinfo.recv_key, "Recv Key"); #endif } /* validate the digest length */ digestlen = sctp_get_hmac_digest_len(hmac_id); if (chunklen < (sizeof(*auth) + digestlen)) { /* invalid digest length */ SCTP_STAT_INCR(sctps_recvauthfailed); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP Auth: chunk too short for HMAC\n"); return (-1); } /* save a copy of the digest, zero the pseudo header, and validate */ memcpy(digest, auth->hmac, digestlen); sctp_zero_m(m, offset + sizeof(*auth), SCTP_SIZE32(digestlen)); (void)sctp_compute_hmac_m(hmac_id, stcb->asoc.authinfo.recv_key, m, offset, computed_digest); /* compare the computed digest with the one in the AUTH chunk */ if (timingsafe_bcmp(digest, computed_digest, digestlen) != 0) { SCTP_STAT_INCR(sctps_recvauthfailed); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP Auth: HMAC digest check failed\n"); return (-1); } return (0); } /* * Generate NOTIFICATION */ void sctp_notify_authentication(struct sctp_tcb *stcb, uint32_t indication, uint16_t keyid, uint16_t alt_keyid, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { struct mbuf *m_notify; struct sctp_authkey_event *auth; struct sctp_queued_to_read *control; if ((stcb == NULL) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) ) { /* If the socket is gone we are out of here */ return; } if 
(sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_AUTHEVNT)) /* event not enabled */ return; m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_authkey_event), 0, M_NOWAIT, 1, MT_HEADER); if (m_notify == NULL) /* no space left */ return; SCTP_BUF_LEN(m_notify) = 0; auth = mtod(m_notify, struct sctp_authkey_event *); memset(auth, 0, sizeof(struct sctp_authkey_event)); auth->auth_type = SCTP_AUTHENTICATION_EVENT; auth->auth_flags = 0; auth->auth_length = sizeof(*auth); auth->auth_keynumber = keyid; auth->auth_altkeynumber = alt_keyid; auth->auth_indication = indication; auth->auth_assoc_id = sctp_get_associd(stcb); SCTP_BUF_LEN(m_notify) = sizeof(*auth); SCTP_BUF_NEXT(m_notify) = NULL; /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked); } /*- * validates the AUTHentication related parameters in an INIT/INIT-ACK * Note: currently only used for INIT as INIT-ACK is handled inline * with sctp_load_addresses_from_init() */ int sctp_validate_init_auth_params(struct mbuf *m, int offset, int limit) { struct sctp_paramhdr *phdr, param_buf; uint16_t ptype, plen; int peer_supports_asconf = 0; int peer_supports_auth = 0; int got_random = 0, got_hmacs = 0, got_chklist = 0; uint8_t saw_asconf = 0; uint8_t saw_asconf_ack = 0; /* go through each of the params. */ phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); while (phdr) { ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); if (offset + plen > limit) { break; } if (plen < sizeof(struct sctp_paramhdr)) { break; } if (ptype == SCTP_SUPPORTED_CHUNK_EXT) { /* A supported extension chunk */ struct sctp_supported_chunk_types_param *pr_supported; uint8_t local_store[SCTP_SMALL_CHUNK_STORE]; int num_ent, i; if (plen > sizeof(local_store)) { break; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&local_store, plen); if (phdr == NULL) { return (-1); } pr_supported = (struct sctp_supported_chunk_types_param *)phdr; num_ent = plen - sizeof(struct sctp_paramhdr); for (i = 0; i < num_ent; i++) { switch (pr_supported->chunk_types[i]) { case SCTP_ASCONF: case SCTP_ASCONF_ACK: peer_supports_asconf = 1; break; default: /* one we don't care about */ break; } } } else if (ptype == SCTP_RANDOM) { /* enforce the random length */ if (plen != (sizeof(struct sctp_auth_random) + SCTP_AUTH_RANDOM_SIZE_REQUIRED)) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid RANDOM len\n"); return (-1); } got_random = 1; } else if (ptype == SCTP_HMAC_LIST) { struct sctp_auth_hmac_algo *hmacs; uint8_t store[SCTP_PARAM_BUFFER_SIZE]; int num_hmacs; if (plen > sizeof(store)) { break; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)store, plen); if (phdr == NULL) { return (-1); } hmacs = (struct sctp_auth_hmac_algo *)phdr; num_hmacs = (plen - sizeof(*hmacs)) / sizeof(hmacs->hmac_ids[0]); /* validate the hmac list */ if (sctp_verify_hmac_param(hmacs, num_hmacs)) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid HMAC param\n"); return (-1); } got_hmacs = 1; } else if (ptype == SCTP_CHUNK_LIST) { struct sctp_auth_chunk_list *chunks; uint8_t chunks_store[SCTP_SMALL_CHUNK_STORE]; int i, num_chunks; if (plen > sizeof(chunks_store)) { 
break; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)chunks_store, plen); if (phdr == NULL) { return (-1); } /*- * Flip through the list and mark that the * peer supports asconf/asconf_ack. */ chunks = (struct sctp_auth_chunk_list *)phdr; num_chunks = plen - sizeof(*chunks); for (i = 0; i < num_chunks; i++) { /* record asconf/asconf-ack if listed */ if (chunks->chunk_types[i] == SCTP_ASCONF) saw_asconf = 1; if (chunks->chunk_types[i] == SCTP_ASCONF_ACK) saw_asconf_ack = 1; } if (num_chunks) got_chklist = 1; } offset += SCTP_SIZE32(plen); if (offset >= limit) { break; } phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); } /* validate authentication required parameters */ if (got_random && got_hmacs) { peer_supports_auth = 1; } else { peer_supports_auth = 0; } if (!peer_supports_auth && got_chklist) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: peer sent chunk list w/o AUTH\n"); return (-1); } if (peer_supports_asconf && !peer_supports_auth) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: peer supports ASCONF but not AUTH\n"); return (-1); } else if ((peer_supports_asconf) && (peer_supports_auth) && ((saw_asconf == 0) || (saw_asconf_ack == 0))) { return (-2); } return (0); } void sctp_initialize_auth_params(struct sctp_inpcb *inp, struct sctp_tcb *stcb) { uint16_t chunks_len = 0; uint16_t hmacs_len = 0; uint16_t random_len = SCTP_AUTH_RANDOM_SIZE_DEFAULT; sctp_key_t *new_key; uint16_t keylen; /* initialize hmac list from endpoint */ stcb->asoc.local_hmacs = sctp_copy_hmaclist(inp->sctp_ep.local_hmacs); if (stcb->asoc.local_hmacs != NULL) { hmacs_len = stcb->asoc.local_hmacs->num_algo * sizeof(stcb->asoc.local_hmacs->hmac[0]); } /* initialize auth chunks list from endpoint */ stcb->asoc.local_auth_chunks = sctp_copy_chunklist(inp->sctp_ep.local_auth_chunks); if (stcb->asoc.local_auth_chunks != NULL) { int i; for (i = 0; i < 256; i++) { if (stcb->asoc.local_auth_chunks->chunks[i]) chunks_len++; } } /* copy defaults from the endpoint */ stcb->asoc.authinfo.active_keyid = inp->sctp_ep.default_keyid; /* copy out the shared key list (by reference) from the endpoint */ (void)sctp_copy_skeylist(&inp->sctp_ep.shared_keys, &stcb->asoc.shared_keys); /* now set the concatenated key (random + chunks + hmacs) */ /* key includes parameter headers */ keylen = (3 * sizeof(struct sctp_paramhdr)) + random_len + chunks_len + hmacs_len; new_key = sctp_alloc_key(keylen); if (new_key != NULL) { struct sctp_paramhdr *ph; int plen; /* generate and copy in the RANDOM */ ph = (struct sctp_paramhdr *)new_key->key; ph->param_type = htons(SCTP_RANDOM); plen = sizeof(*ph) + random_len; ph->param_length = htons(plen); SCTP_READ_RANDOM(new_key->key + sizeof(*ph), random_len); keylen = plen; /* append in the AUTH chunks */ /* NOTE: currently we always have chunks to list */ ph = (struct sctp_paramhdr *)(new_key->key + keylen); ph->param_type = htons(SCTP_CHUNK_LIST); plen = sizeof(*ph) + chunks_len; ph->param_length = htons(plen); keylen += sizeof(*ph); if (stcb->asoc.local_auth_chunks) { int i; for (i = 0; i < 256; i++) { if (stcb->asoc.local_auth_chunks->chunks[i]) new_key->key[keylen++] = i; } } /* append in the HMACs */ ph = (struct sctp_paramhdr *)(new_key->key + keylen); ph->param_type = htons(SCTP_HMAC_LIST); plen = sizeof(*ph) + hmacs_len; ph->param_length = htons(plen); keylen += sizeof(*ph); (void)sctp_serialize_hmaclist(stcb->asoc.local_hmacs, new_key->key + keylen); } if (stcb->asoc.authinfo.random != NULL) sctp_free_key(stcb->asoc.authinfo.random); stcb->asoc.authinfo.random = new_key; 
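/* Descriptive note: the key just built concatenates the RANDOM, CHUNK_LIST, and HMAC_LIST parameters (each with its parameter header); it is cached in authinfo.random as the local key material used when association keys are computed. */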
stcb->asoc.authinfo.random_len = random_len; } Index: projects/clang900-import/sys/netinet/tcp_stacks/rack.c =================================================================== --- projects/clang900-import/sys/netinet/tcp_stacks/rack.c (revision 352536) +++ projects/clang900-import/sys/netinet/tcp_stacks/rack.c (revision 352537) @@ -1,9246 +1,9261 @@ /*- * Copyright (c) 2016-2019 * Netflix Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #include /* for proc0 declaration */ #ifdef NETFLIX_STATS #include #endif #include #include #include #include #include #ifdef NETFLIX_STATS #include /* Must come after qmath.h and tree.h */ #endif #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #define TCPOUTFLAGS #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef INET6 #include #endif #include #if defined(IPSEC) || defined(IPSEC_SUPPORT) #include #include #endif /* IPSEC */ #include #include #include #ifdef MAC #include #endif #include "sack_filter.h" #include "tcp_rack.h" #include "rack_bbr_common.h" uma_zone_t rack_zone; uma_zone_t rack_pcb_zone; #ifndef TICKS2SBT #define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t))) #endif struct sysctl_ctx_list rack_sysctl_ctx; struct sysctl_oid *rack_sysctl_root; #define CUM_ACKED 1 #define SACKED 2 /* * The RACK module incorporates a number of * TCP ideas that have been put out into the IETF * over the last few years: * - Matt Mathis's Rate Halving which slowly drops * the congestion window so that the ack clock can * be maintained during a recovery. * - Yuchung Cheng's RACK TCP (for which its named) that * will stop us using the number of dup acks and instead * use time as the gage of when we retransmit. * - Reorder Detection of RFC4737 and the Tail-Loss probe draft * of Dukkipati et.al. 
* RACK depends on SACK, so if an endpoint arrives that * cannot do SACK the state machine below will shuttle the * connection back to using the "default" TCP stack that is * in FreeBSD. * * To implement RACK the original TCP stack was first decomposed * into a functional state machine with individual states * for each of the possible TCP connection states. The do_segment * function's role in life is to mandate that the connection supports SACK * initially and then assure that the RACK state matches the connection * state before calling the state's do_segment function. Each * state is simplified due to the fact that the original do_segment * has been decomposed and we *know* what state we are in (no * switches on the state) and all tests for SACK are gone. This * greatly simplifies what each state does. * * TCP output is also overwritten with a new version since it * must maintain the new rack scoreboard. * */ static int32_t rack_precache = 1; static int32_t rack_tlp_thresh = 1; static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 * - 60 seconds */ static int32_t rack_pkt_delay = 1; static int32_t rack_inc_var = 0;/* For TLP */ static int32_t rack_reduce_largest_on_idle = 0; static int32_t rack_min_pace_time = 0; static int32_t rack_min_pace_time_seg_req=6; static int32_t rack_early_recovery = 1; static int32_t rack_early_recovery_max_seg = 6; static int32_t rack_send_a_lot_in_prr = 1; static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */ static int32_t rack_verbose_logging = 0; static int32_t rack_ignore_data_after_close = 1; static int32_t rack_map_entries_limit = 1024; static int32_t rack_map_split_limit = 256; /* * Currently regular TCP has an rto_min of 30ms; * the backoff goes 12 times, so that ends up * being a total of 122.850 seconds before a * connection is killed.
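* RACK bounds its own retransmit timer separately, using the rack_rto_min and rack_rto_max values defined just below.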
*/ static int32_t rack_tlp_min = 10; static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ static int32_t rack_rto_max = 30000; /* 30 seconds */ static const int32_t rack_free_cache = 2; static int32_t rack_hptsi_segments = 40; static int32_t rack_rate_sample_method = USE_RTT_LOW; static int32_t rack_pace_every_seg = 1; static int32_t rack_delayed_ack_time = 200; /* 200ms */ static int32_t rack_slot_reduction = 4; static int32_t rack_lower_cwnd_at_tlp = 0; static int32_t rack_use_proportional_reduce = 0; static int32_t rack_proportional_rate = 10; static int32_t rack_tlp_max_resend = 2; static int32_t rack_limited_retran = 0; static int32_t rack_always_send_oldest = 0; static int32_t rack_sack_block_limit = 128; static int32_t rack_use_sack_filter = 1; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; /* Rack specific counters */ counter_u64_t rack_badfr; counter_u64_t rack_badfr_bytes; counter_u64_t rack_rtm_prr_retran; counter_u64_t rack_rtm_prr_newdata; counter_u64_t rack_timestamp_mismatch; counter_u64_t rack_reorder_seen; counter_u64_t rack_paced_segments; counter_u64_t rack_unpaced_segments; counter_u64_t rack_saw_enobuf; counter_u64_t rack_saw_enetunreach; /* Tail loss probe counters */ counter_u64_t rack_tlp_tot; counter_u64_t rack_tlp_newdata; counter_u64_t rack_tlp_retran; counter_u64_t rack_tlp_retran_bytes; counter_u64_t rack_tlp_retran_fail; counter_u64_t rack_to_tot; counter_u64_t rack_to_arm_rack; counter_u64_t rack_to_arm_tlp; counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; counter_u64_t rack_to_alloc_limited; counter_u64_t rack_alloc_limited_conns; counter_u64_t rack_split_limited; counter_u64_t rack_sack_proc_all; counter_u64_t rack_sack_proc_short; counter_u64_t rack_sack_proc_restart; counter_u64_t rack_runt_sacks; counter_u64_t rack_used_tlpmethod; counter_u64_t rack_used_tlpmethod2; counter_u64_t rack_enter_tlp_calc; counter_u64_t rack_input_idle_reduces; counter_u64_t rack_tlp_does_nada; /* Temp CPU counters */ counter_u64_t rack_find_high; counter_u64_t rack_progress_drops; counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; /* * This was originally defined in tcp_timer.c, but is now reproduced here given * the unification of the SYN and non-SYN retransmit timer exponents combined * with wanting to retain previous behaviour for previously deployed stack * versions. 
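* The table below stays at 1 for its first five entries and then doubles, capping at 64.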
*/ int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; static void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); static int rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); static int rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type); static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused); static void rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); static void rack_counter_destroy(void); static int rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); static void rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos); static void rack_dtor(void *mem, int32_t size, void *arg); static void rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, uint32_t t, uint32_t cts); static struct rack_sendmap * rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm); static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); static int rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); static int32_t rack_handoff_ok(struct tcpcb *tp); static int32_t rack_init(struct tcpcb *tp); static void rack_init_sysctls(void); static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th); static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, uint8_t pass, struct rack_sendmap *hintrsm); static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm); static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num); static int32_t rack_output(struct tcpcb *tp); static void rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, struct timeval *tv); static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts); static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); static void rack_remxt_tmr(struct tcpcb *tp); static int rack_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); static int32_t rack_stopall(struct tcpcb *tp); static void rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta); static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); static void 
rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp); static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts); static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type); static int32_t tcp_addrack(module_t mod, int32_t type, void *data); static void rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val); static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static void rack_do_drop(struct mbuf *m, struct tcpcb *tp); static void rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); static void rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen); static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt); static int rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static int rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val); static int rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp); struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused); static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt); static void tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); static int rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val); int32_t rack_clear_counter=0; static int 
sysctl_rack_clear(SYSCTL_HANDLER_ARGS) { uint32_t stat; int32_t error; error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); if (error || req->newptr == NULL) return error; error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); if (error) return (error); if (stat == 1) { #ifdef INVARIANTS printf("Clearing RACK counters\n"); #endif counter_u64_zero(rack_badfr); counter_u64_zero(rack_badfr_bytes); counter_u64_zero(rack_rtm_prr_retran); counter_u64_zero(rack_rtm_prr_newdata); counter_u64_zero(rack_timestamp_mismatch); counter_u64_zero(rack_reorder_seen); counter_u64_zero(rack_tlp_tot); counter_u64_zero(rack_tlp_newdata); counter_u64_zero(rack_tlp_retran); counter_u64_zero(rack_tlp_retran_bytes); counter_u64_zero(rack_tlp_retran_fail); counter_u64_zero(rack_to_tot); counter_u64_zero(rack_to_arm_rack); counter_u64_zero(rack_to_arm_tlp); counter_u64_zero(rack_paced_segments); counter_u64_zero(rack_unpaced_segments); counter_u64_zero(rack_saw_enobuf); counter_u64_zero(rack_saw_enetunreach); counter_u64_zero(rack_to_alloc_hard); counter_u64_zero(rack_to_alloc_emerg); counter_u64_zero(rack_sack_proc_all); counter_u64_zero(rack_sack_proc_short); counter_u64_zero(rack_sack_proc_restart); counter_u64_zero(rack_to_alloc); counter_u64_zero(rack_to_alloc_limited); counter_u64_zero(rack_alloc_limited_conns); counter_u64_zero(rack_split_limited); counter_u64_zero(rack_find_high); counter_u64_zero(rack_runt_sacks); counter_u64_zero(rack_used_tlpmethod); counter_u64_zero(rack_used_tlpmethod2); counter_u64_zero(rack_enter_tlp_calc); counter_u64_zero(rack_progress_drops); counter_u64_zero(rack_tlp_does_nada); } rack_clear_counter = 0; return (0); } static void rack_init_sysctls() { SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "map_limit", CTLFLAG_RW, &rack_map_entries_limit , 1024, "Is there a limit on how big the sendmap can grow? 
"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "map_splitlimit", CTLFLAG_RW, &rack_map_split_limit , 256, "Is there a limit on how much splitting a peer can do?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rate_sample_method", CTLFLAG_RW, &rack_rate_sample_method , USE_RTT_LOW, "What method should we use for rate sampling 0=high, 1=low "); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "data_after_close", CTLFLAG_RW, &rack_ignore_data_after_close, 0, "Do we hold off sending a RST until all pending data is ack'd"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlpmethod", CTLFLAG_RW, &rack_tlp_threshold_use, TLP_USE_TWO_ONE, "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "min_pace_time", CTLFLAG_RW, &rack_min_pace_time, 0, "Should we enforce a minimum pace time of 1ms"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "min_pace_segs", CTLFLAG_RW, &rack_min_pace_time_seg_req, 6, "How many segments have to be in the len to enforce min-pace-time"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "idle_reduce_high", CTLFLAG_RW, &rack_reduce_largest_on_idle, 0, "Should we reduce the largest cwnd seen to IW on idle reduction"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "bb_verbose", CTLFLAG_RW, &rack_verbose_logging, 0, "Should RACK black box logging be verbose"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sackfiltering", CTLFLAG_RW, &rack_use_sack_filter, 1, "Do we use sack filtering?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "delayed_ack", CTLFLAG_RW, &rack_delayed_ack_time, 200, "Delayed ack time (200ms)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlpminto", CTLFLAG_RW, &rack_tlp_min, 10, "TLP minimum timeout per the specification (10ms)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "precache", CTLFLAG_RW, &rack_precache, 0, "Where should we precache the mcopy (0 is not at all)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sblklimit", CTLFLAG_RW, &rack_sack_block_limit, 128, "When do we start paying attention to small sack blocks"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "send_oldest", CTLFLAG_RW, &rack_always_send_oldest, 1, "Should we always send the oldest TLP and RACK-TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW, &rack_tlp_in_recovery, 1, "Can we do a TLP during recovery?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_tlimit", CTLFLAG_RW, &rack_limited_retran, 0, "How many times can a rack timeout drive out sends"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "minrto", CTLFLAG_RW, &rack_rto_min, 0, "Minimum RTO in ms -- set with caution below 1000 due to TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "maxrto", CTLFLAG_RW, &rack_rto_max, 0, "Maxiumum RTO in ms -- should be at least as large as min_rto"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retry", CTLFLAG_RW, &rack_tlp_max_resend, 2, "How many times does 
TLP retry a single segment or multiple with no ACK"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, &rack_use_proportional_reduce, 0, "Should we proportionaly reduce cwnd based on the number of losses "); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "recovery_prop", CTLFLAG_RW, &rack_proportional_rate, 10, "What percent reduction per loss"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, &rack_lower_cwnd_at_tlp, 0, "When a TLP completes a retran should we enter recovery?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_reduces", CTLFLAG_RW, &rack_slot_reduction, 4, "When setting a slot should we reduce by divisor"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_every_seg", CTLFLAG_RW, &rack_pace_every_seg, 1, "Should we pace out every segment hptsi"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_seg_max", CTLFLAG_RW, &rack_hptsi_segments, 6, "Should we pace out only a limited size of segments"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prr_sendalot", CTLFLAG_RW, &rack_send_a_lot_in_prr, 1, "Send a lot in prr"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "minto", CTLFLAG_RW, &rack_min_to, 1, "Minimum rack timeout in milliseconds"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW, &rack_early_recovery_max_seg, 6, "Max segments in early recovery"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "earlyrecovery", CTLFLAG_RW, &rack_early_recovery, 1, "Do we do early recovery with rack"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "reorder_thresh", CTLFLAG_RW, &rack_reorder_thresh, 2, "What factor for rack will be added when seeing reordering (shift right)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, &rack_tlp_thresh, 1, "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "reorder_fade", CTLFLAG_RW, &rack_reorder_fade, 0, "Does reorder detection fade, if so how many ms (0 means never)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "pktdelay", CTLFLAG_RW, &rack_pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "inc_var", CTLFLAG_RW, &rack_inc_var, 0, "Should rack add to the TLP timer the variance in rtt calculation"); rack_badfr = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "badfr", CTLFLAG_RD, &rack_badfr, "Total number of bad FRs"); rack_badfr_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "badfr_bytes", CTLFLAG_RD, &rack_badfr_bytes, "Total number of bad FRs"); rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prrsndret", CTLFLAG_RD, &rack_rtm_prr_retran, "Total number of prr based retransmits"); rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 
SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prrsndnew", CTLFLAG_RD, &rack_rtm_prr_newdata, "Total number of prr based new transmits"); rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tsnf", CTLFLAG_RD, &rack_timestamp_mismatch, "Total number of timestamps that we could not find the reported ts"); rack_find_high = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "findhigh", CTLFLAG_RD, &rack_find_high, "Total number of FIN causing find-high"); rack_reorder_seen = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "reordering", CTLFLAG_RD, &rack_reorder_seen, "Total number of times we added delay due to reordering"); rack_tlp_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_to_total", CTLFLAG_RD, &rack_tlp_tot, "Total number of tail loss probe expirations"); rack_tlp_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_new", CTLFLAG_RD, &rack_tlp_newdata, "Total number of tail loss probe sending new data"); rack_tlp_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retran", CTLFLAG_RD, &rack_tlp_retran, "Total number of tail loss probe sending retransmitted data"); rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, &rack_tlp_retran_bytes, "Total bytes of tail loss probe sending retransmitted data"); rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, &rack_tlp_retran_fail, "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); rack_to_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_to_tot", CTLFLAG_RD, &rack_to_tot, "Total number of times the rack to expired?"); rack_to_arm_rack = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "arm_rack", CTLFLAG_RD, &rack_to_arm_rack, "Total number of times the rack timer armed?"); rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "arm_tlp", CTLFLAG_RD, &rack_to_arm_tlp, "Total number of times the tlp timer armed?"); rack_paced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "paced", CTLFLAG_RD, &rack_paced_segments, "Total number of times a segment send caused hptsi"); rack_unpaced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "unpaced", CTLFLAG_RD, &rack_unpaced_segments, "Total number of times a segment did not cause hptsi"); rack_saw_enobuf = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "saw_enobufs", CTLFLAG_RD, &rack_saw_enobuf, "Total number of times a segment did not cause hptsi"); rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 
SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "saw_enetunreach", CTLFLAG_RD, &rack_saw_enetunreach, "Total number of times a segment did not cause hptsi"); rack_to_alloc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allocs", CTLFLAG_RD, &rack_to_alloc, "Total allocations of tracking structures"); rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allochard", CTLFLAG_RD, &rack_to_alloc_hard, "Total allocations done with sleeping the hard way"); rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "allocemerg", CTLFLAG_RD, &rack_to_alloc_emerg, "Total allocations done from emergency cache"); rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "alloc_limited", CTLFLAG_RD, &rack_to_alloc_limited, "Total allocations dropped due to limit"); rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, &rack_alloc_limited_conns, "Connections with allocations dropped due to limit"); rack_split_limited = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "split_limited", CTLFLAG_RD, &rack_split_limited, "Split allocations dropped due to limit"); rack_sack_proc_all = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sack_long", CTLFLAG_RD, &rack_sack_proc_all, "Total times we had to walk whole list for sack processing"); rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sack_restart", CTLFLAG_RD, &rack_sack_proc_restart, "Total times we had to walk whole list due to a restart"); rack_sack_proc_short = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sack_short", CTLFLAG_RD, &rack_sack_proc_short, "Total times we took shortcut for sack processing"); rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, &rack_enter_tlp_calc, "Total times we called calc-tlp"); rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hit_tlp_method", CTLFLAG_RD, &rack_used_tlpmethod, "Total number of runt sacks"); rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, &rack_used_tlpmethod2, "Total number of runt sacks 2"); rack_runt_sacks = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "runtsacks", CTLFLAG_RD, &rack_runt_sacks, "Total number of runt sacks"); rack_progress_drops = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "prog_drops", CTLFLAG_RD, &rack_progress_drops, "Total number of progress drops"); rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, 
"idle_reduce_oninput", CTLFLAG_RD, &rack_input_idle_reduces, "Total number of idle reductions on input"); rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp_nada", CTLFLAG_RD, &rack_tlp_does_nada, "Total number of nada tlp calls"); COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "outsize", CTLFLAG_RD, rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "opts", CTLFLAG_RD, rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); SYSCTL_ADD_PROC(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); } static inline int32_t rack_progress_timeout_check(struct tcpcb *tp) { #ifdef NETFLIX_PROGRESS if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { /* * There is an assumption that the caller * will drop the connection so we will * increment the counters here. */ struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; counter_u64_add(rack_progress_drops, 1); TCPSTAT_INC(tcps_progdrops); rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); return (1); } } #endif return (0); } static void rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); log.u_bbr.flex2 = to; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = slot; log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex8 = which; log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERSTAR, 0, 0, &log, false); } } static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex8 = to_num; log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; log.u_bbr.flex2 = rack->rc_rack_rtt; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_RTO, 0, 0, &log, false); } } static void rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t, uint32_t o_srtt, uint32_t o_var) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = t; log.u_bbr.flex2 = o_srtt; log.u_bbr.flex3 = o_var; log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot; log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; TCP_LOG_EVENT(tp, NULL, 
&rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRRTT, 0, 0, &log, false); } } static void rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) { /* * Log the rtt sample we are * applying to the srtt algorithm in * useconds. */ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); /* Convert our ms to a microsecond */ log.u_bbr.flex1 = rtt * 1000; log.u_bbr.timeStamp = tcp_get_usecs(&tv); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, TCP_LOG_RTT, 0, 0, &log, false, &tv); } } static inline void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) { if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = line; log.u_bbr.flex2 = tick; log.u_bbr.flex3 = tp->t_maxunacktime; log.u_bbr.flex4 = tp->t_acktime; log.u_bbr.flex8 = event; TCP_LOG_EVENT(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_PROGRESS, 0, 0, &log, false); } } static void rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = rack->rc_in_persist; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRSND, 0, 0, &log, false); } } static void rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = did_out; log.u_bbr.flex2 = nxt_pkt; log.u_bbr.flex3 = way_out; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex7 = rack->r_wanted_output; log.u_bbr.flex8 = rack->rc_in_persist; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_DOSEG_DONE, 0, 0, &log, false); } } static void rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex7 = hpts_calling; log.u_bbr.flex8 = rack->rc_in_persist; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_JUSTRET, 0, tlen, &log, false); } } static void rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = line; log.u_bbr.flex2 = 0; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = 0; log.u_bbr.flex6 = 
rack->rc_tp->t_rxtcur; log.u_bbr.flex8 = hpts_removed; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERCANC, 0, 0, &log, false); } } static void rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = timers; log.u_bbr.flex2 = ret; log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = cts; TCP_LOG_EVENT(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TO_PROCESS, 0, 0, &log, false); } } static void rack_counter_destroy() { counter_u64_free(rack_badfr); counter_u64_free(rack_badfr_bytes); counter_u64_free(rack_rtm_prr_retran); counter_u64_free(rack_rtm_prr_newdata); counter_u64_free(rack_timestamp_mismatch); counter_u64_free(rack_reorder_seen); counter_u64_free(rack_tlp_tot); counter_u64_free(rack_tlp_newdata); counter_u64_free(rack_tlp_retran); counter_u64_free(rack_tlp_retran_bytes); counter_u64_free(rack_tlp_retran_fail); counter_u64_free(rack_to_tot); counter_u64_free(rack_to_arm_rack); counter_u64_free(rack_to_arm_tlp); counter_u64_free(rack_paced_segments); counter_u64_free(rack_unpaced_segments); counter_u64_free(rack_saw_enobuf); counter_u64_free(rack_saw_enetunreach); counter_u64_free(rack_to_alloc_hard); counter_u64_free(rack_to_alloc_emerg); counter_u64_free(rack_sack_proc_all); counter_u64_free(rack_sack_proc_short); counter_u64_free(rack_sack_proc_restart); counter_u64_free(rack_to_alloc); counter_u64_free(rack_to_alloc_limited); counter_u64_free(rack_split_limited); counter_u64_free(rack_find_high); counter_u64_free(rack_runt_sacks); counter_u64_free(rack_enter_tlp_calc); counter_u64_free(rack_used_tlpmethod); counter_u64_free(rack_used_tlpmethod2); counter_u64_free(rack_progress_drops); counter_u64_free(rack_input_idle_reduces); counter_u64_free(rack_tlp_does_nada); COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); } static struct rack_sendmap * rack_alloc(struct tcp_rack *rack) { struct rack_sendmap *rsm; rsm = uma_zalloc(rack_zone, M_NOWAIT); if (rsm) { rack->r_ctl.rc_num_maps_alloced++; counter_u64_add(rack_to_alloc, 1); return (rsm); } if (rack->rc_free_cnt) { counter_u64_add(rack_to_alloc_emerg, 1); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt--; return (rsm); } return (NULL); } static struct rack_sendmap * rack_alloc_full_limit(struct tcp_rack *rack) { if ((rack_map_entries_limit > 0) && (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { counter_u64_add(rack_to_alloc_limited, 1); if (!rack->alloc_limit_reported) { rack->alloc_limit_reported = 1; counter_u64_add(rack_alloc_limited_conns, 1); } return (NULL); } return (rack_alloc(rack)); } /* wrapper to allocate a sendmap entry, subject to a specific limit */ static struct rack_sendmap * rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) { struct rack_sendmap *rsm; if (limit_type) { /* currently there is only one limit type */ if (rack_map_split_limit > 0 && rack->r_ctl.rc_num_split_allocs >= rack_map_split_limit) { counter_u64_add(rack_split_limited, 1); if (!rack->alloc_limit_reported) { rack->alloc_limit_reported = 1; counter_u64_add(rack_alloc_limited_conns, 1); } return (NULL); } } /* allocate and mark in the limit type, if set */ rsm = 
rack_alloc(rack); if (rsm != NULL && limit_type) { rsm->r_limit_type = limit_type; rack->r_ctl.rc_num_split_allocs++; } return (rsm); } static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) { if (rsm->r_limit_type) { /* currently there is only one limit type */ rack->r_ctl.rc_num_split_allocs--; } if (rack->r_ctl.rc_tlpsend == rsm) rack->r_ctl.rc_tlpsend = NULL; if (rack->r_ctl.rc_next == rsm) rack->r_ctl.rc_next = NULL; if (rack->r_ctl.rc_sacklast == rsm) rack->r_ctl.rc_sacklast = NULL; if (rack->rc_free_cnt < rack_free_cache) { memset(rsm, 0, sizeof(struct rack_sendmap)); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); rsm->r_limit_type = 0; rack->rc_free_cnt++; return; } rack->r_ctl.rc_num_maps_alloced--; uma_zfree(rack_zone, rsm); } /* * CC wrapper hook functions */ static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery) { #ifdef NETFLIX_STATS int32_t gput; #endif INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { uint32_t max; max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg; if (tp->ccv->bytes_this_ack > max) { tp->ccv->bytes_this_ack = max; } } if (tp->snd_cwnd <= tp->snd_wnd) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { #ifdef NETFLIX_STATS stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, ((int32_t) tp->snd_cwnd) - tp->snd_wnd); if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GEQ(th->th_ack, tp->gput_ack)) { gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / max(1, tcp_ts_getticks() - tp->gput_ts); stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, gput); /* * XXXLAS: This is a temporary hack, and should be * chained off VOI_TCP_GPUT when stats(9) grows an * API to deal with chained VOIs. */ if (tp->t_stats_gput_prev > 0) stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_GPUT_ND, ((gput - tp->t_stats_gput_prev) * 100) / tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; if (tp->t_maxpeakrate) { /* * We update t_peakrate_thr. This gives us roughly * one update per round trip time. */ tcp_update_peakrate_thr(tp); } } #endif if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, nsegs * V_tcp_abc_l_var * tp->t_maxseg); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; } } else { tp->ccv->flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } #ifdef NETFLIX_STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); #endif if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; } /* we enforce max peak rate if it is set. 
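* (t_peakrate_thr is refreshed above via tcp_update_peakrate_thr() when t_maxpeakrate is set.)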
*/ if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { tp->snd_cwnd = tp->t_peakrate_thr; } } static void tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); if (rack->r_ctl.rc_prr_sndcnt > 0) rack->r_wanted_output++; } static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) { struct tcp_rack *rack; INP_WLOCK_ASSERT(tp->t_inpcb); rack = (struct tcp_rack *)tp->t_fb_ptr; if (CC_ALGO(tp)->post_recovery != NULL) { tp->ccv->curack = th->th_ack; CC_ALGO(tp)->post_recovery(tp->ccv); } /* * Here we can in theory adjust cwnd to be based on the number of * losses in the window (rack->r_ctl.rc_loss_count). This is done * based on the rack_use_proportional flag. */ if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) { int32_t reduce; reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate); if (reduce > 50) { reduce = 50; } tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100); } else { if (tp->snd_cwnd > tp->snd_ssthresh) { /* Drop us down to the ssthresh (1/2 cwnd at loss) */ tp->snd_cwnd = tp->snd_ssthresh; } } if (rack->r_ctl.rc_prr_sndcnt > 0) { /* Suck the next prr cnt back into cwnd */ tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; rack->r_ctl.rc_prr_sndcnt = 0; } tp->snd_recover = tp->snd_una; EXIT_RECOVERY(tp->t_flags); } static void rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { struct tcp_rack *rack; INP_WLOCK_ASSERT(tp->t_inpcb); rack = (struct tcp_rack *)tp->t_fb_ptr; switch (type) { case CC_NDUPACK: /* rack->r_ctl.rc_ssthresh_set = 1;*/ if (!IN_FASTRECOVERY(tp->t_flags)) { rack->r_ctl.rc_tlp_rtx_out = 0; rack->r_ctl.rc_prr_delivered = 0; rack->r_ctl.rc_prr_out = 0; rack->r_ctl.rc_loss_count = 0; rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } break; case CC_ECN: if (!IN_CONGRECOVERY(tp->t_flags)) { TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } break; case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg) * tp->t_maxseg; tp->snd_cwnd = tp->t_maxseg; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. 
*/ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp->t_flags); if (tp->t_flags & TF_WASCRECOVERY) ENTER_CONGRECOVERY(tp->t_flags); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; break; } if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } } static inline void rack_cc_after_idle(struct tcpcb *tp, int reduce_largest) { uint32_t i_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef NETFLIX_STATS TCPSTAT_INC(tcps_idle_restarts); if (tp->t_state == TCPS_ESTABLISHED) TCPSTAT_INC(tcps_idle_estrestarts); #endif if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); if (V_tcp_initcwnd_segments) i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), max(2 * tp->t_maxseg, 14600)); else if (V_tcp_do_rfc3390) i_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ if (tp->t_maxseg > 2190) i_cwnd = 2 * tp->t_maxseg; else if (tp->t_maxseg > 1095) i_cwnd = 3 * tp->t_maxseg; else i_cwnd = 4 * tp->t_maxseg; } if (reduce_largest) { /* * Do we reduce the largest cwnd to make * rack play nice on restart hptsi wise? */ if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd) ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd; } /* * Being idle is no differnt than the initial window. If the cc * clamps it down below the initial window raise it to the initial * window. */ if (tp->snd_cwnd < i_cwnd) { tp->snd_cwnd = i_cwnd; } } /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. * - Delayed acks are enabled or this is a half-synchronized T/TCP * connection. */ #define DELAY_ACK(tp, tlen) \ (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ ((tp->t_flags & TF_DELACK) == 0) && \ (tlen <= tp->t_maxseg) && \ (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) static inline void rack_calc_rwin(struct socket *so, struct tcpcb *tp) { int32_t win; /* * Calculate amount of space in receive window, and then do TCP * input processing. Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } static void rack_do_drop(struct mbuf *m, struct tcpcb *tp) { /* * Drop space held by incoming segment and return. */ if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); if (m) m_freem(m); } static void rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) { if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); } /* * The value in ret_val informs the caller * if we dropped the tcb (and lock) or not. * 1 = we dropped it, 0 = the TCB is still locked * and valid. */ static void rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) { /* * Generate an ACK dropping incoming segment if it occupies sequence * space, where the ACK reflects our state. 
* * We can now skip the test for the RST flag since all paths to this * code happen after packets containing RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the segment * we received passes the SYN-RECEIVED ACK test. If it fails send a * RST. This breaks the loop in the "LAND" DoS attack, and also * prevents an ACK storm between two listening ports that have been * sent forged SYN segments, each with the source address of the * other. */ struct tcp_rack *rack; if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max))) { *ret_val = 1; rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return; } else *ret_val = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; rack->r_wanted_output++; tp->t_flags |= TF_ACKNOW; if (m) m_freem(m); } static int rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in * window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should test against * last_ack_sent instead of rcv_nxt. Note 2: we handle special case * of closed window, not covered by the RFC. */ int dropped = 0; if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || (tp->last_ack_sent == th->th_seq) || (tp->rcv_nxt == th->th_seq) || ((tp->last_ack_sent - 1) == th->th_seq)) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ default: tp = tcp_close(tp); } dropped = 1; rack_do_drop(m, tp); } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; } } else { m_freem(m); } return (dropped); } /* * The value in ret_val informs the caller * if we dropped the tcb (and lock) or not. * 1 = we dropped it, 0 = the TCB is still locked * and valid. */ static void rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); *ret_val = 1; rack_do_drop(m, tp); } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; *ret_val = 0; rack_do_drop(m, NULL); } } /* * rack_ts_check returns 1 for you should not proceed. It places * in ret_val what should be returned 1/0 by the caller. The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the * TCB is still valid and locked. */ static int rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val) { /* Check to see if ts_recent is over 24 days old. 
*/ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates ts_recent, * the age will be reset later and ts_recent will get a * valid value. If it does not, setting ts_recent to zero * will at least satisfy the requirement that zero be placed * in the timestamp echo reply when ts_recent isn't valid. * The age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be dropped * when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); *ret_val = 0; if (tlen) { rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); } else { rack_do_drop(m, NULL); } return (1); } return (0); } /* * rack_drop_checks returns 1 for you should not proceed. It places * in ret_val what should be returned 1/0 by the caller. The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the * TCB is still valid and locked. */ static int rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) { int32_t todrop; int32_t thflags; int32_t tlen; thflags = *thf; tlen = *tlenp; todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } if (tp->t_flags & TF_SACK_PERMIT) { /* * record the left, to-be-dropped edge of data * here, for use as dsack block further down */ tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop); /* * ACK now, as the next in-sequence segment * will clear the DSACK block again */ tp->t_flags |= TF_ACKNOW; } *drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If segment ends after window, drop trailing data (and PUSH and * FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment and * ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else { rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); return (1); } } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH | TH_FIN); } *thf = thflags; *tlenp = tlen; return (0); } static struct rack_sendmap * rack_find_lowest_rsm(struct tcp_rack *rack) { struct rack_sendmap *rsm; /* * Walk the time-order transmitted list looking for an rsm that is * not acked. 
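 * rc_tmap is kept in (re)transmission-time order, so the scan can stop at the first entry
 * that is not marked RACK_ACKED.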
This will be the one that was sent the longest time * ago that is still outstanding. */ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { if (rsm->r_flags & RACK_ACKED) { continue; } goto finish; } finish: return (rsm); } static struct rack_sendmap * rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) { struct rack_sendmap *prsm; /* * Walk the sequence order list backward until we hit and arrive at * the highest seq not acked. In theory when this is called it * should be the last segment (which it was not). */ counter_u64_add(rack_find_high, 1); prsm = rsm; TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) { if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { continue; } return (prsm); } return (NULL); } static uint32_t rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) { int32_t lro; uint32_t thresh; /* * lro is the flag we use to determine if we have seen reordering. * If it gets set we have seen reordering. The reorder logic either * works in one of two ways: * * If reorder-fade is configured, then we track the last time we saw * re-ordering occur. If we reach the point where enough time as * passed we no longer consider reordering has occuring. * * Or if reorder-face is 0, then once we see reordering we consider * the connection to alway be subject to reordering and just set lro * to 1. * * In the end if lro is non-zero we add the extra time for * reordering in. */ if (srtt == 0) srtt = 1; if (rack->r_ctl.rc_reorder_ts) { if (rack->r_ctl.rc_reorder_fade) { if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { lro = cts - rack->r_ctl.rc_reorder_ts; if (lro == 0) { /* * No time as passed since the last * reorder, mark it as reordering. */ lro = 1; } } else { /* Negative time? */ lro = 0; } if (lro > rack->r_ctl.rc_reorder_fade) { /* Turn off reordering seen too */ rack->r_ctl.rc_reorder_ts = 0; lro = 0; } } else { /* Reodering does not fade */ lro = 1; } } else { lro = 0; } thresh = srtt + rack->r_ctl.rc_pkt_delay; if (lro) { /* It must be set, if not you get 1/4 rtt */ if (rack->r_ctl.rc_reorder_shift) thresh += (srtt >> rack->r_ctl.rc_reorder_shift); else thresh += (srtt >> 2); } else { thresh += 1; } /* We don't let the rack timeout be above a RTO */ if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); } /* And we don't want it above the RTO max either */ if (thresh > rack_rto_max) { thresh = rack_rto_max; } return (thresh); } static uint32_t rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t srtt) { struct rack_sendmap *prsm; uint32_t thresh, len; int maxseg; if (srtt == 0) srtt = 1; if (rack->r_ctl.rc_tlp_threshold) thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); else thresh = (srtt * 2); /* Get the previous sent packet, if any */ maxseg = tcp_maxseg(tp); counter_u64_add(rack_enter_tlp_calc, 1); len = rsm->r_end - rsm->r_start; if (rack->rack_tlp_threshold_use == TLP_USE_ID) { /* Exactly like the ID */ if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) { uint32_t alt_thresh; /* * Compensate for delayed-ack with the d-ack time. 
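 * Illustrative only (numbers assumed, not defaults): with srtt = 40 ms and
 * rack_delayed_ack_time = 200 ms, alt_thresh = 40 + 20 + 200 = 260 ms, replacing a base
 * threshold of 2 * srtt = 80 ms (when rc_tlp_threshold is unset) so a single outstanding
 * segment is not probed before the peer's delayed ACK could plausibly arrive.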
*/ counter_u64_add(rack_used_tlpmethod, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { /* 2.1 behavior */ prsm = TAILQ_PREV(rsm, rack_head, r_tnext); if (prsm && (len <= maxseg)) { /* * Two packets outstanding, thresh should be (2*srtt) + * possible inter-packet delay (if any). */ uint32_t inter_gap = 0; int idx, nidx; counter_u64_add(rack_used_tlpmethod, 1); idx = rsm->r_rtr_cnt - 1; nidx = prsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { /* Yes it was sent later (or at the same time) */ inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; } thresh += inter_gap; } else if (len <= maxseg) { /* * Possibly compensate for delayed-ack. */ uint32_t alt_thresh; counter_u64_add(rack_used_tlpmethod2, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { /* 2.2 behavior */ if (len <= maxseg) { uint32_t alt_thresh; /* * Compensate for delayed-ack with the d-ack time. */ counter_u64_add(rack_used_tlpmethod, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } /* Not above an RTO */ if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { thresh = TICKS_2_MSEC(tp->t_rxtcur); } /* Not above a RTO max */ if (thresh > rack_rto_max) { thresh = rack_rto_max; } /* Apply user supplied min TLP */ if (thresh < rack_tlp_min) { thresh = rack_tlp_min; } return (thresh); } static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) { /* * Check to see that we don't need to fall into recovery. We will * need to do so if our oldest transmit is past the time we should * have had an ack. */ struct tcp_rack *rack; struct rack_sendmap *rsm; int32_t idx; uint32_t srtt_cur, srtt, thresh; rack = (struct tcp_rack *)tp->t_fb_ptr; if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { return (NULL); } srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; srtt = TICKS_2_MSEC(srtt_cur); if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) srtt = rack->rc_rack_rtt; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm == NULL) return (NULL); if (rsm->r_flags & RACK_ACKED) { rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) return (NULL); } idx = rsm->r_rtr_cnt - 1; thresh = rack_calc_thresh_rack(rack, srtt, tsused); if (tsused < rsm->r_tim_lastsent[idx]) { return (NULL); } if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { return (NULL); } /* Ok if we reach here we are over-due */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); return (rsm); } static uint32_t rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) { int32_t t; int32_t tt; uint32_t ret_val; t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; ret_val = (uint32_t)tt; return (ret_val); } static uint32_t rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * Start the FR timer, we do this based on getting the first one in * the rc_tmap. Note that if its NULL we must stop the timer. 
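 * Whatever path is taken, the value handed back is the number of milliseconds until the
 * selected timer (persist, RXT, RACK or TLP) should fire; 0 means no timer needs to run.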
in all * events we need to stop the running timer (if its running) before * starting the new one. */ uint32_t thresh, exp, to, srtt, time_since_sent; uint32_t srtt_cur; int32_t idx; int32_t is_tlp_timer = 0; struct rack_sendmap *rsm; if (rack->t_timers_stopped) { /* All timers have been stopped none are to run */ return (0); } if (rack->rc_in_persist) { /* We can't start any timer in persists */ return (rack_get_persists_timer_val(tp, rack)); } rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm == NULL) { /* Nothing on the send map */ activate_rxt: if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; to = TICKS_2_MSEC(tp->t_rxtcur); if (to == 0) to = 1; return (to); } return (0); } if (rsm->r_flags & RACK_ACKED) { rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) { /* No lowest? */ goto activate_rxt; } } /* Convert from ms to usecs */ if (rsm->r_flags & RACK_SACK_PASSED) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & RACK_HAS_FIN)) { /* * We don't start a rack timer if all we have is a * FIN outstanding. */ goto activate_rxt; } if (tp->t_srtt) { srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); srtt = TICKS_2_MSEC(srtt_cur); } else srtt = RACK_INITIAL_RTO; thresh = rack_calc_thresh_rack(rack, srtt, cts); idx = rsm->r_rtr_cnt - 1; exp = rsm->r_tim_lastsent[idx] + thresh; if (SEQ_GEQ(exp, cts)) { to = exp - cts; if (to < rack->r_ctl.rc_min_to) { to = rack->r_ctl.rc_min_to; } } else { to = rack->r_ctl.rc_min_to; } } else { /* Ok we need to do a TLP not RACK */ if ((rack->rc_tlp_in_progress != 0) || (rack->r_ctl.rc_tlp_rtx_out != 0)) { /* * The previous send was a TLP or a tlp_rtx is in * process. */ goto activate_rxt; } if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) { /* * Peer collapsed rwnd, don't do TLP. */ goto activate_rxt; } rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (rsm == NULL) { /* We found no rsm to TLP with. */ goto activate_rxt; } if (rsm->r_flags & RACK_HAS_FIN) { /* If its a FIN we dont do TLP */ rsm = NULL; goto activate_rxt; } idx = rsm->r_rtr_cnt - 1; if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) time_since_sent = cts - rsm->r_tim_lastsent[idx]; else time_since_sent = 0; is_tlp_timer = 1; if (tp->t_srtt) { srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); srtt = TICKS_2_MSEC(srtt_cur); } else srtt = RACK_INITIAL_RTO; thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); if (thresh > time_since_sent) to = thresh - time_since_sent; else to = rack->r_ctl.rc_min_to; if (to > TCPTV_REXMTMAX) { /* * If the TLP time works out to larger than the max * RTO lets not do TLP.. just RTO. */ goto activate_rxt; } if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { /* * The tail is no longer the last one I did a probe * on */ rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_last_tlp_seq = rsm->r_start; } } if (is_tlp_timer == 0) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; } else { if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { /* * We have exceeded how many times we can retran the * current TLP timer, switch to the RTO timer. 
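 * rack_tlp_max_resend bounds how many consecutive probes may target the same data; past
 * that we fall back to the conventional RTO so a repeatedly lost probe cannot stall the
 * connection.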
*/ goto activate_rxt; } else { rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; } } if (to == 0) to = 1; return (to); } static void rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (rack->rc_in_persist == 0) { if (((tp->t_flags & TF_SENTFIN) == 0) && (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) /* Must need to send more data to enter persist */ return; rack->r_ctl.rc_went_idle_time = cts; rack_timer_cancel(tp, rack, cts, __LINE__); tp->t_rxtshift = 0; rack->rc_in_persist = 1; } } static void rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) { if (rack->rc_inp->inp_in_hpts) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack->r_ctl.rc_hpts_flags = 0; } rack->rc_in_persist = 0; rack->r_ctl.rc_went_idle_time = 0; tp->t_flags &= ~TF_FORCEDATA; tp->t_rxtshift = 0; } static void rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) { struct inpcb *inp; uint32_t delayed_ack = 0; uint32_t hpts_timeout; uint8_t stopped; uint32_t left = 0; inp = tp->t_inpcb; if (inp->inp_in_hpts) { /* A previous call is already set up */ return; } if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { return; } stopped = rack->rc_tmr_stopped; if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { left = rack->r_ctl.rc_timer_exp - cts; } rack->r_ctl.rc_timer_exp = 0; if (rack->rc_inp->inp_in_hpts == 0) { rack->r_ctl.rc_hpts_flags = 0; } if (slot) { /* We are hptsi too */ rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { /* * We are still left on the hpts when the to goes * it will be for output. */ if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) slot = rack->r_ctl.rc_last_output_to - cts; else slot = 1; } if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { /* No send window.. we must enter persist */ rack_enter_persist(tp, rack, cts); } else if ((frm_out_sbavail && (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && (tp->snd_wnd < tp->t_maxseg)) && TCPS_HAVEESTABLISHED(tp->t_state)) { /* * If we have no window or we can't send a segment (and have * data to send.. we cheat here and frm_out_sbavail is * passed in with the sbavail(sb) only from bbr_output) and * we are established, then we must enter persits (if not * already in persits). */ rack_enter_persist(tp, rack, cts); } hpts_timeout = rack_timer_start(tp, rack, cts); if (tp->t_flags & TF_DELACK) { delayed_ack = TICKS_2_MSEC(tcp_delacktime); rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; } if (delayed_ack && ((hpts_timeout == 0) || (delayed_ack < hpts_timeout))) hpts_timeout = delayed_ack; else rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; /* * If no timers are going to run and we will fall off the hptsi * wheel, we resort to a keep-alive timer if its configured. */ if ((hpts_timeout == 0) && (slot == 0)) { if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* * Ok we have no timer (persists, rack, tlp, rxt or * del-ack), we don't have segments being paced. So * all that is left is the keepalive timer. 
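 * This keeps at least one event scheduled on the hpts wheel for the connection; with no
 * pacing slot and no other timer armed, nothing would otherwise call back into the stack
 * for this tcpcb.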
*/ if (TCPS_HAVEESTABLISHED(tp->t_state)) { /* Get the established keep-alive time */ hpts_timeout = TP_KEEPIDLE(tp); } else { /* Get the initial setup keep-alive time */ hpts_timeout = TP_KEEPINIT(tp); } rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; } } if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { /* * RACK, TLP, persists and RXT timers all are restartable * based on actions input .. i.e we received a packet (ack * or sack) and that changes things (rw, or snd_una etc). * Thus we can restart them with a new value. For * keep-alive, delayed_ack we keep track of what was left * and restart the timer with a smaller value. */ if (left < hpts_timeout) hpts_timeout = left; } if (hpts_timeout) { /* * Hack alert for now we can't time-out over 2,147,483 * seconds (a bit more than 596 hours), which is probably ok * :). */ if (hpts_timeout > 0x7ffffffe) hpts_timeout = 0x7ffffffe; rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } if (slot) { rack->r_ctl.rc_last_output_to = cts + slot; if ((hpts_timeout == 0) || (hpts_timeout > slot)) { if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); rack_log_to_start(rack, cts, hpts_timeout, slot, 1); } else { /* * Arrange for the hpts to kick back in after the * t-o if the t-o does not cause a send. */ if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } } else if (hpts_timeout) { if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } else { /* No timer starting */ #ifdef INVARIANTS if (SEQ_GT(tp->snd_max, tp->snd_una)) { panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", tp, rack, tot_len_this_send, cts, slot, hpts_timeout); } #endif } rack->rc_tmr_stopped = 0; if (slot) rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); } /* * RACK Timer, here we simply do logging and house keeping. * the normal rack_output() function will call the * appropriate thing to check if we need to do a RACK retransmit. * We return 1, saying don't proceed with rack_output only * when all timers have been stopped (destroyed PCB?). */ static int rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * This timer simply provides an internal trigger to send out data. * The check_recovery_mode call will see if there are needed * retransmissions, if so we will enter fast-recovery. The output * call may or may not do the same thing depending on sysctl * settings. */ struct rack_sendmap *rsm; int32_t recovery; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { /* Its not time yet */ return (0); } rack_log_to_event(rack, RACK_TO_FRM_RACK); recovery = IN_RECOVERY(tp->t_flags); counter_u64_add(rack_to_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); rsm = rack_check_recovery_mode(tp, cts); if (rsm) { uint32_t rtt; rtt = rack->rc_rack_rtt; if (rtt == 0) rtt = 1; if ((recovery == 0) && (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { /* * The rack-timeout that enter's us into recovery * will force out one MSS and set us up so that we * can do one more send in 2*rtt (transitioning the * rack timeout into a rack-tlp). 
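 * That is, rc_prr_sndcnt is primed with one MSS of send credit so the output path may
 * transmit immediately, even though the proportional-rate-reduction accounting may not yet
 * have granted anything.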
*/ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { /* * When a rack timer goes, we have to send at * least one segment. They will be paced a min of 1ms * apart via the next rack timer (or further * if the rack timer dictates it). */ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } } else { /* This is a case that should happen rarely if ever */ counter_u64_add(rack_tlp_does_nada, 1); #ifdef TCP_BLACKBOX tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); #endif rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; return (0); } static struct rack_sendmap * rack_merge_rsm(struct tcp_rack *rack, struct rack_sendmap *l_rsm, struct rack_sendmap *r_rsm) { /* * We are merging two ack'd RSM's, * the l_rsm is on the left (lower seq * values) and the r_rsm is on the right * (higher seq value). The simplest way * to merge these is to move the right * one into the left. I don't think there * is any reason we need to try to find * the oldest (or last oldest retransmitted). */ l_rsm->r_end = r_rsm->r_end; if (r_rsm->r_rtr_bytes) l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; if (r_rsm->r_in_tmap) { /* This really should not happen */ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); } /* Now the flags */ if (r_rsm->r_flags & RACK_HAS_FIN) l_rsm->r_flags |= RACK_HAS_FIN; if (r_rsm->r_flags & RACK_TLP) l_rsm->r_flags |= RACK_TLP; TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next); if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { /* Transfer the split limit to the map we free */ r_rsm->r_limit_type = l_rsm->r_limit_type; l_rsm->r_limit_type = 0; } rack_free(rack, r_rsm); return(l_rsm); } /* * TLP Timer, here we simply setup what segment we want to * have the TLP expire on, the normal rack_output() will then * send it out. * * We return 1, saying don't proceed with rack_output only * when all timers have been stopped (destroyed PCB?). */ static int rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * Tail Loss Probe. */ struct rack_sendmap *rsm = NULL; struct socket *so; uint32_t amm, old_prr_snd = 0; uint32_t out, avail; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { /* Its not time yet */ return (0); } if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); return (1); } /* * A TLP timer has expired. We have been idle for 2 rtts. So we now * need to figure out how to force a full MSS segment out. */ rack_log_to_event(rack, RACK_TO_FRM_TLP); counter_u64_add(rack_tlp_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); so = tp->t_inpcb->inp_socket; avail = sbavail(&so->so_snd); out = tp->snd_max - tp->snd_una; rack->rc_timer_up = 1; /* * If we are in recovery we can jazz out a segment if new data is * present simply by setting rc_prr_sndcnt to a segment. 
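 * Preference is given to probing with previously unsent data; only when less than a full
 * MSS of new data is available and TF_NODELAY is off, or the window cannot cover it, do we
 * fall back to retransmitting the last unacked segment below.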
*/ if ((avail > out) && ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { /* New data is available */ amm = avail - out; if (amm > tp->t_maxseg) { amm = tp->t_maxseg; } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { /* not enough to fill a MTU and no-delay is off */ goto need_retran; } if (IN_RECOVERY(tp->t_flags)) { /* Unlikely */ old_prr_snd = rack->r_ctl.rc_prr_sndcnt; if (out + amm <= tp->snd_wnd) rack->r_ctl.rc_prr_sndcnt = amm; else goto need_retran; } else { /* Set the send-new override */ if (out + amm <= tp->snd_wnd) rack->r_ctl.rc_tlp_new_data = amm; else goto need_retran; } rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_last_tlp_seq = tp->snd_max; rack->r_ctl.rc_tlpsend = NULL; counter_u64_add(rack_tlp_newdata, 1); goto send; } need_retran: /* * Ok we need to arrange the last un-acked segment to be re-sent, or * optionally the first un-acked segment. */ if (rack_always_send_oldest) rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); else { rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { rsm = rack_find_high_nonack(rack, rsm); } } if (rsm == NULL) { counter_u64_add(rack_tlp_does_nada, 1); #ifdef TCP_BLACKBOX tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); #endif goto out; } if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { /* * We need to split this the last segment in two. */ int32_t idx; struct rack_sendmap *nrsm; nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { /* * No memory to split, we will just exit and punt * off to the RXT timer. */ counter_u64_add(rack_tlp_does_nada, 1); goto out; } nrsm->r_start = (rsm->r_end - tp->t_maxseg); nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; nrsm->r_rtr_bytes = 0; rsm->r_end = nrsm->r_start; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rsm = nrsm; } rack->r_ctl.rc_tlpsend = rsm; rack->r_ctl.rc_tlp_rtx_out = 1; if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { rack->r_ctl.rc_tlp_seg_send_cnt++; tp->t_rxtshift++; } else { rack->r_ctl.rc_last_tlp_seq = rsm->r_start; rack->r_ctl.rc_tlp_seg_send_cnt = 1; } send: rack->r_ctl.rc_tlp_send_cnt++; if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { /* * Can't [re]/transmit a segment we have not heard from the * peer in max times. We need the retransmit timer to take * over. */ restore: rack->r_ctl.rc_tlpsend = NULL; if (rsm) rsm->r_flags &= ~RACK_TLP; rack->r_ctl.rc_prr_sndcnt = old_prr_snd; counter_u64_add(rack_tlp_retran_fail, 1); goto out; } else if (rsm) { rsm->r_flags |= RACK_TLP; } if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { /* * We don't want to send a single segment more than the max * either. */ goto restore; } rack->r_timer_override = 1; rack->r_tlp_running = 1; rack->rc_tlp_in_progress = 1; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); out: rack->rc_timer_up = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); } /* * Delayed ack Timer, here we simply need to setup the * ACK_NOW flag and remove the DELACK flag. From there * the output routine will send the ack out. 
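 * TCPSTAT_INC(tcps_delack) below keeps the delayed-ACK accounting consistent with the
 * default timer code, so these firings still show up in the usual statistics.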
* * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } rack_log_to_event(rack, RACK_TO_FRM_DELACK); tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; return (0); } /* * Persists timer, here we simply need to setup the * FORCE-DATA flag the output routine will send * the one byte send. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { struct inpcb *inp; int32_t retval = 0; inp = tp->t_inpcb; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (rack->rc_in_persist == 0) return (0); if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(inp, ETIMEDOUT); return (1); } KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); /* * Persistence timer into zero window. Force a byte to be output, if * possible. */ TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not time out if the * window is closed. After a full backoff, drop the connection if * the idle time (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); retval = 1; tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && tp->snd_una == tp->snd_max) rack_exit_persist(tp, rack); rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { retval = 1; TCPSTAT_INC(tcps_persistdrop); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } tp->t_flags |= TF_FORCEDATA; out: rack_log_to_event(rack, RACK_TO_FRM_PERSIST); return (retval); } /* * If a keepalive goes off, we had no other timers * happening. We always return 1 here since this * routine either drops the connection or sends * out a segment with respond. */ static int rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { struct tcptemp *t_template; struct inpcb *inp; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; inp = tp->t_inpcb; rack_log_to_event(rack, RACK_TO_FRM_KEEP); /* * Keep-alive timer went off; send something or drop connection if * idle for too long. */ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response if the peer is * up and reachable: either an ACK if the connection is * still alive, or an RST if the peer has closed the * connection due to timeout or reboot. Using sequence * number tp->snd_una-1 causes the transmitted zero-length * segment to lie outside the receive window; by the * protocol spec, this requires the correspondent TCP to * respond. 
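 * tcpip_maketemplate() below builds the header template used for the probe; if its
 * allocation fails we simply skip this round, and the timer restart at the end schedules
 * the next keepalive.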
*/ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } } rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); return (1); dropit: TCPSTAT_INC(tcps_keepdrops); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); return (1); } /* * Retransmit helper function, clear up all the ack * flags and take care of important book keeping. */ static void rack_remxt_tmr(struct tcpcb *tp) { /* * The retransmit timer went off, all sack'd blocks must be * un-acked. */ struct rack_sendmap *rsm, *trsm = NULL; struct tcp_rack *rack; int32_t cnt = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); rack_log_to_event(rack, RACK_TO_FRM_TMR); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); /* * Ideally we would like to be able to * mark SACK-PASS on anything not acked here. * However, if we do that we would burst out * all that data 1ms apart. This would be unwise, * so for now we will just let the normal rxt timer * and tlp timer take care of it. */ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { if (rsm->r_flags & RACK_ACKED) { cnt++; rsm->r_sndcnt = 0; if (rsm->r_in_tmap == 0) { /* We must re-add it back to the tlist */ if (trsm == NULL) { TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); } else { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); } rsm->r_in_tmap = 1; trsm = rsm; } } rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); } /* Clear the count (we just un-acked them) */ rack->r_ctl.rc_sacked = 0; /* Clear the tlp rtx mark */ rack->r_ctl.rc_tlp_rtx_out = 0; rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); /* Setup so we send one segment */ if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; rack->r_timer_override = 1; } /* * Re-transmit timeout! If we drop the PCB we will return 1, otherwise * we will setup to retransmit the lowest seq number outstanding. */ static int rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { int32_t rexmt; struct inpcb *inp; int32_t retval = 0; inp = tp->t_inpcb; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(inp, ETIMEDOUT); return (1); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_una == tp->snd_max)) { /* Nothing outstanding .. nothing to do */ return (0); } /* * Retransmission timer went off. Message has not been acked within * retransmit interval. Back off to a longer retransmit interval * and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); retval = 1; tcp_set_inp_to_drop(rack->rc_inp, (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; } rack_remxt_tmr(tp); if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be limited * to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can be * recovered if this turns out to be a "bad" retransmit. A * retransmit is considered "bad" if an ACK for this segment * is received within RTT/2 interval; the assumption here is * that the ACK was already in flight. 
See "On Estimating * End-to-End Network Path Properties" by Allman and Paxson * for more details. */ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, max(MSEC_2_TICKS(rack_rto_min), rexmt), MSEC_2_TICKS(rack_rto_max)); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple * of packets and process straight to FIN. In that case we won't * catch ESTABLISHED state. */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { #ifdef INET6 int32_t isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, * 1448 -> 1188 -> 524) should be given 2 chances to recover * before further clamping down. 'tp->t_rxtshift % 2 == 0' * should take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: - * Disable Path MTU Discovery (IP "DF" bit). - * Reduce MTU to lower value than what we negotiated * with peer. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; } /* * Reduce the MSS to blackhole value or to the * default in an attempt to retransmit. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole * and we restore the previous MSS and blackhole * detection flags. The limit '6' is determined by * giving each probe stage (1448, 1188, 524) 2 * chances to recover. 
*/ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift >= 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; TCPSTAT_INC(tcps_pmtud_blackhole_failed); } } } /* * Disable RFC1323 and SACK if we haven't got any response to our * third SYN to work-around some broken terminal servers (most of * which have hopefully been retired) that have bad VJ header * compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current retransmit * times until then. */ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); else #endif in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } if (rack_use_sack_filter) sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); tp->snd_recover = tp->snd_max; tp->t_flags |= TF_ACKNOW; tp->t_rtttime = 0; rack_cong_signal(tp, NULL, CC_RTO); out: return (retval); } static int rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) { int32_t ret = 0; int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); if (timers == 0) { return (0); } if (tp->t_state == TCPS_LISTEN) { /* no timers on listen sockets */ if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) return (0); return (1); } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { uint32_t left; if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { ret = -1; rack_log_to_processing(rack, cts, ret, 0); return (0); } if (hpts_calling == 0) { ret = -2; rack_log_to_processing(rack, cts, ret, 0); return (0); } /* * Ok our timer went off early and we are not paced false * alarm, go back to sleep. */ ret = -3; left = rack->r_ctl.rc_timer_exp - cts; tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); rack_log_to_processing(rack, cts, ret, left); rack->rc_last_pto_set = 0; return (1); } rack->rc_tmr_stopped = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; if (timers & PACE_TMR_DELACK) { ret = rack_timeout_delack(tp, rack, cts); } else if (timers & PACE_TMR_RACK) { ret = rack_timeout_rack(tp, rack, cts); } else if (timers & PACE_TMR_TLP) { ret = rack_timeout_tlp(tp, rack, cts); } else if (timers & PACE_TMR_RXT) { ret = rack_timeout_rxt(tp, rack, cts); } else if (timers & PACE_TMR_PERSIT) { ret = rack_timeout_persist(tp, rack, cts); } else if (timers & PACE_TMR_KEEP) { ret = rack_timeout_keepalive(tp, rack, cts); } rack_log_to_processing(rack, cts, ret, timers); return (ret); } static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) { uint8_t hpts_removed = 0; if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); hpts_removed = 1; } if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; if (rack->rc_inp->inp_in_hpts && ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { /* * Canceling timer's when we have no output being * paced. We also must remove ourselves from the * hpts. 
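 * When output is still being paced we leave the inpcb on the wheel and only clear the
 * timer flags; cancelling a timer must not cost us the pacing slot.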
*/ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); hpts_removed = 1; } rack_log_to_cancel(rack, hpts_removed, line); rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); } } static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) { return; } static int rack_stopall(struct tcpcb *tp) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; rack->t_timers_stopped = 1; return (0); } static void rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) { return; } static int rack_timer_active(struct tcpcb *tp, uint32_t timer_type) { return (0); } static void rack_stop_all_timers(struct tcpcb *tp) { struct tcp_rack *rack; /* * Assure no timers are running. */ if (tcp_timer_active(tp, TT_PERSIST)) { /* We enter in persists, set the flag appropriately */ rack = (struct tcp_rack *)tp->t_fb_ptr; rack->rc_in_persist = 1; } tcp_timer_suspend(tp, TT_PERSIST); tcp_timer_suspend(tp, TT_REXMT); tcp_timer_suspend(tp, TT_KEEP); tcp_timer_suspend(tp, TT_DELACK); } static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts) { int32_t idx; rsm->r_rtr_cnt++; rsm->r_sndcnt++; if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; rsm->r_flags |= RACK_OVERMAX; } if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); } idx = rsm->r_rtr_cnt - 1; rsm->r_tim_lastsent[idx] = ts; if (rsm->r_flags & RACK_ACKED) { /* Problably MTU discovery messing with us */ rsm->r_flags &= ~RACK_ACKED; rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); } if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; if (rsm->r_flags & RACK_SACK_PASSED) { /* We have retransmitted due to the SACK pass */ rsm->r_flags &= ~RACK_SACK_PASSED; rsm->r_flags |= RACK_WAS_SACKPASS; } /* Update memory for next rtr */ rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); } static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) { /* * We (re-)transmitted starting at rsm->r_start for some length * (possibly less than r_end. */ struct rack_sendmap *nrsm; uint32_t c_end; int32_t len; int32_t idx; len = *lenp; c_end = rsm->r_start + len; if (SEQ_GEQ(c_end, rsm->r_end)) { /* * We retransmitted the whole piece or more than the whole * slopping into the next rsm. */ rack_update_rsm(tp, rack, rsm, ts); if (c_end == rsm->r_end) { *lenp = 0; return (0); } else { int32_t act_len; /* Hangs over the end return whats left */ act_len = rsm->r_end - rsm->r_start; *lenp = (len - act_len); return (rsm->r_end); } /* We don't get out of this block. */ } /* * Here we retransmitted less than the whole thing which means we * have to split this into what was transmitted and what was not. */ nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { /* * We can't get memory, so lets not proceed. */ *lenp = 0; return (0); } /* * So here we are going to take the original rsm and make it what we * retransmitted. nrsm will be the tail portion we did not * retransmit. For example say the chunk was 1, 11 (10 bytes). And * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to * 1, 6 and the new piece will be 6, 11. 
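 * In that example c_end = 1 + 5 = 6, so below rsm keeps [1, 6) and nrsm records the
 * untransmitted tail [6, 11); nrsm also inherits the transmit timestamps so later RTT and
 * SACK accounting still line up.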
*/ nrsm->r_start = c_end; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; nrsm->r_rtr_bytes = 0; rsm->r_end = c_end; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rack_update_rsm(tp, rack, rsm, ts); *lenp = 0; return (0); } static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, uint8_t pass, struct rack_sendmap *hintrsm) { struct tcp_rack *rack; struct rack_sendmap *rsm, *nrsm; register uint32_t snd_max, snd_una; int32_t idx; /* * Add to the RACK log of packets in flight or retransmitted. If * there is a TS option we will use the TS echoed, if not we will * grab a TS. * * Retransmissions will increment the count and move the ts to its * proper place. Note that if options do not include TS's then we * won't be able to effectively use the ACK for an RTT on a retran. * * Notes about r_start and r_end. Lets consider a send starting at * sequence 1 for 10 bytes. In such an example the r_start would be * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. * This means that r_end is actually the first sequence for the next * slot (11). * */ /* * If err is set what do we do XXXrrs? should we not add the thing? * -- i.e. return if err != 0 or should we pretend we sent it? -- * i.e. proceed with add ** do this for now. */ INP_WLOCK_ASSERT(tp->t_inpcb); if (err) /* * We don't log errors -- we could but snd_max does not * advance in this case either. */ return; if (th_flags & TH_RST) { /* * We don't log resets and we return immediately from * sending */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; snd_una = tp->snd_una; if (SEQ_LEQ((seq_out + len), snd_una)) { /* Are sending an old segment to induce an ack (keep-alive)? */ return; } if (SEQ_LT(seq_out, snd_una)) { /* huh? should we panic? */ uint32_t end; end = seq_out + len; seq_out = snd_una; len = end - seq_out; } snd_max = tp->snd_max; if (th_flags & (TH_SYN | TH_FIN)) { /* * The call to rack_log_output is made before bumping * snd_max. This means we can record one extra byte on a SYN * or FIN if seq_out is adding more on and a FIN is present * (and we are not resending). */ if (th_flags & TH_SYN) len++; if (th_flags & TH_FIN) len++; if (SEQ_LT(snd_max, tp->snd_nxt)) { /* * The add/update as not been done for the FIN/SYN * yet. */ snd_max = tp->snd_nxt; } } if (len == 0) { /* We don't log zero window probes */ return; } rack->r_ctl.rc_time_last_sent = ts; if (IN_RECOVERY(tp->t_flags)) { rack->r_ctl.rc_prr_out += len; } /* First question is it a retransmission? */ if (seq_out == snd_max) { again: rsm = rack_alloc(rack); if (rsm == NULL) { /* * Hmm out of memory and the tcb got destroyed while * we tried to wait. */ return; } if (th_flags & TH_FIN) { rsm->r_flags = RACK_HAS_FIN; } else { rsm->r_flags = 0; } rsm->r_tim_lastsent[0] = ts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; rsm->r_start = seq_out; rsm->r_end = rsm->r_start + len; rsm->r_sndcnt = 0; TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; return; } /* * If we reach here its a retransmission and we need to find it. 
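 * The lookup below tries, in order: the hint supplied by the caller (hintrsm), the rc_next
 * hint remembered from the previous call, and finally a linear walk of the whole rc_map.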
*/ more: if (hintrsm && (hintrsm->r_start == seq_out)) { rsm = hintrsm; hintrsm = NULL; } else if (rack->r_ctl.rc_next) { /* We have a hint from a previous run */ rsm = rack->r_ctl.rc_next; } else { /* No hints sorry */ rsm = NULL; } if ((rsm) && (rsm->r_start == seq_out)) { /* * We used rc_next or hintrsm to retransmit, hopefully the * likely case. */ seq_out = rack_update_entry(tp, rack, rsm, ts, &len); if (len == 0) { return; } else { goto more; } } /* Ok it was not the last pointer go through it the hard way. */ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { if (rsm->r_start == seq_out) { seq_out = rack_update_entry(tp, rack, rsm, ts, &len); rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); if (len == 0) { return; } else { continue; } } if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { /* Transmitted within this piece */ /* * Ok we must split off the front and then let the * update do the rest */ nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { rack_update_rsm(tp, rack, rsm, ts); return; } /* * copy rsm to nrsm and then trim the front of rsm * to not include this part. */ nrsm->r_start = seq_out; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; nrsm->r_rtr_bytes = 0; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } rsm->r_end = nrsm->r_start; TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); if (len == 0) { return; } } } /* * Hmm not found in map did they retransmit both old and on into the * new? */ if (seq_out == tp->snd_max) { goto again; } else if (SEQ_LT(seq_out, tp->snd_max)) { #ifdef INVARIANTS printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", seq_out, len, tp->snd_una, tp->snd_max); printf("Starting Dump of all rack entries\n"); TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { printf("rsm:%p start:%u end:%u\n", rsm, rsm->r_start, rsm->r_end); } printf("Dump complete\n"); panic("seq_out not found rack:%p tp:%p", rack, tp); #endif } else { #ifdef INVARIANTS /* * Hmm beyond sndmax? (only if we are using the new rtt-pack * flag) */ panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", seq_out, len, tp->snd_max, tp); #endif } } /* * Record one of the RTT updates from an ack into * our sample structure. */ static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) { if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; } if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { rack->r_ctl.rack_rs.rs_rtt_highest = rtt; } rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; rack->r_ctl.rack_rs.rs_rtt_tot += rtt; rack->r_ctl.rack_rs.rs_rtt_cnt++; } /* * Collect new round-trip time estimate * and update averages and current timeout. 
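 * rack_rs holds the lowest, highest, total and count of RTT samples taken from a single
 * ACK; tcp_rack_xmit_timer_commit() below folds one of them, chosen by
 * rc_rate_sample_method, into the classic srtt/rttvar smoothing.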
*/ static void tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) { int32_t delta; uint32_t o_srtt, o_var; int32_t rtt; if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) /* No valid sample */ return; if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { /* We are to use the lowest RTT seen in a single ack */ rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { /* We are to use the highest RTT seen in a single ack */ rtt = rack->r_ctl.rack_rs.rs_rtt_highest; } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { /* We are to use the average RTT seen in a single ack */ rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); } else { #ifdef INVARIANTS panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); #endif return; } if (rtt == 0) rtt = 1; rack_log_rtt_sample(rack, rtt); o_srtt = tp->t_srtt; o_var = tp->t_rttvar; rack = (struct tcp_rack *)tp->t_fb_ptr; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic is * equivalent to the smoothing algorithm in rfc793 with an * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). * Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); tp->t_srtt += delta; if (tp->t_srtt <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit timer * to smoothed rtt + 4 times the smoothed variance. rttvar * is stored as fixed point with 4 bits after the binary * point (scaled by 16). The following is equivalent to * rfc793 smoothing with an alpha of .75 (rttvar = * rttvar*3/4 + |delta| / 4). This replaces rfc793's * wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); tp->t_rttvar += delta; if (tp->t_rttvar <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. Set the * variance to half the rtt (so our first retransmit happens * at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } TCPSTAT_INC(tcps_rttupdated); rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); tp->t_rttupdated++; #ifdef NETFLIX_STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); #endif tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. Because of the * way we do the smoothing, srtt and rttvar will each average +1/2 * tick of bias. When we compute the retransmit timer, we want 1/2 * tick of rounding and 1 extra tick because of +-1/2 tick * uncertainty in the firing of the timer. The bias will give us * exactly the 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below the minimum * feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); tp->t_softerror = 0; } static void rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, uint32_t t, uint32_t cts) { /* * For this RSM, we acknowledged the data from a previous * transmission, not the last one we made. This means we did a false * retransmit. 
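 * This is the spurious-retransmit case the rack_badfr counters below account for; if the
 * retransmit in question is the one that put us into recovery, the cwnd and ssthresh
 * reductions are also unwound.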
*/ struct tcp_rack *rack; if (rsm->r_flags & RACK_HAS_FIN) { /* * The sending of the FIN often is multiple sent when we * have everything outstanding ack'd. We ignore this case * since its over now. */ return; } if (rsm->r_flags & RACK_TLP) { /* * We expect TLP's to have this occur. */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; /* should we undo cc changes and exit recovery? */ if (IN_RECOVERY(tp->t_flags)) { if (rack->r_ctl.rc_rsm_start == rsm->r_start) { /* * Undo what we ratched down and exit recovery if * possible */ EXIT_RECOVERY(tp->t_flags); tp->snd_recover = tp->snd_una; if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; } } if (rsm->r_flags & RACK_WAS_SACKPASS) { /* * We retransmitted based on a sack and the earlier * retransmission ack'd it - re-ordering is occuring. */ counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } counter_u64_add(rack_badfr, 1); counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); } static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) { int32_t i; uint32_t t; if (rsm->r_flags & RACK_ACKED) /* Already done */ return (0); if ((rsm->r_rtr_cnt == 1) || ((ack_type == CUM_ACKED) && (to->to_flags & TOF_TS) && (to->to_tsecr) && (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) ) { /* * We will only find a matching timestamp if its cum-acked. * But if its only one retransmission its for-sure matching * :-) */ t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; if ((int)t <= 0) t = 1; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); if ((rsm->r_flags & RACK_TLP) && (!IN_RECOVERY(tp->t_flags))) { /* Segment was a TLP and our retrans matched */ if (rack->r_ctl.rc_tlp_cwnd_reduce) { rack->r_ctl.rc_rsm_start = tp->snd_max; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure * we send one packet. */ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } else rack->r_ctl.rc_tlp_rtx_out = 0; } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } return (1); } /* * We clear the soft/rxtshift since we got an ack. * There is no assurance we will call the commit() function * so we need to clear these to avoid incorrect handling. */ tp->t_rxtshift = 0; tp->t_softerror = 0; if ((to->to_flags & TOF_TS) && (ack_type == CUM_ACKED) && (to->to_tsecr) && ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { /* * Now which timestamp does it match? In this block the ACK * must be coming from a previous transmission. 
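 * Every recorded transmit time for this rsm is scanned; a tsecr matching anything but the
 * most recent send means the ACK is for an earlier copy, and rack_earlier_retran() is told
 * about the likely spurious retransmit.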
*/ for (i = 0; i < rsm->r_rtr_cnt; i++) { if (rsm->r_tim_lastsent[i] == to->to_tsecr) { t = cts - rsm->r_tim_lastsent[i]; if ((int)t <= 0) t = 1; if ((i + 1) < rsm->r_rtr_cnt) { /* Likely */ rack_earlier_retran(tp, rsm, t, cts); } if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } /* * Note the following calls to * tcp_rack_xmit_timer() are being commented * out for now. They give us no more accuracy * and often lead to a wrong choice. We have * enough samples that have not been * retransmitted. I leave the commented out * code in here in case in the future we * decide to add it back (though I can't forsee * doing that). That way we will easily see * where they need to be placed. */ if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } return (1); } } goto ts_not_found; } else { /* * Ok its a SACK block that we retransmitted. or a windows * machine without timestamps. We can tell nothing from the * time-stamp since its not there or the time the peer last * recieved a segment that moved forward its cum-ack point. */ ts_not_found: i = rsm->r_rtr_cnt - 1; t = cts - rsm->r_tim_lastsent[i]; if ((int)t <= 0) t = 1; if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { /* * We retransmitted and the ack came back in less * than the smallest rtt we have observed. We most * likey did an improper retransmit as outlined in * 4.2 Step 3 point 2 in the rack-draft. */ i = rsm->r_rtr_cnt - 2; t = cts - rsm->r_tim_lastsent[i]; rack_earlier_retran(tp, rsm, t, cts); } else if (rack->r_ctl.rc_rack_min_rtt) { /* * We retransmitted it and the retransmit did the * job. */ if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; rack->rc_rack_rtt = t; } return (1); } } return (0); } /* * Mark the SACK_PASSED flag on all entries prior to rsm send wise. */ static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm) { struct rack_sendmap *nrsm; uint32_t ts; int32_t idx; idx = rsm->r_rtr_cnt - 1; ts = rsm->r_tim_lastsent[idx]; nrsm = rsm; TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, rack_head, r_tnext) { if (nrsm == rsm) { /* Skip orginal segment he is acked */ continue; } if (nrsm->r_flags & RACK_ACKED) { /* Skip ack'd segments */ continue; } if (nrsm->r_flags & RACK_SACK_PASSED) { /* * We found one that is already marked * passed, we have been here before and * so all others below this are marked. */ break; } idx = nrsm->r_rtr_cnt - 1; if (ts == nrsm->r_tim_lastsent[idx]) { /* * For this case lets use seq no, if we sent in a * big block (TSO) we would have a bunch of segments * sent at the same time. * * We would only get a report if its SEQ is earlier. * If we have done multiple retransmits the times * would not be equal. 
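 * When the send times are equal we fall back to sequence order and
 * only mark segments that start before the newly sacked one.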
*/ if (SEQ_LT(nrsm->r_start, rsm->r_start)) { nrsm->r_flags |= RACK_SACK_PASSED; nrsm->r_flags &= ~RACK_WAS_SACKPASS; } } else { /* * Here they were sent at different times, not a big * block. Since we transmitted this one later and * see it sack'd then this must also be missing (or * we would have gotten a sack block for it) */ nrsm->r_flags |= RACK_SACK_PASSED; nrsm->r_flags &= ~RACK_WAS_SACKPASS; } } } static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) { int32_t idx; int32_t times = 0; uint32_t start, end, changed = 0; struct rack_sendmap *rsm, *nrsm; int32_t used_ref = 1; start = sack->start; end = sack->end; rsm = *prsm; if (rsm && SEQ_LT(start, rsm->r_start)) { TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { if (SEQ_GEQ(start, rsm->r_start) && SEQ_LT(start, rsm->r_end)) { goto do_rest_ofb; } } } if (rsm == NULL) { start_at_beginning: rsm = NULL; used_ref = 0; } /* First lets locate the block where this guy is */ TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { if (SEQ_GEQ(start, rsm->r_start) && SEQ_LT(start, rsm->r_end)) { break; } } do_rest_ofb: if (rsm == NULL) { /* * This happens when we get duplicate sack blocks with the * same end. For example SACK 4: 100 SACK 3: 100 The sort * will not change there location so we would just start at * the end of the first one and get lost. */ if (tp->t_flags & TF_SENTFIN) { /* * Check to see if we have not logged the FIN that * went out. */ nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { /* * Ok we did not get the FIN logged. */ nrsm->r_end++; rsm = nrsm; goto do_rest_ofb; } } if (times == 1) { #ifdef INVARIANTS panic("tp:%p rack:%p sack:%p to:%p prsm:%p", tp, rack, sack, to, prsm); #else goto out; #endif } times++; counter_u64_add(rack_sack_proc_restart, 1); goto start_at_beginning; } /* Ok we have an ACK for some piece of rsm */ if (rsm->r_start != start) { /* * Need to split this in two pieces the before and after. */ nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* * failed XXXrrs what can we do but loose the sack * info? */ goto out; } nrsm->r_start = start; nrsm->r_rtr_bytes = 0; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } rsm->r_end = nrsm->r_start; TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rsm = nrsm; } if (SEQ_GEQ(end, rsm->r_end)) { /* * The end of this block is either beyond this guy or right * at this guy. */ if ((rsm->r_flags & RACK_ACKED) == 0) { rack_update_rtt(tp, rack, rsm, to, cts, SACKED); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); rack_log_sack_passed(tp, rack, rsm); /* Is Reordering occuring? 
*/ if (rsm->r_flags & RACK_SACK_PASSED) { counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } } if (end == rsm->r_end) { /* This block only - done */ goto out; } /* There is more not coverend by this rsm move on */ start = rsm->r_end; nrsm = TAILQ_NEXT(rsm, r_next); rsm = nrsm; times = 0; goto do_rest_ofb; } /* Ok we need to split off this one at the tail */ nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* failed rrs what can we do but loose the sack info? */ goto out; } /* Clone it */ nrsm->r_start = end; nrsm->r_end = rsm->r_end; nrsm->r_rtr_bytes = 0; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_sndcnt = rsm->r_sndcnt; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } /* The sack block does not cover this guy fully */ rsm->r_flags &= (~RACK_HAS_FIN); rsm->r_end = end; TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } if (rsm->r_flags & RACK_ACKED) { /* Been here done that */ goto out; } rack_update_rtt(tp, rack, rsm, to, cts, SACKED); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); rack_log_sack_passed(tp, rack, rsm); /* Is Reordering occuring? */ if (rsm->r_flags & RACK_SACK_PASSED) { counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } out: if (rsm && (rsm->r_flags & RACK_ACKED)) { /* * Now can we merge this newly acked * block with either the previous or * next block? */ nrsm = TAILQ_NEXT(rsm, r_next); if (nrsm && (nrsm->r_flags & RACK_ACKED)) { /* yep this and next can be merged */ rsm = rack_merge_rsm(rack, rsm, nrsm); } /* Now what about the previous? */ nrsm = TAILQ_PREV(rsm, rack_head, r_next); if (nrsm && (nrsm->r_flags & RACK_ACKED)) { /* yep the previous and this can be merged */ rsm = rack_merge_rsm(rack, nrsm, rsm); } } if (used_ref == 0) { counter_u64_add(rack_sack_proc_all, 1); } else { counter_u64_add(rack_sack_proc_short, 1); } /* Save off where we last were */ if (rsm) rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); else rack->r_ctl.rc_sacklast = NULL; *prsm = rsm; return (changed); } static void inline rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) { struct rack_sendmap *tmap; tmap = NULL; while (rsm && (rsm->r_flags & RACK_ACKED)) { /* Its no longer sacked, mark it so */ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); #ifdef INVARIANTS if (rsm->r_in_tmap) { panic("rack:%p rsm:%p flags:0x%x in tmap?", rack, rsm, rsm->r_flags); } #endif rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); /* Rebuild it into our tmap */ if (tmap == NULL) { TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); tmap = rsm; } else { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); tmap = rsm; } tmap->r_in_tmap = 1; rsm = TAILQ_NEXT(rsm, r_next); } /* * Now lets possibly clear the sack filter so we start * recognizing sacks that cover this area. 
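 * Since the peer reneged, whatever the filter remembers above th_ack
 * may describe data the peer no longer acknowledges.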
*/ if (rack_use_sack_filter) sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); } static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) { uint32_t changed, last_seq, entered_recovery = 0; struct tcp_rack *rack; struct rack_sendmap *rsm; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; register uint32_t th_ack; int32_t i, j, k, num_sack_blks = 0; uint32_t cts, acked, ack_point, sack_changed = 0; INP_WLOCK_ASSERT(tp->t_inpcb); if (th->th_flags & TH_RST) { /* We don't log resets */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; cts = tcp_ts_getticks(); rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); changed = 0; th_ack = th->th_ack; if (SEQ_GT(th_ack, tp->snd_una)) { rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); tp->t_acktime = ticks; } if (rsm && SEQ_GT(th_ack, rsm->r_start)) changed = th_ack - rsm->r_start; if (changed) { /* * The ACK point is advancing to th_ack, we must drop off * the packets in the rack log and calculate any eligble * RTT's. */ rack->r_wanted_output++; more: rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); if (rsm == NULL) { if ((th_ack - 1) == tp->iss) { /* * For the SYN incoming case we will not * have called tcp_output for the sending of * the SYN, so there will be no map. All * other cases should probably be a panic. */ goto proc_sack; } if (tp->t_flags & TF_SENTFIN) { /* if we send a FIN we will not hav a map */ goto proc_sack; } #ifdef INVARIANTS panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", tp, th, tp->t_state, rack, tp->snd_una, tp->snd_max, tp->snd_nxt, changed); #endif goto proc_sack; } if (SEQ_LT(th_ack, rsm->r_start)) { /* Huh map is missing this */ #ifdef INVARIANTS printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", rsm->r_start, th_ack, tp->t_state, rack->r_state); #endif goto proc_sack; } rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); /* Now do we consume the whole thing? */ if (SEQ_GEQ(th_ack, rsm->r_end)) { /* Its all consumed. */ uint32_t left; rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } if (rack->r_ctl.rc_next == rsm) { /* scoot along the marker */ rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map); } if (rsm->r_flags & RACK_ACKED) { /* * It was acked on the scoreboard -- remove * it from total */ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); } else if (rsm->r_flags & RACK_SACK_PASSED) { /* * There are acked segments ACKED on the * scoreboard further up. We are seeing * reordering. */ counter_u64_add(rack_reorder_seen, 1); rsm->r_flags |= RACK_ACKED; rack->r_ctl.rc_reorder_ts = cts; } left = th_ack - rsm->r_end; if (rsm->r_rtr_cnt > 1) { /* * Technically we should make r_rtr_cnt be * monotonicly increasing and just mod it to * the timestamp it is replacing.. that way * we would have the last 3 retransmits. Now * rc_loss_count will be wrong if we * retransmit something more than 2 times in * recovery :( */ rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); } /* Free back to zone */ rack_free(rack, rsm); if (left) { goto more; } goto proc_sack; } if (rsm->r_flags & RACK_ACKED) { /* * It was acked on the scoreboard -- remove it from * total for the part being cum-acked. 
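 * Only the prefix below th_ack leaves rc_sacked; the rsm itself stays
 * on the map with r_start advanced to th_ack.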
*/ rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); } rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; rsm->r_start = th_ack; } proc_sack: /* Check for reneging */ rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { /* * The peer has moved snd_una up to * the edge of this send, i.e. one * that it had previously acked. The only * way that can be true if the peer threw * away data (space issues) that it had * previously sacked (else it would have * given us snd_una up to (rsm->r_end). * We need to undo the acked markings here. * * Note we have to look to make sure th_ack is * our rsm->r_start in case we get an old ack * where th_ack is behind snd_una. */ rack_peer_reneges(rack, rsm, th->th_ack); } if ((to->to_flags & TOF_SACK) == 0) { /* We are done nothing left to log */ goto out; } rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); if (rsm) { last_seq = rsm->r_end; } else { last_seq = tp->snd_max; } /* Sack block processing */ if (SEQ_GT(th_ack, tp->snd_una)) ack_point = th_ack; else ack_point = tp->snd_una; for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, ack_point) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, ack_point) && SEQ_LEQ(sack.end, tp->snd_max)) { if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) && (SEQ_LT(sack.end, last_seq)) && ((sack.end - sack.start) < (tp->t_maxseg / 8))) { /* * Not the last piece and its smaller than * 1/8th of a MSS. We ignore this. */ counter_u64_add(rack_runt_sacks, 1); continue; } sack_blocks[num_sack_blks] = sack; num_sack_blks++; } else if (SEQ_LEQ(sack.start, th_ack) && SEQ_LEQ(sack.end, th_ack)) { /* * Its a D-SACK block. */ /* tcp_record_dsack(sack.start, sack.end); */ } } if (num_sack_blks == 0) goto out; /* * Sort the SACK blocks so we can update the rack scoreboard with * just one pass. */ if (rack_use_sack_filter) { num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); } if (num_sack_blks < 2) { goto do_sack_work; } /* Sort the sacks */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { sack = sack_blocks[i]; sack_blocks[i] = sack_blocks[j]; sack_blocks[j] = sack; } } } /* * Now are any of the sack block ends the same (yes some * implememtations send these)? */ again: if (num_sack_blks > 1) { for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (sack_blocks[i].end == sack_blocks[j].end) { /* * Ok these two have the same end we * want the smallest end and then * throw away the larger and start * again. 
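 * For example [5,10] and [7,10] collapse to [5,10]; the duplicate is
 * dropped by sliding the remaining blocks down one slot.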
*/ if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { /* * The second block covers * more area use that */ sack_blocks[i].start = sack_blocks[j].start; } /* * Now collapse out the dup-sack and * lower the count */ for (k = (j + 1); k < num_sack_blks; k++) { sack_blocks[j].start = sack_blocks[k].start; sack_blocks[j].end = sack_blocks[k].end; j++; } num_sack_blks--; goto again; } } } } do_sack_work: rsm = rack->r_ctl.rc_sacklast; for (i = 0; i < num_sack_blks; i++) { acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts); if (acked) { rack->r_wanted_output++; changed += acked; sack_changed += acked; } } out: if (changed) { /* Something changed cancel the rack timer */ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); } if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { /* * Ok we have a high probability that we need to go in to * recovery since we have data sack'd */ struct rack_sendmap *rsm; uint32_t tsused; tsused = tcp_ts_getticks(); rsm = tcp_rack_output(tp, rack, tsused); if (rsm) { /* Enter recovery */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; entered_recovery = 1; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure we send * one packet. */ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; rack->r_timer_override = 1; } } if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { /* Deal with changed an PRR here (in recovery only) */ uint32_t pipe, snd_una; rack->r_ctl.rc_prr_delivered += changed; /* Compute prr_sndcnt */ if (SEQ_GT(tp->snd_una, th_ack)) { snd_una = tp->snd_una; } else { snd_una = th_ack; } pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; if (pipe > tp->snd_ssthresh) { long sndcnt; sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; if (rack->r_ctl.rc_prr_recovery_fs > 0) sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; else { rack->r_ctl.rc_prr_sndcnt = 0; sndcnt = 0; } sndcnt++; if (sndcnt > (long)rack->r_ctl.rc_prr_out) sndcnt -= rack->r_ctl.rc_prr_out; else sndcnt = 0; rack->r_ctl.rc_prr_sndcnt = sndcnt; } else { uint32_t limit; if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); else limit = 0; if (changed > limit) limit = changed; limit += tp->t_maxseg; if (tp->snd_ssthresh > pipe) { rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); } else { rack->r_ctl.rc_prr_sndcnt = min(0, limit); } } if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { rack->r_timer_override = 1; } } } /* * Return value of 1, we do not need to call rack_process_data(). * return value of 0, rack_process_data can be called. * For ret_val if its 0 the TCP is locked, if its non-zero * its unlocked and probably unsafe to touch the TCB. 
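 * *ofia, when non-NULL, is set if this ack also covered our FIN.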
*/ static int rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val) { int32_t ourfinisacked = 0; int32_t nsegs, acked_amount; int32_t acked; struct mbuf *mfree; struct tcp_rack *rack; int32_t recovery = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; if (SEQ_GT(th->th_ack, tp->snd_max)) { rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); return (1); } if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { rack_log_ack(tp, to, th); } if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* * Old ack, behind (or duplicate to) the last one rcv'd * Note: Should mark reordering is occuring! We should also * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, * 3-3, 4-4 would be reording. As well as ack 1, 3-3 ack 3 */ return (0); } /* * If we reach this point, ACK is not a duplicate, i.e., it ACKs * something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our SYN has * been ACK'd (so connection is now fully synchronized). Go * to non-starred state, increment snd_una for ACK of SYN, * and check if we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } nsegs = max(1, m->m_pkthdr.lro_nsegs); INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK arrives * within our recovery window, then it was a mistake to do the * retransmit in the first place. Recover our original cwnd and * ssthresh, and proceed to transmit where we left off. */ if (tp->t_flags & TF_PREVVALID) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) rack_cong_signal(tp, th, CC_RTO_ERR); } /* * If we have a timestamp reply, update smoothed round trip time. If * no timestamp is present but transmit timer is running and timed * sequence number was acked, update smoothed round trip time. Since * we now have an rtt measurement, cancel the timer backoff (cf., * Phil Karn's retransmit alg.). Recompute the initial retransmit * timer. * * Some boxes send broken timestamp replies during the SYN+ACK * phase, ignore timestamps of 0 or we could calculate a huge RTT * and blow up the retransmit timer. */ /* * If all outstanding data is acked, stop retransmit timer and * remember to restart (more output or persist). If there is more * data to be acked, restart retransmit timer, using current * (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); rack->r_wanted_output++; } /* * If no data (only SYN) was ACK'd, skip rest of ACK processing. */ if (acked == 0) { if (ofia) *ofia = ourfinisacked; return (0); } if (rack->r_ctl.rc_early_recovery) { if (IN_RECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover) && (SEQ_LT(th->th_ack, tp->snd_max))) { tcp_rack_partialack(tp, th); } else { rack_post_recovery(tp, th); recovery = 1; } } } /* * Let the congestion control algorithm update congestion control * related information. This typically means increasing the * congestion window. 
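 * recovery is non-zero only when rack_post_recovery() was just run for
 * this ack in the early-recovery path above.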
*/ rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); SOCKBUF_LOCK(&so->so_snd); acked_amount = min(acked, (int)sbavail(&so->so_snd)); tp->snd_wnd -= acked_amount; mfree = sbcut_locked(&so->so_snd, acked_amount); if ((sbused(&so->so_snd) == 0) && (acked > acked_amount) && (tp->t_state >= TCPS_FIN_WAIT_1)) { ourfinisacked = 1; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); if (rack->r_ctl.rc_early_recovery == 0) { if (IN_RECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover) && (SEQ_LT(th->th_ack, tp->snd_max))) { tcp_rack_partialack(tp, th); } else { rack_post_recovery(tp, th); } } } tp->snd_una = th->th_ack; if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { tp->snd_nxt = tp->snd_una; } if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); /* Set need output so persist might get set */ rack->r_wanted_output++; if (rack_use_sack_filter) sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); if ((tp->t_state >= TCPS_FIN_WAIT_1) && (sbavail(&so->so_snd) == 0) && (tp->t_flags2 & TF2_DROP_AF_DATA)) { /* * The socket was gone and the * peer sent data, time to * reset him. */ *ret_val = 1; tp = tcp_close(tp); rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); return (1); } } if (ofia) *ofia = ourfinisacked; return (0); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { /* * Update window information. Don't look at window if no ACK: TAC's * send garbage on first SYN. */ int32_t nsegs; #ifdef TCP_RFC7413 int32_t tfo_syn; #else #define tfo_syn (FALSE) #endif struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); nsegs = max(1, m->m_pkthdr.lro_nsegs); if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; rack->r_wanted_output++; } else if (thflags & TH_ACK) { if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; } } /* Was persist timer active and now we have window space? */ if ((rack->rc_in_persist != 0) && tp->snd_wnd) { rack_exit_persist(tp, rack); tp->snd_nxt = tp->snd_max; /* Make sure we output to start the timer */ rack->r_wanted_output++; } if (tp->t_flags2 & TF2_DROP_AF_DATA) { m_freem(m); return (0); } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept random * urgent pointers, we'll crash in soreceive. It's hard to * imagine someone actually wanting to send this much urgent * data. 
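 * So if the urgent offset plus what is already queued would exceed
 * sb_max we simply strip the URG and fall through to normal data
 * processing.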
*/ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, then * mark the data stream. This should not happen in * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a * FIN has been received from the remote side. In these * states we ignore the URG. * * According to RFC961 (Assigned Protocols), the urgent * pointer points to the last octet of urgent data. We * continue, however, to consider it to indicate the first * octet of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t) tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, pull receive urgent * pointer along with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing * queue, and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data is * presented to the user (this happens in tcp_usrreq.c, case * PRU_RCVD). If a FIN has already been received on this connection * then we just ignore the text. */ #ifdef TCP_RFC7413 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_flags & TF_FASTOPEN)); #endif if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_rnxt = tp->rcv_nxt; int save_tlen = tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly * queue with control block tp. Set thflags to whether * reassembly now includes a segment with FIN. This handles * the common case inline (segment is the next to be * received on an established connection, and the queue is * empty), avoiding linkage into and removal from the queue * and repetition of various conversions. Set DELACK for * segments received in order, but ack immediately when * segments are out of order (so fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { if (DELAY_ACK(tp, tlen) || tfo_syn) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { rack->r_wanted_output++; tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. 
Fortunately * m_adj() doesn't actually frees any mbufs when * trimming from the head. */ tcp_seq temp = save_start; thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { /* * DSACK actually handled in the fastpath * above. */ tcp_update_sack_list(tp, save_start, save_start + save_tlen); } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { if ((tp->rcv_numsacks >= 1) && (tp->sackblks[0].end == save_start)) { /* * Partial overlap, recorded at todrop * above. */ tcp_update_sack_list(tp, tp->sackblks[0].start, tp->sackblks[0].end); } else { tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } } else if (tlen >= save_tlen) { /* Update of sackblks. */ tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } else if (tlen > 0) { tcp_update_dsack_list(tp, save_start, save_start + tlen); } } } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know that the * connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized (ie NEEDSYN * flag on) then delay ACK, so it may be piggybacked * when SYN is sent. Otherwise, since we received a * FIN then no more input can be expected, send ACK * now. */ if (tp->t_flags & TF_NEEDSYN) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES enter the * CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been * acked so enter the CLOSING state. */ case TCPS_FIN_WAIT_1: rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the * other standard timers. */ case TCPS_FIN_WAIT_2: rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); return (1); } } /* * Return any desired output. */ if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { rack->r_wanted_output++; } INP_WLOCK_ASSERT(tp->t_inpcb); return (0); } /* * Here nothing is really faster, its just that we * have broken out the fast-data path also just like * the fast-ack. */ static int rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt) { int32_t nsegs; int32_t newsize = 0; /* automatic sockbuf scaling */ struct tcp_rack *rack; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. NOTE that the test is modified according to the * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
*/ if (__predict_false(th->th_seq != tp->rcv_nxt)) { return (0); } if (__predict_false(tp->snd_nxt != tp->snd_max)) { return (0); } if (tiwin && tiwin != tp->snd_wnd) { return (0); } if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { return (0); } if (__predict_false((to->to_flags & TOF_TS) && (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { return (0); } if (__predict_false((th->th_ack != tp->snd_una))) { return (0); } if (__predict_false(tlen > sbspace(&so->so_rcv))) { return (0); } if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } rack = (struct tcp_rack *)tp->t_fb_ptr; /* * This is a pure, in-sequence data packet with nothing on the * reassembly queue and we have enough buffer space to take it. */ nsegs = max(1, m->m_pkthdr.lro_nsegs); /* Clean receiver SACK report if present */ if (tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. Give up when limit is * reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); rack_calc_rwin(so, tp); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; rack->r_wanted_output++; } if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); return (1); } /* * This subfunction is used to try to highly optimize the * fast path. We again allow window updates that are * in sequence to remain in the fast-path. We also add * in the __predict's to attempt to help the compiler. * Note that if we return a 0, then we can *not* process * it and the caller should push the packet into the * slow-path. */ static int rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) { int32_t acked; int32_t nsegs; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif struct tcp_rack *rack; if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* Old ack, behind (or duplicate to) the last one rcv'd */ return (0); } if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { /* Above what we have sent? */ return (0); } if (__predict_false(tp->snd_nxt != tp->snd_max)) { /* We are retransmitting */ return (0); } if (__predict_false(tiwin == 0)) { /* zero window */ return (0); } if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { /* We need a SYN or a FIN, unlikely.. 
*/ return (0); } if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { /* Timestamp is behind .. old ack with seq wrap? */ return (0); } if (__predict_false(IN_RECOVERY(tp->t_flags))) { /* Still recovering */ return (0); } rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->r_ctl.rc_sacked) { /* We have sack holes on our scoreboard */ return (0); } /* Ok if we reach here, we can process a fast-ack */ nsegs = max(1, m->m_pkthdr.lro_nsegs); rack_log_ack(tp, to, th); /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { rack_exit_persist(tp, rack); } /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. NOTE that the test is modified according to the * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * This is a pure ack for outstanding data. */ TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_flags & TF_PREVVALID) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) rack_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies during the SYN+ACK * phase, ignore timestamps of 0 or we could calculate a huge RTT * and blow up the retransmit timer. */ acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); /* * Let the congestion control algorithm update congestion control * related information. This typically means increasing the * congestion window. */ rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); /* ND6_HINT(tp); *//* Some progress has been made. */ /* * If all outstanding data are acked, stop retransmit timer, * otherwise restart timer using current (possibly backed-off) * value. If process is waiting for space, wakeup/selwakeup/signal. * If data are ready to send, let tcp_output decide between more * output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp->snd_una == tp->snd_max) { rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); } /* Wake up the socket if we have room to write more */ sowwakeup(so); if (sbavail(&so->so_snd)) { rack->r_wanted_output++; } return (1); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t todrop; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); /* * If the state is SYN_SENT: if seg contains an ACK, but not for our * SYN, drop the input. if seg contains a RST, then drop the * connection. 
if seg does not contain SYN, then drop it. Otherwise * this is an acceptable SYN segment initialize tp->rcv_nxt and * tp->irs if seg contains ack then advance tp->snd_una if seg * contains an ECE and ECN support is enabled, the stream is ECN * capable. if SYN has been acked change to ESTABLISHED else * SYN_RCVD state arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); rack_do_drop(m, tp); return (1); } if (thflags & TH_RST) { rack_do_drop(m, tp); return (1); } if (!(thflags & TH_SYN)) { rack_do_drop(m, tp); return (1); } tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); /* * If there's data, delay ACK; if there's also a FIN ACKNOW * will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0) { rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; tp->t_flags |= TF_ACKNOW; } if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } /* * Received in SYN_SENT[*] state. Transitions: * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, mtod(m, const char *), tp, th); cc_conn_init(tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => simultaneous * open. If segment contains CC option and there is a * cached CC, apply TAO test. If it succeeds, connection is * * half-synchronized. Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If * there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_state_change(tp, TCPS_SYN_RECEIVED); } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. If data, * trim to stay within window, dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. If the * remote host used T/TCP to validate the SYN, our data will be * ACK'd; if so, enter normal data segment processing in the middle * of step 5, ack processing. Otherwise, goto step 6. 
*/ if (thflags & TH_ACK) { if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) return (ret_val); /* We may have changed to FIN_WAIT_1 above */ if (tp->t_state == TCPS_FIN_WAIT_1) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now * acknowledged then enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then * closing user can proceed. Starting the * timer is contrary to the specification, * but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and * use a compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } #ifdef TCP_RFC7413 if (tp->t_flags & TF_FASTOPEN) { /* * When a TFO connection is in SYN_RECEIVED, the only valid * packets are the initial SYN, a retransmit/copy of the * initial SYN (possibly with a subset of the original * data), a valid ACK, a FIN, or a RST. */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { rack_do_drop(m, NULL); return (0); } } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { rack_do_drop(m, NULL); return (0); } } #endif if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know the * sequence numbers haven't wrapped. This is a partial fix for the * "LAND" DoS attack. */ if (SEQ_LT(th->th_seq, tp->irs)) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 
2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { #ifdef TCP_RFC7413 if (tp->t_flags & TF_FASTOPEN) { tp->snd_wnd = tiwin; cc_conn_init(tp); } #endif return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> * FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); #ifdef TCP_RFC7413 if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; /* * Account for the ACK of our SYN prior to regular * ACK processing below. */ tp->snd_una++; } /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such connections * is not harmless as it would undo the snd_cwnd reduction * that occurs when a TFO SYN|ACK is retransmitted. */ if (!(tp->t_flags & TF_FASTOPEN)) #endif cc_conn_init(tp); } /* * If segment contains data or ACK, will call tcp_reass() later; if * not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (tp->t_state == TCPS_FIN_WAIT_1) { /* We could have went to FIN_WAIT_1 (or EST) above */ /* * In FIN_WAIT_1 STATE in addition to the processing for the * ESTABLISHED state if our FIN is now acknowledged then * enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then closing * user can proceed. Starting the timer is contrary * to the specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. 
*/ static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; /* * Header prediction: check for the two common cases of a * uni-directional data xfer. If the packet has no control flags, * is in-sequence, the window didn't change and we're not * retransmitting, it's a candidate. If the length is zero and the * ack moved forward, we're the sender side of the xfer. Just free * the data acked & wake any higher level process that was blocked * waiting for space. If the length is non-zero and the ack didn't * move, we're the receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data toc The socket * buffer and note that we need a delayed ack. Make sure that the * hidden state-flags are also off. Since we check for * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. */ if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && __predict_true(SEGQ_EMPTY(tp)) && __predict_true(th->th_seq == tp->rcv_nxt)) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if (tlen == 0) { if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { return (0); } } else { if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, tiwin, nxt_pkt)) { return (0); } } } rack_calc_rwin(so, tp); if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. 
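 * Queueing here means handing the segment to rack_process_data();
 * otherwise the un-ACKed segment goes to rack_do_dropafterack() when
 * we owe an ack, or is simply dropped.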
*/ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } /* State changes only happen in rack_process_data() */ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * Ack processing. 
*/ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } static int rack_check_data_after_close(struct mbuf *m, struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) { struct tcp_rack *rack; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->rc_allow_data_af_clo == 0) { close_now: tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); return (1); } if (sbavail(&so->so_snd) == 0) goto close_now; /* Ok we allow data that is ignored and a followup reset */ tp->rcv_nxt = th->th_seq + *tlen; tp->t_flags2 |= TF2_DROP_AF_DATA; rack->r_wanted_output = 1; *tlen = 0; return (0); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. 
*/ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { /* * If we can't receive any more data, then closing user can * proceed. Starting the timer is contrary to the * specification, but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. 
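 * In short: ts_recent is only updated when the last ACK we sent falls
 * within this segment's sequence span, with a SYN or FIN counting as one
 * extra sequence unit.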
*/ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); m_freem(m); return (1); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. 
*/ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); return (ret_val); } else { rack_do_drop(m, NULL); return (0); } } /* * case TCPS_LAST_ACK: Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); rack_do_drop(m, tp); return (1); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { int32_t ret_val = 0; int32_t ourfinisacked = 0; rack_calc_rwin(so, tp); /* Reset receive buffer auto scaling when not in bulk receive mode. */ if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { rack_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. 
 */
    if ((to->to_flags & TOF_TS) != 0 &&
        SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
        SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
        ((thflags & (TH_SYN | TH_FIN)) != 0))) {
        tp->ts_recent_age = tcp_ts_getticks();
        tp->ts_recent = to->to_tsval;
    }
    /*
     * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
     * is on (half-synchronized state), then queue data for later
     * processing; else drop segment and return.
     */
    if ((thflags & TH_ACK) == 0) {
        if (tp->t_flags & TF_NEEDSYN) {
            return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
                tiwin, thflags, nxt_pkt));
        } else if (tp->t_flags & TF_ACKNOW) {
            rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
            return (ret_val);
        } else {
            rack_do_drop(m, NULL);
            return (0);
        }
    }
    /*
     * Ack processing.
     */
    if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked,
        thflags, &ret_val)) {
        return (ret_val);
    }
    if (sbavail(&so->so_snd)) {
        if (rack_progress_timeout_check(tp)) {
            tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
            rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
            return (1);
        }
    }
    return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin,
        thflags, nxt_pkt));
}

static void inline
rack_clear_rate_sample(struct tcp_rack *rack)
{
    rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
    rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
    rack->r_ctl.rack_rs.rs_rtt_tot = 0;
}

static int
rack_init(struct tcpcb *tp)
{
    struct tcp_rack *rack = NULL;

    tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
    if (tp->t_fb_ptr == NULL) {
        /*
         * We need to allocate memory here but can't: the INP and
         * INP_INFO locks are already held and they are recursive
         * (this happens during setup), so a scheme that drops the
         * locks around the allocation does not work. :(
         */
        return (ENOMEM);
    }
    memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));

    rack = (struct tcp_rack *)tp->t_fb_ptr;
    TAILQ_INIT(&rack->r_ctl.rc_map);
    TAILQ_INIT(&rack->r_ctl.rc_free);
    TAILQ_INIT(&rack->r_ctl.rc_tmap);
    rack->rc_tp = tp;
    if (tp->t_inpcb) {
        rack->rc_inp = tp->t_inpcb;
    }
    /* Probably not needed but let's be sure */
    rack_clear_rate_sample(rack);
    rack->r_cpu = 0;
    rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
    rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
    rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
    rack->rc_pace_reduce = rack_slot_reduction;
    if (V_tcp_delack_enabled)
        tp->t_delayed_ack = 1;
    else
        tp->t_delayed_ack = 0;
    rack->rc_pace_max_segs = rack_hptsi_segments;
    rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
    rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
    rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
    rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
    rack->r_idle_reduce_largest = rack_reduce_largest_on_idle;
    rack->r_enforce_min_pace = rack_min_pace_time;
    rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
    rack->r_ctl.rc_prop_rate = rack_proportional_rate;
    rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
    rack->r_ctl.rc_early_recovery = rack_early_recovery;
    rack->rc_always_pace = rack_pace_every_seg;
    rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
    rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
    rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
    rack->r_ctl.rc_min_to = rack_min_to;
    rack->r_ctl.rc_prr_inc_var = rack_inc_var;
    if (tp->snd_una != tp->snd_max) {
        /* Create a send map for the current outstanding data */
        struct rack_sendmap *rsm;

        rsm = rack_alloc(rack);
        if (rsm == NULL) {
            uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
            tp->t_fb_ptr = NULL;
            return (ENOMEM);
        }
        rsm->r_flags = RACK_OVERMAX;
        rsm->r_tim_lastsent[0] = tcp_ts_getticks();
        rsm->r_rtr_cnt = 1;
        rsm->r_rtr_bytes = 0;
        rsm->r_start = tp->snd_una;
        rsm->r_end = tp->snd_max;
        rsm->r_sndcnt = 0;
        TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
        TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
        rsm->r_in_tmap = 1;
    }
    rack_stop_all_timers(tp);
    rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
    return (0);
}

static int
rack_handoff_ok(struct tcpcb *tp)
{
    if ((tp->t_state == TCPS_CLOSED) ||
        (tp->t_state == TCPS_LISTEN)) {
        /* Sure, no problem, though it may not stick */
        return (0);
    }
    if ((tp->t_state == TCPS_SYN_SENT) ||
        (tp->t_state == TCPS_SYN_RECEIVED)) {
        /*
         * We really can't tell yet; you have to get to ESTAB
         * or beyond to know.
         */
        return (EAGAIN);
    }
    if (tp->t_flags & TF_SACK_PERMIT) {
        return (0);
    }
    /*
     * If we reach here we don't do SACK on this connection so we can
     * never do rack.
     */
    return (EINVAL);
}

static void
rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
{
    if (tp->t_fb_ptr) {
        struct tcp_rack *rack;
        struct rack_sendmap *rsm;

        rack = (struct tcp_rack *)tp->t_fb_ptr;
#ifdef TCP_BLACKBOX
        tcp_log_flowend(tp);
#endif
        rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
        while (rsm) {
            TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
            uma_zfree(rack_zone, rsm);
            rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
        }
        rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
        while (rsm) {
            TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
            uma_zfree(rack_zone, rsm);
            rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
        }
        rack->rc_free_cnt = 0;
        uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
        tp->t_fb_ptr = NULL;
    }
    /* Make sure snd_nxt is correctly set */
    tp->snd_nxt = tp->snd_max;
}

static void
rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
{
    switch (tp->t_state) {
    case TCPS_SYN_SENT:
        rack->r_state = TCPS_SYN_SENT;
        rack->r_substate = rack_do_syn_sent;
        break;
    case TCPS_SYN_RECEIVED:
        rack->r_state = TCPS_SYN_RECEIVED;
        rack->r_substate = rack_do_syn_recv;
        break;
    case TCPS_ESTABLISHED:
        rack->r_state = TCPS_ESTABLISHED;
        rack->r_substate = rack_do_established;
        break;
    case TCPS_CLOSE_WAIT:
        rack->r_state = TCPS_CLOSE_WAIT;
        rack->r_substate = rack_do_close_wait;
        break;
    case TCPS_FIN_WAIT_1:
        rack->r_state = TCPS_FIN_WAIT_1;
        rack->r_substate = rack_do_fin_wait_1;
        break;
    case TCPS_CLOSING:
        rack->r_state = TCPS_CLOSING;
        rack->r_substate = rack_do_closing;
        break;
    case TCPS_LAST_ACK:
        rack->r_state = TCPS_LAST_ACK;
        rack->r_substate = rack_do_lastack;
        break;
    case TCPS_FIN_WAIT_2:
        rack->r_state = TCPS_FIN_WAIT_2;
        rack->r_substate = rack_do_fin_wait_2;
        break;
    case TCPS_LISTEN:
    case TCPS_CLOSED:
    case TCPS_TIME_WAIT:
    default:
        break;
    };
}

static void
rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
{
    /*
     * We received an ack, and then did not call send or were bounced
     * out because the hpts was running. Now a timer is up as well, is
     * it the right timer?
     */
    struct rack_sendmap *rsm;
    int tmr_up;

    tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
    if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
        return;
    rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
    if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
        (tmr_up == PACE_TMR_RXT)) {
        /* Should be an RXT */
        return;
    }
    if (rsm == NULL) {
        /* Nothing outstanding? */
        if (tp->t_flags & TF_DELACK) {
            if (tmr_up == PACE_TMR_DELACK)
                /* We are supposed to have delayed ack up and we do */
                return;
        } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
            (tmr_up == PACE_TMR_RXT)) {
            /*
             * If we hit ENOBUFS then we would expect the possibility
             * of nothing outstanding and the RXT up (and the hptsi
             * timer).
*/ return; } else if (((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) && (tmr_up == PACE_TMR_KEEP) && (tp->snd_max == tp->snd_una)) { /* We should have keep alive up and we do */ return; } } if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & RACK_HAS_FIN)) { /* needs to be a RXT */ if (tmr_up == PACE_TMR_RXT) return; } else if (tmr_up == PACE_TMR_RACK) return; } else if (SEQ_GT(tp->snd_max,tp->snd_una) && ((tmr_up == PACE_TMR_TLP) || (tmr_up == PACE_TMR_RXT))) { /* * Either a TLP or RXT is fine if no sack-passed * is in place and data is outstanding. */ return; } else if (tmr_up == PACE_TMR_DELACK) { /* * If the delayed ack was going to go off * before the rtx/tlp/rack timer were going to * expire, then that would be the timer in control. * Note we don't check the time here trusting the * code is correct. */ return; } /* * Ok the timer originally started is not what we want now. * We will force the hpts to be stopped if any, and restart * with the slot set to what was in the saved slot. */ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); } static void rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, struct timeval *tv) { int32_t thflags, retval, did_out = 0; int32_t way_out = 0; uint32_t cts; uint32_t tiwin; struct tcpopt to; struct tcp_rack *rack; struct rack_sendmap *rsm; int32_t prev_state = 0; cts = tcp_tv_to_mssectick(tv); rack = (struct tcp_rack *)tp->t_fb_ptr; kern_prefetch(rack, &prev_state); prev_state = 0; thflags = th->th_flags; /* * If this is either a state-changing packet or current state isn't * established, we require a read lock on tcbinfo. Otherwise, we * allow the tcbinfo to be in either locked or unlocked, as the * caller may have unnecessarily acquired a lock due to a race. */ INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, &log, true); } /* * Segment received on connection. Reset idle time and keep-alive * timer. XXX: This should be done after segment validation to * ignore broken/spoofed segs. */ if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { counter_u64_add(rack_input_idle_reduces, 1); rack_cc_after_idle(tp, (rack->r_idle_reduce_largest ? 1 :0)); } } rack->r_ctl.rc_rcvtime = cts; tp->t_rcvtime = ticks; /* * Unscale the window into a 32-bit value. For the SYN_SENT state * the scale is zero. */ tiwin = th->th_win << tp->snd_scale; #ifdef NETFLIX_STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif /* * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move * this to occur after we've validated the segment. 
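 * The ECN block below clears our pending ECE echo when the peer signals
 * CWR, latches TF_ECN_SND_ECE on an IP-level CE mark, and treats an
 * incoming ECE as a congestion signal (CC_ECN).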
*/ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Congestion experienced. */ if (thflags & TH_ECE) { rack_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, fall back to * non RFC1323 RTT calculation. Normalize timestamp if syncookies * were used when this connection was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, cts)) to.to_tsecr = 0; } /* * If its the first time in we need to take care of options and * verify we can do SACK for rack! */ if (rack->r_state == 0) { /* Should be init'd by rack_init() */ KASSERT(rack->rc_inp != NULL, ("%s: rack->rc_inp unexpectedly NULL", __func__)); if (rack->rc_inp == NULL) { rack->rc_inp = tp->t_inpcb; } /* * Process options only when we get SYN/ACK back. The SYN * case for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. XXX * this is traditional behavior, may need to be cleaned up. */ rack->r_cpu = inp_to_cpuid(tp->t_inpcb); if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with the * next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = cts; } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * At this point we are at the initial call. Here we decide * if we are doing RACK or not. We do this by seeing if * TF_SACK_PERMIT is set, if not rack is *not* possible and * we switch to the default code. */ if ((tp->t_flags & TF_SACK_PERMIT) == 0) { tcp_switch_back_to_default(tp); (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, tlen, iptos); return; } /* Set the flag */ rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; tcp_set_hpts(tp->t_inpcb); sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); } /* * This is the one exception case where we set the rack state * always. All other times (timers etc) we must have a rack-state * set (so we assure we have done the checks above for SACK). */ if (rack->r_state != tp->t_state) rack_set_state(tp, rack); if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) kern_prefetch(rsm, &prev_state); prev_state = rack->r_state; rack->r_ctl.rc_tlp_send_cnt = 0; rack_clear_rate_sample(rack); retval = (*rack->r_substate) (m, th, so, tp, &to, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt); #ifdef INVARIANTS if ((retval == 0) && (tp->t_inpcb == NULL)) { panic("retval:%d tp:%p t_inpcb:NULL state:%d", retval, tp, prev_state); } #endif if (retval == 0) { /* * If retval is 1 the tcb is unlocked and most likely the tp * is gone. 
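 * With retval == 0 the TCB is still locked, so we can commit the RTT
 * samples, run output and the hpts timer as needed, and release the
 * lock ourselves at the end of this block.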
*/ INP_WLOCK_ASSERT(tp->t_inpcb); tcp_rack_xmit_timer_commit(rack, tp); if (nxt_pkt == 0) { if (rack->r_wanted_output != 0) { did_out = 1; (void)tp->t_fb->tfb_tcp_output(tp); } rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); } if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && (SEQ_GT(tp->snd_max, tp->snd_una) || (tp->t_flags & TF_DELACK) || ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)))) { /* We could not send (probably in the hpts but stopped the timer earlier)? */ if ((tp->snd_max == tp->snd_una) && ((tp->t_flags & TF_DELACK) == 0) && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* keep alive not needed if we are hptsi output yet */ ; } else { if (rack->rc_inp->inp_in_hpts) tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); } way_out = 1; } else { /* Do we have the correct timer running? */ rack_timer_audit(tp, rack, &so->so_snd); way_out = 2; } rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); if (did_out) rack->r_wanted_output = 0; #ifdef INVARIANTS if (tp->t_inpcb == NULL) { panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", did_out, retval, tp, prev_state); } #endif INP_WUNLOCK(tp->t_inpcb); } } void rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) { struct timeval tv; #ifdef RSS struct tcp_function_block *tfb; struct tcp_rack *rack; struct inpcb *inp; rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->r_state == 0) { /* * Initial input (ACK to SYN-ACK etc)lets go ahead and get * it processed */ tcp_get_usecs(&tv); rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, 0, &tv); return; } tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); INP_WUNLOCK(tp->t_inpcb); #else tcp_get_usecs(&tv); rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, 0, &tv); #endif } struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) { struct rack_sendmap *rsm = NULL; int32_t idx; uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; /* Return the next guy to be re-transmitted */ if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { return (NULL); } if (tp->t_flags & TF_SENTFIN) { /* retran the end FIN? 
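 * Once a FIN has been sent we do not pick a retransmission candidate
 * from here.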
*/ return (NULL); } /* ok lets look at this one */ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { goto check_it; } rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) { return (NULL); } check_it: srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; srtt = TICKS_2_MSEC(srtt_cur); if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) srtt = rack->rc_rack_rtt; if (rsm->r_flags & RACK_ACKED) { return (NULL); } if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { /* Its not yet ready */ return (NULL); } idx = rsm->r_rtr_cnt - 1; ts_low = rsm->r_tim_lastsent[idx]; thresh = rack_calc_thresh_rack(rack, srtt, tsused); if (tsused <= ts_low) { return (NULL); } if ((tsused - ts_low) >= thresh) { return (rsm); } return (NULL); } static int rack_output(struct tcpcb *tp) { struct socket *so; uint32_t recwin, sendwin; uint32_t sb_offset; int32_t len, flags, error = 0; struct mbuf *m; struct mbuf *mb; uint32_t if_hw_tsomaxsegcount = 0; uint32_t if_hw_tsomaxsegsize; long tot_len_this_send = 0; struct ip *ip = NULL; #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif #ifdef NETFLIX_TCP_O_UDP struct udphdr *udp = NULL; #endif struct tcp_rack *rack; struct tcphdr *th; uint8_t pass = 0; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef NETFLIX_TCP_O_UDP unsigned ulen; #endif uint32_t rack_seq; #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif int32_t idle, sendalot; int32_t sub_from_prr = 0; volatile int32_t sack_rxmit; struct rack_sendmap *rsm = NULL; int32_t tso, mtu, would_have_fin = 0; struct tcpopt to; int32_t slot = 0; uint32_t cts; uint8_t hpts_calling, doing_tlp = 0; int32_t do_a_prefetch; int32_t prefetch_rsm = 0; int32_t prefetch_so_done = 0; struct tcp_log_buffer *lgb = NULL; struct inpcb *inp; struct sockbuf *sb; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int32_t isipv6; #endif #ifdef KERN_TLS const bool hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; #else const bool hw_tls = false; #endif /* setup and take the cache hits here */ rack = (struct tcp_rack *)tp->t_fb_ptr; inp = rack->rc_inp; so = inp->inp_socket; sb = &so->so_snd; kern_prefetch(sb, &do_a_prefetch); do_a_prefetch = 1; INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif #ifdef TCP_RFC7413 /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED) && SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ return (0); #endif #ifdef INET6 if (rack->r_state) { /* Use the cache line loaded if possible */ isipv6 = rack->r_is_v6; } else { isipv6 = (inp->inp_vflag & INP_IPV6) != 0; } #endif cts = tcp_ts_getticks(); if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && inp->inp_in_hpts) { /* * We are on the hpts for some timer but not hptsi output. * Remove from the hpts unconditionally. */ rack_timer_cancel(tp, rack, cts, __LINE__); } /* Mark that we have called rack_output(). */ if ((rack->r_timer_override) || (tp->t_flags & TF_FORCEDATA) || (tp->t_state < TCPS_ESTABLISHED)) { if (tp->t_inpcb->inp_in_hpts) tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); } else if (tp->t_inpcb->inp_in_hpts) { /* * On the hpts you can't pass even if ACKNOW is on, we will * when the hpts fires. 
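 * Count this as an in-pace return and bail out; output will be retried
 * when the hpts fires.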
*/ counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); return (0); } hpts_calling = inp->inp_hpts_calls; inp->inp_hpts_calls = 0; if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { if (rack_process_timers(tp, rack, cts, hpts_calling)) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); return (0); } } rack->r_wanted_output = 0; rack->r_timer_override = 0; /* * Determine length of data that should be transmitted, and flags * that will be used. If there is some data or critical controls * (SYN, RST) to send, then transmit; otherwise, investigate * further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (tp->t_idle_reduce) { if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) rack_cc_after_idle(tp, (rack->r_idle_reduce_largest ? 1 :0)); } tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ sendalot = 0; cts = tcp_ts_getticks(); tso = 0; mtu = 0; sb_offset = tp->snd_max - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly * trying to send out new data (when sendalot is 1), bypass this * function. If we retransmit in fast recovery mode, decrement * snd_cwnd, since we're replacing a (future) new transmission with * a retransmission now, and we previously incremented snd_cwnd in * tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ while (rack->rc_free_cnt < rack_free_cache) { rsm = rack_alloc(rack); if (rsm == NULL) { if (inp->inp_hpts_calls) /* Retry in a ms */ slot = 1; goto just_return_nolock; } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt++; rsm = NULL; } if (inp->inp_hpts_calls) inp->inp_hpts_calls = 0; sack_rxmit = 0; len = 0; rsm = NULL; if (flags & TH_RST) { SOCKBUF_LOCK(sb); goto send; } if (rack->r_ctl.rc_tlpsend) { /* Tail loss probe */ long cwin; long tlen; doing_tlp = 1; rsm = rack->r_ctl.rc_tlpsend; rack->r_ctl.rc_tlpsend = NULL; sack_rxmit = 1; tlen = rsm->r_end - rsm->r_start; if (tlen > tp->t_maxseg) tlen = tp->t_maxseg; #ifdef INVARIANTS if (SEQ_GT(tp->snd_una, rsm->r_start)) { panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", tp, rack, tp->snd_una, rsm, rsm->r_start); } #endif sb_offset = rsm->r_start - tp->snd_una; cwin = min(tp->snd_wnd, tlen); len = cwin; } else if (rack->r_ctl.rc_resend) { /* Retransmit timer */ rsm = rack->r_ctl.rc_resend; rack->r_ctl.rc_resend = NULL; len = rsm->r_end - rsm->r_start; sack_rxmit = 1; sendalot = 0; sb_offset = rsm->r_start - tp->snd_una; if (len >= tp->t_maxseg) { len = tp->t_maxseg; } KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", __func__, sb_offset)); } else if ((rack->rc_in_persist == 0) && ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { long tlen; if ((!IN_RECOVERY(tp->t_flags)) && ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { /* Enter recovery if not induced by a time-out */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure we send * one packet. 
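 * Seeding rc_prr_sndcnt with one MSS guarantees PRR lets at least one
 * segment out as recovery begins.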
*/ rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; } #ifdef INVARIANTS if (SEQ_LT(rsm->r_start, tp->snd_una)) { panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", tp, rack, rsm, rsm->r_start, tp->snd_una); } #endif tlen = rsm->r_end - rsm->r_start; sb_offset = rsm->r_start - tp->snd_una; if (tlen > rack->r_ctl.rc_prr_sndcnt) { len = rack->r_ctl.rc_prr_sndcnt; } else { len = tlen; } if (len >= tp->t_maxseg) { sendalot = 1; len = tp->t_maxseg; } else { sendalot = 0; if ((rack->rc_timer_up == 0) && (len < tlen)) { /* * If its not a timer don't send a partial * segment. */ len = 0; goto just_return_nolock; } } KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", __func__, sb_offset)); if (len > 0) { sub_from_prr = 1; sack_rxmit = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, tp->t_maxseg)); counter_u64_add(rack_rtm_prr_retran, 1); } } if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { /* we are retransmitting the fin */ len--; if (len) { /* * When retransmitting data do *not* include the * FIN. This could happen from a TLP probe. */ flags &= ~TH_FIN; } } #ifdef INVARIANTS /* For debugging */ rack->r_ctl.rc_rsm_at_retran = rsm; #endif /* * Enforce a connection sendmap count limit if set * as long as we are not retransmiting. */ if ((rsm == NULL) && (rack_map_entries_limit > 0) && (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { counter_u64_add(rack_to_alloc_limited, 1); if (!rack->alloc_limit_reported) { rack->alloc_limit_reported = 1; counter_u64_add(rack_alloc_limited_conns, 1); } goto just_return_nolock; } /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { void *end_rsm; end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (end_rsm) kern_prefetch(end_rsm, &prefetch_rsm); prefetch_rsm = 1; } SOCKBUF_LOCK(sb); /* * If in persist timeout with window of 0, send 1 byte. Otherwise, * if window is small but nonzero and time TF_SENTFIN expired, we * will send what we can and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then clear * the FIN bit. Usually this would happen below * when it realizes that we aren't sending all the * data. However, if we have exactly 1 byte of * unsent data, then it won't clear the FIN bit * below, and if we are in persist state, we wind up * sending the packet without recording that we sent * the FIN bit. * * We can't just blindly clear the FIN bit, because * if we don't have any more data to send then the * probe will be the FIN itself. */ if (sb_offset < sbused(sb)) flags &= ~TH_FIN; sendwin = 1; } else { if (rack->rc_in_persist) rack_exit_persist(tp, rack); /* * If we are dropping persist mode then we need to * correct snd_nxt/snd_max and off. */ tp->snd_nxt = tp->snd_max; sb_offset = tp->snd_nxt - tp->snd_una; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a * negative length. This can also occur when TCP opens up its * congestion window while receiving additional duplicate acks after * fast-retransmit because TCP will reset snd_nxt to snd_max after * the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will be * set to snd_una, the sb_offset will be 0, and the length may wind * up 0. 
* * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { uint32_t avail; avail = sbavail(sb); if (SEQ_GT(tp->snd_nxt, tp->snd_una)) sb_offset = tp->snd_nxt - tp->snd_una; else sb_offset = 0; if (IN_RECOVERY(tp->t_flags) == 0) { if (rack->r_ctl.rc_tlp_new_data) { /* TLP is forcing out new data */ if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); } if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) len = tp->snd_wnd; else len = rack->r_ctl.rc_tlp_new_data; rack->r_ctl.rc_tlp_new_data = 0; doing_tlp = 1; } else { if (sendwin > avail) { /* use the available */ if (avail > sb_offset) { len = (int32_t)(avail - sb_offset); } else { len = 0; } } else { if (sendwin > sb_offset) { len = (int32_t)(sendwin - sb_offset); } else { len = 0; } } } } else { uint32_t outstanding; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible so far in the scoreboard. */ outstanding = tp->snd_max - tp->snd_una; if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { if (tp->snd_wnd > outstanding) { len = tp->snd_wnd - outstanding; /* Check to see if we have the data */ if (((sb_offset + len) > avail) && (avail > sb_offset)) len = avail - sb_offset; else len = 0; } else len = 0; } else if (avail > sb_offset) len = avail - sb_offset; else len = 0; if (len > 0) { if (len > rack->r_ctl.rc_prr_sndcnt) len = rack->r_ctl.rc_prr_sndcnt; if (len > 0) { sub_from_prr = 1; counter_u64_add(rack_rtm_prr_newdata, 1); } } if (len > tp->t_maxseg) { /* * We should never send more than a MSS when * retransmitting or sending new data in prr * mode unless the override flag is on. Most * likely the PRR algorithm is not going to * let us send a lot as well :-) */ if (rack->r_ctl.rc_prr_sendalot == 0) len = tp->t_maxseg; } else if (len < tp->t_maxseg) { /* * Do we send any? The idea here is if the * send empty's the socket buffer we want to * do it. However if not then lets just wait * for our prr_sndcnt to get bigger. */ long leftinsb; leftinsb = sbavail(sb) - sb_offset; if (leftinsb > len) { /* This send does not empty the sb */ len = 0; } } } } if (prefetch_so_done == 0) { kern_prefetch(so, &prefetch_so_done); prefetch_so_done = 1; } /* * Lop off SYN bit if it has already been sent. However, if this is * SYN-SENT state and if segment contains data and if we don't know * that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if ((tp->t_state != TCPS_SYN_RECEIVED) && (tp->t_state != TCPS_SYN_SENT)) flags &= ~TH_SYN; #ifdef TCP_RFC7413 /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; #endif } /* * Be careful not to send data and/or FIN on SYN segments. This * measure is needed to prevent interoperability problems with not * fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } #ifdef TCP_RFC7413 /* * When retransmitting SYN|ACK on a passively-created TFO socket, * don't include data, as the presence of data may have caused the * original SYN|ACK to have been dropped by a middlebox. 
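 * (The t_rxtshift > 0 test below is what identifies a retransmitted
 * SYN|ACK.)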
*/ if ((tp->t_flags & TF_FASTOPEN) && ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0))) len = 0; #endif if (len <= 0) { /* * If FIN has been sent but not acked, but we haven't been * called to retransmit, len will be < 0. Otherwise, window * shrank after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back to (closed) * window, and set the persist timer if it isn't already * going. If the window didn't close completely, just wait * for an ACK. * * We also do a general check here to ensure that we will * set the persist timer when we have data to send, but a * 0-byte window. This makes sure the persist timer is set * even if the packet hits one of the "goto send" lines * below. */ len = 0; if ((tp->snd_wnd == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (sb_offset < (int)sbavail(sb))) { tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, cts); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); tcp_sndbuf_autoscale(tp, so, sendwin); /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP * options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per * generated segment or packet. * * IPv4 handling has a clear separation of ip options and ip header * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does * the right thing below to provide length of just ip options and thus * checking for ipoptlen is enough to decide if ip options are present. */ #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6)) ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4)) ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); #endif /* INET */ #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && #ifdef NETFLIX_TCP_O_UDP (tp->t_port == 0) && #endif ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && ipoptlen == 0) tso = 1; { uint32_t outstanding; outstanding = tp->snd_max - tp->snd_una; if (tp->t_flags & TF_SENTFIN) { /* * If we sent a fin, snd_max is 1 higher than * snd_una */ outstanding--; } if (outstanding > 0) { /* * This is sub-optimal. We only send a stand alone * FIN on its own segment. */ if (flags & TH_FIN) { flags &= ~TH_FIN; would_have_fin = 1; } } else if (sack_rxmit) { if ((rsm->r_flags & RACK_HAS_FIN) == 0) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + sbused(sb))) flags &= ~TH_FIN; } } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) - This is the last * buffer in a write()/send() and we are either idle or running * NODELAY - we've timed out (e.g. 
persist timer) - we have more * then 1/2 the maximum send window's worth of data (receiver may be * limited the window size) - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) { pass = 1; goto send; } /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause us * to flush a buffer queued with moretocome. XXX * */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && (tp->t_flags & TF_NOPUSH) == 0) { pass = 2; goto send; } if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ pass = 3; goto send; } if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ goto send; } if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { pass = 4; goto send; } if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ pass = 5; goto send; } if (sack_rxmit) { pass = 6; goto send; } } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read from * the receive buffer, no matter how small, causes a window update * to be sent. We also should avoid sending a flurry of window * updates when the socket buffer had queued a lot of data and the * application is doing small reads. * * Prevent a flurry of pointless window updates by only sending an * update when we can increase the advertized window by more than * 1/4th of the socket buffer capacity. When the buffer is getting * full or is very small be more aggressive and send an update * whenever we can increase by two mss sized segments. In all other * situations the ACK's to new incoming data will carry further * window increases. * * Don't send an independent window update if a delayed ACK is * pending (it will get piggy-backed on it) or the remote side * already has done a half-close and won't send more data. Skip * this if the connection is in T/TCP half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, taking * into account that we are limited by TCP_MAXWIN << * tp->rcv_scale. */ int32_t adv; int oldwin; adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); adv -= oldwin; } else oldwin = 0; /* * If the new window size ends up being the same as the old * size when it is scaled, then don't force a window update. */ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (int32_t)(2 * tp->t_maxseg) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { pass = 7; goto send; } if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. 
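 * Each branch below records the reason for transmitting in 'pass'
 * before jumping to send.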
*/ if (tp->t_flags & TF_ACKNOW) { pass = 8; goto send; } if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { pass = 9; goto send; } if (SEQ_GT(tp->snd_up, tp->snd_una)) { pass = 10; goto send; } /* * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. */ if (flags & TH_FIN) { if ((tp->t_flags & TF_SENTFIN) || (((tp->t_flags & TF_SENTFIN) == 0) && (tp->snd_nxt == tp->snd_una))) { pass = 11; goto send; } } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(sb); just_return_nolock: if (tot_len_this_send == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); tp->t_flags &= ~TF_FORCEDATA; return (0); send: if (doing_tlp == 0) { /* * Data not a TLP, and its not the rxt firing. If it is the * rxt firing, we want to leave the tlp_in_progress flag on * so we don't send another TLP. It has to be a rack timer * or normal send (response to acked data) to clear the tlp * in progress flag. */ rack->rc_tlp_in_progress = 0; } SOCKBUF_LOCK_ASSERT(sb); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options unless TCP * set not to do any options. NOTE: we assume that the IP/TCP header * plus TCP options always fit in a single mbuf, leaving room for a * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) * + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else #endif hdrlen = sizeof(struct tcpiphdr); /* * Compute options for segment. We only have to care about SYN and * established connection segments. Options for SYN-ACK segments * are handled in TCP syncache. */ to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&inp->inp_inc); #ifdef NETFLIX_TCP_O_UDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; #endif to.to_flags |= TOF_MSS; #ifdef TCP_RFC7413 /* * Only include the TFO option on the first * transmission of the SYN|ACK on a * passively-created TFO socket, as the presence of * the TFO option may have caused the original * SYN|ACK to have been dropped by a middlebox. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift == 0)) { to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN; to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; to.to_flags |= TOF_FASTOPEN; } #endif } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = cts + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); /* Selective ACK's. */ if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). 
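 * We only request the signature option here; the digest itself is
 * computed later, once the headers have been laid out in the mbuf.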
*/ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); } #ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? */ SOCKBUF_UNLOCK(&so->so_snd); return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); } #endif - ipoptlen = 0; +#ifdef INET6 + if (isipv6) + ipoptlen = ip6_optlen(tp->t_inpcb); + else +#endif + if (tp->t_inpcb->inp_options) + ipoptlen = tp->t_inpcb->inp_options->m_len - + offsetof(struct ipoption, ipopt_list); + else + ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif /* * Adjust data length if insertion of options will bump the packet * length beyond the t_maxseg length. Clear the FIN bit because we * cut off the tail of the segment. */ if (len + optlen + ipoptlen > tp->t_maxseg) { if (flags & TH_FIN) { would_have_fin = 1; flags &= ~TH_FIN; } if (tso) { uint32_t if_hw_tsomax; uint32_t moff; int32_t max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Prevent the last segment from being fractional * unless the send sockbuf can be emptied: */ max_len = (tp->t_maxseg - optlen); if ((sb_offset + len) < sbavail(sb)) { moff = len % (u_int)max_len; if (moff != 0) { len -= moff; sendalot = 1; } } /* * In case there are too many small fragments don't * use TSO: */ if (len <= max_len) { len = max_len; sendalot = 1; tso = 0; } /* * Send the FIN in a separate segment after the bulk * sending is done. We don't trust the TSO * implementations to clear the FIN flag on all but * the last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { + if (optlen + ipoptlen > tp->t_maxseg) { + /* + * Since we don't have enough space to put + * the IP header chain and the TCP header in + * one packet as required by RFC 7112, don't + * send it. + */ + SOCKBUF_UNLOCK(&so->so_snd); + error = EMSGSIZE; + sack_rxmit = 0; + goto out; + } len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); #ifdef DIAGNOSTIC #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); #endif /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further * down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); if ((len == 0) && (flags & TH_FIN) && (sbused(sb))) { /* * We have outstanding data, don't send a fin by itself!. */ goto just_return; } /* * Grab a header mbuf, attaching a copy of data to be transmitted, * and initialize the header from the template for sends on this * connection. */ if (len) { uint32_t max_val; uint32_t moff; if (rack->rc_pace_max_segs) max_val = rack->rc_pace_max_segs * tp->t_maxseg; else max_val = len; /* * We allow a limit on sending with hptsi. 
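 * That is, clamp a single pass to rc_pace_max_segs * t_maxseg bytes
 * when a pacing segment limit is configured.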
*/ if (len > max_val) { len = max_val; } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(sb); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf to the * sb_offset in the socket buffer chain. */ mb = sbsndptr_noadv(sb, sb_offset, &moff); if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t)+hdrlen); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) sbsndptr_adv(sb, mb, len); m->m_len += len; } else { struct sockbuf *msb; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) msb = NULL; else msb = sb; m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, hw_tls /*, NULL */); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy * shorten it to no longer need tso. Lets * not put on sendalot since we are low on * mbufs. */ tso = 0; } if (m->m_next == NULL) { SOCKBUF_UNLOCK(sb); (void)m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } if ((tp->t_flags & TF_FORCEDATA) && len == 1) { TCPSTAT_INC(tcps_sndprobe); #ifdef NETFLIX_STATS if (SEQ_LT(tp->snd_nxt, tp->snd_max)) stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); else stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { if (rsm && (rsm->r_flags & RACK_TLP)) { /* * TLP should not count in retran count, but * in its own bin */ /* tp->t_sndtlppack++;*/ /* tp->t_sndtlpbyte += len;*/ counter_u64_add(rack_tlp_retran, 1); counter_u64_add(rack_tlp_retran_bytes, len); } else { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); } #ifdef NETFLIX_STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); #endif } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); #ifdef NETFLIX_STATS stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif } /* * If we're sending everything we've got, set PUSH. (This * will keep happy those implementations which only give * data to the user when a buffer fills or a PUSH comes in.) */ if (sb_offset + len == sbused(sb) && sbused(sb) && !(flags & TH_SYN)) flags |= TH_PUSH; /* * Are we doing hptsi, if so we must calculate the slot. We * only do hptsi in ESTABLISHED and with no RESET being * sent where we have data to send. */ if (((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_CLOSE_WAIT) || ((tp->t_state == TCPS_FIN_WAIT_1) && ((tp->t_flags & TF_SENTFIN) == 0) && ((flags & TH_FIN) == 0))) && ((flags & TH_RST) == 0) && (rack->rc_always_pace)) { /* * We use the most optimistic possible cwnd/srtt for * sending calculations. This will make our * calculation anticipate getting more through * quicker then possible. But thats ok we don't want * the peer to have a gap in data sending. 
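 * tr_perms below is roughly the transmit budget per millisecond
 * (cwnd / srtt, with srtt in ms) and slot is how many milliseconds this
 * send should occupy at that rate. For example, cwnd = 100000 bytes and
 * srtt = 10 ms give tr_perms = 10000, so a 30000 byte send yields a
 * 3 ms slot (before the rc_pace_reduce adjustment).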
*/ uint32_t srtt, cwnd, tr_perms = 0; if (rack->r_ctl.rc_rack_min_rtt) srtt = rack->r_ctl.rc_rack_min_rtt; else srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); if (rack->r_ctl.rc_rack_largest_cwnd) cwnd = rack->r_ctl.rc_rack_largest_cwnd; else cwnd = tp->snd_cwnd; tr_perms = cwnd / srtt; if (tr_perms == 0) { tr_perms = tp->t_maxseg; } tot_len_this_send += len; /* * Calculate how long this will take to drain, if * the calculation comes out to zero, thats ok we * will use send_a_lot to possibly spin around for * more increasing tot_len_this_send to the point * that its going to require a pace, or we hit the * cwnd. Which in that case we are just waiting for * a ACK. */ slot = tot_len_this_send / tr_perms; /* Now do we reduce the time so we don't run dry? */ if (slot && rack->rc_pace_reduce) { int32_t reduce; reduce = (slot / rack->rc_pace_reduce); if (reduce < slot) { slot -= reduce; } else slot = 0; } if (rack->r_enforce_min_pace && (slot == 0) && (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { /* We are enforcing a minimum pace time of 1ms */ slot = rack->r_enforce_min_pace; } } SOCKBUF_UNLOCK(sb); } else { SOCKBUF_UNLOCK(sb); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN | TH_FIN | TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(sb); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); #ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else #endif th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif #ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else #endif th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp,/*tp->t_port, */ ip, th); } /* * Fill in fields, remembering maximum advertised window for use in * delaying messages about window sizes. If resending a FIN, be sure * not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup SYN packet. If we * are on a retransmit, we may resend those bits a number of times * as per RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE | TH_CWR; } else flags |= TH_ECE | TH_CWR; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with ECN capable * transmission (ECT). 
Ignore pure ack packets, * retransmissions and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !((tp->t_flags & TF_FORCEDATA) && len == 1)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_ect0); } /* * Reply with proper ECN notifications. */ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will not reflect * the first unsent octet. For ACK only packets, we do not want the * sequence number of the retransmitted packet, we want the sequence * number of the next unsent octet. So, if there is no data (and no * SYN or FIN), use snd_max instead of snd_nxt when filling in * ti_seq. But if we are in persist state, snd_max might reflect * one byte beyond the right edge of the window, so use snd_nxt in * that case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN | TH_FIN)) || rack->rc_in_persist) { th->th_seq = htonl(tp->snd_nxt); rack_seq = tp->snd_nxt; } else if (flags & TH_RST) { /* * For a Reset send the last cum ack in sequence * (this like any other choice may still generate a * challenge ack, if a ack-update packet is in * flight). */ th->th_seq = htonl(tp->snd_una); rack_seq = tp->snd_una; } else { th->th_seq = htonl(tp->snd_max); rack_seq = tp->snd_max; } } else { th->th_seq = htonl(rsm->r_start); rack_seq = rsm->r_start; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, but avoid silly * window syndrome. */ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a or * ) segment itself is never scaled. The case is * handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 * window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is * attempting to read more data than can be buffered prior to * transmitting on the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull the urgent * pointer to the left edge of the send window so that it * doesn't drift into the send window on sequence number * wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* * Calculate MD5 signature and put it into the place * determined before. * NOTE: since TCP options buffer doesn't point into * mbuf's data, calculate offset and use it. 
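The window advertisement logic is compact but easy to misread: the silly-window check zeroes small offers, the window is never shrunk below what was already advertised, it is capped at the largest value the scale factor can express, and only non-SYN segments apply the scale shift. A standalone sketch of that arithmetic, with the scale factor and buffer figures passed in explicitly (hypothetical helper, not the kernel code; the SYN branch here clamps the offered window rather than recomputing it from socket-buffer space as the real code does):

#include <stdint.h>

#define TCP_MAXWIN_SK	65535U		/* assumed: mirrors TCP_MAXWIN */

/*
 * Sketch of the advertised-window computation: recwin is the space we
 * could offer, already_adv is (rcv_adv - rcv_nxt), i.e. what the peer
 * has already been promised.  Returns the 16-bit value for th_win.
 */
static uint16_t
advertised_window_sketch(int64_t recwin, uint32_t so_rcv_hiwat,
    uint32_t maxseg, int64_t already_adv, uint8_t rcv_scale, int is_syn)
{
	/* Silly window avoidance: don't advertise tiny windows. */
	if (recwin < (int64_t)(so_rcv_hiwat / 4) && recwin < (int64_t)maxseg)
		recwin = 0;
	/* Never shrink the window below what was already advertised. */
	if (already_adv > 0 && recwin < already_adv)
		recwin = already_adv;
	/* Cap at the largest value the scale factor can express. */
	if (recwin > (int64_t)TCP_MAXWIN_SK << rcv_scale)
		recwin = (int64_t)TCP_MAXWIN_SK << rcv_scale;
	/* The window field in a SYN is never scaled (RFC 1323 / RFC 7323). */
	if (is_syn)
		return ((uint16_t)(recwin > TCP_MAXWIN_SK ?
		    TCP_MAXWIN_SK : recwin));
	return ((uint16_t)(recwin >> rcv_scale));
}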
*/ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. */ goto out; } } #endif /* * Put TCP length in extended header, and then checksum extended * header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #ifdef INET6 if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ #ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { #endif m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); #ifdef NETFLIX_TCP_O_UDP } #endif } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { #ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { #endif m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); #ifdef NETFLIX_TCP_O_UDP } #endif /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. The TCP pseudo * header checksum is always provided. XXX: Fixme: This is currently * not the case for IPv6. */ if (tso) { KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } -#if defined(IPSEC) || defined(IPSEC_SUPPORT) - KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), - ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", - __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); -#else - KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), - ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", - __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); -#endif + KASSERT(len + hdrlen == m_length(m, NULL), + ("%s: mbuf chain different than expected: %d + %u != %u", + __func__, len, hdrlen, m_length(m, NULL))); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + * (th->th_off << 2) */ ); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ /* We're getting ready to send; log now. 
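For the checksum-offload setup above, csum_data records the offset of th_sum within the TCP header and th_sum itself is seeded with the pseudo-header sum, so the NIC (or the software fallback) only has to add the TCP header and payload bytes. The IPv4 pseudo-header seed is a plain ones-complement addition; the following is a minimal sketch of that sum, using host-order inputs for clarity rather than the kernel's in_pseudo() helper, and is an illustration rather than a drop-in replacement.

#include <stdint.h>

/*
 * Sketch of the IPv4 pseudo-header checksum seed: a ones-complement
 * sum over source address, destination address, protocol and TCP
 * length (header + options + payload).  The result is not inverted;
 * the final inversion happens once the header and payload words have
 * been added by hardware or by the software checksum path.
 */
static uint16_t
pseudo_hdr_sum_sketch(uint32_t src, uint32_t dst, uint8_t proto,
    uint16_t tcp_len)
{
	uint64_t sum = 0;

	sum += (src >> 16) + (src & 0xffff);	/* source address words */
	sum += (dst >> 16) + (dst & 0xffff);	/* destination address words */
	sum += proto;				/* zero byte + protocol byte */
	sum += tcp_len;				/* TCP length word */

	/* Fold carries back into 16 bits (ones-complement addition). */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)sum);
}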
*/ if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; if (rsm || sack_rxmit) { log.u_bbr.flex8 = 1; } else { log.u_bbr.flex8 = 0; } lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, len, &log, false, NULL, NULL, 0, NULL); } else lgb = NULL; /* * Fill in IP length and desired time to live and send to IP level. * There should be a better way to handle ttl and tos; we could keep * them in the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before cksum calcuration, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. Also, * desired default hop limit might be changed via Neighbor * Discovery. */ ip6->ip6_hlim = in6_selecthlim(inp, NULL); /* * Set the packet size here for the benefit of DTrace * probes. ip6_output() will set it properly; it's supposed * to include the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &inp->inp_route6, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, inp); if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) mtu = inp->inp_route6.ro_rt->rt_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(inp, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every * packet. This might not be the best thing to do according * to RFC3390 Section 2. However the tcp hostcache migitates * the problem so it affects only the first tcp connection * with a host. * * NB: Don't set DF on small MTU/MSS to have a safe * fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; if (tp->t_port == 0 || len < V_tcp_minmss) { ip->ip_off |= htons(IP_DF); } } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, inp); if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) mtu = inp->inp_route.ro_rt->rt_mtu; } #endif /* INET */ out: if (lgb) { lgb->tlb_errno = error; lgb = NULL; } /* * In transmit state, time the transmission and arrange for the * retransmit. In persist state, just set snd_max. 
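The IPv4 branch above only sets DF (don't fragment) when path MTU discovery is enabled and the MSS is still comfortably above the configured minimum, so a badly behaved path always has a fragmentable fallback. A small sketch of that decision with the tunables passed in; the helper is hypothetical, and the handling of the t_port (UDP tunneling) case is my reading of the NETFLIX_TCP_O_UDP code, folded into a single tunneled flag:

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the DF decision for an outgoing IPv4 TCP segment: returns
 * true when the segment should carry IP_DF.  PLPMTUD is only armed
 * while the MSS is above the configured minimum, and (on this reading)
 * tunneled segments only set DF when they are small enough to
 * retransmit cheaply.
 */
static bool
set_df_sketch(bool path_mtu_discovery, uint32_t maxseg, uint32_t minmss,
    bool tunneled, uint32_t payload_len)
{
	if (!path_mtu_discovery || maxseg <= minmss)
		return (false);		/* keep a safe, fragmentable fallback */
	if (!tunneled)
		return (true);
	return (payload_len < minmss);	/* assumed reading of the t_port check */
}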
*/ if (error == 0) { if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) tcp_clean_dsack_blocks(tp); if (len == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); else if (len == 1) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); } else if (len > 1) { int idx; idx = (len / tp->t_maxseg) + 3; if (idx >= TCP_MSS_ACCT_ATIMER) counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); else counter_u64_add(rack_out_size[idx], 1); } } if (sub_from_prr && (error == 0)) { if (rack->r_ctl.rc_prr_sndcnt >= len) rack->r_ctl.rc_prr_sndcnt -= len; else rack->r_ctl.rc_prr_sndcnt = 0; } sub_from_prr = 0; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, pass, rsm); if ((tp->t_flags & TF_FORCEDATA) == 0 || (rack->rc_in_persist == 0)) { #ifdef NETFLIX_STATS tcp_seq startseq = tp->snd_nxt; #endif /* * Advance snd_nxt over sequence space of this segment. */ if (error) /* We don't log or do anything with errors */ goto timer; if (flags & (TH_SYN | TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } /* In the ENOBUFS case we do *not* update snd_max */ if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { if (tp->snd_una == tp->snd_max) { /* * Update the time we just added data since * none was outstanding. */ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt; #ifdef NETFLIX_STATS if (!(tp->t_flags & TF_GPUTINPROG) && len) { tp->t_flags |= TF_GPUTINPROG; tp->gput_seq = startseq; tp->gput_ack = startseq + ulmin(sbavail(sb) - sb_offset, sendwin); tp->gput_ts = tcp_ts_getticks(); } #endif } /* * Set retransmit timer if not currently set, and not doing * a pure ack or a keep-alive probe. Initial value for * retransmit timer is smoothed round-trip time + 2 * * round-trip time variance. Initialize shift counter which * is used for backoff of retransmit time. */ timer: if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { /* * If the persists timer was set above (right before * the goto send), and still needs to be on. Lets * make sure all is canceled. If the persist timer * is not running, we want to get it up. */ if (rack->rc_in_persist == 0) { rack_enter_persist(tp, rack, cts); } } } else { /* * Persist case, update snd_max but since we are in persist * mode (no window) we do not update snd_nxt. */ int32_t xlen = len; if (error) goto nomore; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } /* In the ENOBUFS case we do *not* update snd_max */ if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { if (tp->snd_una == tp->snd_max) { /* * Update the time we just added data since * none was outstanding. */ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt + len; } } nomore: if (error) { SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ /* * Failures do not advance the seq counter above. For the * case of ENOBUFS we will fall out and retry in 1ms with * the hpts. Everything else will just have to retransmit * with the timer. * * In any case, we do not want to loop around for another * send without a good reason. 
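The sequence-number bookkeeping after a successful send is worth spelling out: SYN and FIN each occupy one unit of sequence space in addition to the payload bytes, and snd_max only moves forward when the send actually extended beyond it. A compact, standalone sketch of that advance follows; the struct and macro are hypothetical stand-ins for the tcpcb fields and the SEQ_GT macro.

#include <stdbool.h>
#include <stdint.h>

/* Minimal stand-ins for the connection state touched here. */
struct seq_state {
	uint32_t snd_nxt;	/* next sequence number to send */
	uint32_t snd_max;	/* highest sequence number sent so far */
};

#define SEQ_GT_SK(a, b)	((int32_t)((a) - (b)) > 0)	/* wrap-safe compare */

/*
 * Sketch: advance snd_nxt over the sequence space consumed by this
 * segment and pull snd_max forward if new data was sent.  Retransmits,
 * which resend old sequence space, skip this entirely.
 */
static void
advance_seq_sketch(struct seq_state *s, uint32_t len, bool syn, bool fin)
{
	uint32_t consumed = len + (syn ? 1 : 0) + (fin ? 1 : 0);

	s->snd_nxt += consumed;
	if (SEQ_GT_SK(s->snd_nxt, s->snd_max))
		s->snd_max = s->snd_nxt;
}

A pure ACK (len 0, no SYN or FIN) advances nothing, which is why the header-building code earlier fell back to snd_max for its sequence number.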
*/ sendalot = 0; switch (error) { case EPERM: tp->t_flags &= ~TF_FORCEDATA; tp->t_softerror = error; return (error); case ENOBUFS: if (slot == 0) { /* * Pace us right away to retry in a some * time */ slot = 1 + rack->rc_enobuf; if (rack->rc_enobuf < 255) rack->rc_enobuf++; if (slot > (rack->rc_rack_rtt / 2)) { slot = rack->rc_rack_rtt / 2; } if (slot < 10) slot = 10; } counter_u64_add(rack_saw_enobuf, 1); error = 0; goto enobufs; case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. If TSO was active we either got an * interface without TSO capabilits or TSO was * turned off. If we obtained mtu from ip_output() * then update it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } slot = 10; rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); tp->t_flags &= ~TF_FORCEDATA; return (error); case ENETUNREACH: counter_u64_add(rack_saw_enetunreach, 1); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; } /* FALLTHROUGH */ default: slot = 10; rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); tp->t_flags &= ~TF_FORCEDATA; return (error); } } else { rack->rc_enobuf = 0; } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). If this advertises a larger * window than any other segment, then remember the size of the * advertised window. Any pending ACK has now been sent. */ if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: rack->r_tlp_running = 0; if ((flags & TH_RST) || (would_have_fin == 1)) { /* * We don't send again after a RST. We also do *not* send * again if we would have had a find, but now have * outstanding data. */ slot = 0; sendalot = 0; } if (slot) { /* set the rack tcb into the slot N */ counter_u64_add(rack_paced_segments, 1); } else if (sendalot) { if (len) counter_u64_add(rack_unpaced_segments, 1); sack_rxmit = 0; tp->t_flags &= ~TF_FORCEDATA; goto again; } else if (len) { counter_u64_add(rack_unpaced_segments, 1); } tp->t_flags &= ~TF_FORCEDATA; rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); return (error); } /* * rack_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. 
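The ENOBUFS handling above is effectively a bounded backoff: each consecutive allocation failure pushes the retry slot out a little further, capped at half the measured RTT and floored at 10 in the pacer's time units. A standalone sketch of that backoff, assuming the same units (hypothetical helper):

#include <stdint.h>

/*
 * Sketch of the ENOBUFS retry backoff: enobuf_count is how many
 * consecutive ENOBUFS failures have been seen (saturating at 255),
 * rack_rtt is the measured RTT in pacer units.  Returns the slot to
 * wait before retrying and bumps the failure counter.
 */
static uint32_t
enobufs_backoff_sketch(uint8_t *enobuf_count, uint32_t rack_rtt)
{
	uint32_t slot;

	slot = 1 + *enobuf_count;
	if (*enobuf_count < 255)
		(*enobuf_count)++;
	if (slot > rack_rtt / 2)	/* never wait longer than half an RTT */
		slot = rack_rtt / 2;
	if (slot < 10)			/* ...but always wait a little while */
		slot = 10;
	return (slot);
}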
*/ static int rack_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { int32_t error = 0, optval; switch (sopt->sopt_name) { case TCP_RACK_PROP_RATE: case TCP_RACK_PROP: case TCP_RACK_TLP_REDUCE: case TCP_RACK_EARLY_RECOV: case TCP_RACK_PACE_ALWAYS: case TCP_DELACK: case TCP_RACK_PACE_REDUCE: case TCP_RACK_PACE_MAX_SEG: case TCP_RACK_PRR_SENDALOT: case TCP_RACK_MIN_TO: case TCP_RACK_EARLY_SEG: case TCP_RACK_REORD_THRESH: case TCP_RACK_REORD_FADE: case TCP_RACK_TLP_THRESH: case TCP_RACK_PKT_DELAY: case TCP_RACK_TLP_USE: case TCP_RACK_TLP_INC_VAR: case TCP_RACK_IDLE_REDUCE_HIGH: case TCP_RACK_MIN_PACE: case TCP_RACK_MIN_PACE_SEG: case TCP_BBR_RACK_RTT_USE: case TCP_DATA_AFTER_CLOSE: break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) return (error); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); rack = (struct tcp_rack *)tp->t_fb_ptr; switch (sopt->sopt_name) { case TCP_RACK_PROP_RATE: if ((optval <= 0) || (optval >= 100)) { error = EINVAL; break; } RACK_OPTS_INC(tcp_rack_prop_rate); rack->r_ctl.rc_prop_rate = optval; break; case TCP_RACK_TLP_USE: if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { error = EINVAL; break; } RACK_OPTS_INC(tcp_tlp_use); rack->rack_tlp_threshold_use = optval; break; case TCP_RACK_PROP: /* RACK proportional rate reduction (bool) */ RACK_OPTS_INC(tcp_rack_prop); rack->r_ctl.rc_prop_reduce = optval; break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ RACK_OPTS_INC(tcp_rack_tlp_reduce); rack->r_ctl.rc_tlp_cwnd_reduce = optval; break; case TCP_RACK_EARLY_RECOV: /* Should recovery happen early (bool) */ RACK_OPTS_INC(tcp_rack_early_recov); rack->r_ctl.rc_early_recovery = optval; break; case TCP_RACK_PACE_ALWAYS: /* Use the always pace method (bool) */ RACK_OPTS_INC(tcp_rack_pace_always); if (optval > 0) rack->rc_always_pace = 1; else rack->rc_always_pace = 0; break; case TCP_RACK_PACE_REDUCE: /* RACK Hptsi reduction factor (divisor) */ RACK_OPTS_INC(tcp_rack_pace_reduce); if (optval) /* Must be non-zero */ rack->rc_pace_reduce = optval; else error = EINVAL; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ RACK_OPTS_INC(tcp_rack_max_seg); rack->rc_pace_max_segs = optval; break; case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ RACK_OPTS_INC(tcp_rack_prr_sendalot); rack->r_ctl.rc_prr_sendalot = optval; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ RACK_OPTS_INC(tcp_rack_min_to); rack->r_ctl.rc_min_to = optval; break; case TCP_RACK_EARLY_SEG: /* If early recovery max segments */ RACK_OPTS_INC(tcp_rack_early_seg); rack->r_ctl.rc_early_recovery_segs = optval; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ RACK_OPTS_INC(tcp_rack_reord_thresh); if ((optval > 0) && (optval < 31)) rack->r_ctl.rc_reorder_shift = optval; else error = EINVAL; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ RACK_OPTS_INC(tcp_rack_reord_fade); rack->r_ctl.rc_reorder_fade = optval; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ RACK_OPTS_INC(tcp_rack_tlp_thresh); if (optval) rack->r_ctl.rc_tlp_threshold = optval; else error = EINVAL; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. 
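The pattern in rack_set_sockopt() is the important part of this routine: the inpcb lock must be dropped before sooptcopyin(), which may fault or sleep, and once the lock is re-taken nothing learned before the drop can be trusted, so the connection is re-validated (INP_TIMEWAIT/INP_DROPPED) and tp and rack are re-derived before the option is applied. The following is a schematic sketch of that shape; every type and function in it is a hypothetical stand-in defined as a stub purely to make the control flow visible.

#include <stdbool.h>
#include <stdint.h>

struct conn {
	bool	locked;
	bool	dead;
	int32_t	value;
};

static void conn_lock(struct conn *c)   { c->locked = true;  }
static void conn_unlock(struct conn *c) { c->locked = false; }

/* Stands in for sooptcopyin(): may sleep, so no connection lock held. */
static bool
copy_in_option(int32_t *dst, int32_t user_value)
{
	*dst = user_value;
	return (true);
}

/* Sketch of drop-lock / copyin / revalidate / apply. */
static int
set_option_sketch(struct conn *c, int32_t user_value)
{
	int32_t optval;

	conn_unlock(c);				/* copyin may sleep or fault */
	if (!copy_in_option(&optval, user_value))
		return (-1);
	conn_lock(c);
	if (c->dead) {				/* revalidate after relocking */
		conn_unlock(c);
		return (-1);
	}
	c->value = optval;			/* only now apply the option */
	return (0);
}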
rack-rtt + reord + N */ RACK_OPTS_INC(tcp_rack_pkt_delay); rack->r_ctl.rc_pkt_delay = optval; break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ RACK_OPTS_INC(tcp_rack_tlp_inc_var); rack->r_ctl.rc_prr_inc_var = optval; break; case TCP_RACK_IDLE_REDUCE_HIGH: RACK_OPTS_INC(tcp_rack_idle_reduce_high); if (optval) rack->r_idle_reduce_largest = 1; else rack->r_idle_reduce_largest = 0; break; case TCP_DELACK: if (optval == 0) tp->t_delayed_ack = 0; else tp->t_delayed_ack = 1; if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; rack_output(tp); } break; case TCP_RACK_MIN_PACE: RACK_OPTS_INC(tcp_rack_min_pace); if (optval > 3) rack->r_enforce_min_pace = 3; else rack->r_enforce_min_pace = optval; break; case TCP_RACK_MIN_PACE_SEG: RACK_OPTS_INC(tcp_rack_min_pace_seg); if (optval >= 16) rack->r_min_pace_seg_thresh = 15; else rack->r_min_pace_seg_thresh = optval; break; case TCP_BBR_RACK_RTT_USE: if ((optval != USE_RTT_HIGH) && (optval != USE_RTT_LOW) && (optval != USE_RTT_AVG)) error = EINVAL; else rack->r_ctl.rc_rate_sample_method = optval; break; case TCP_DATA_AFTER_CLOSE: if (optval) rack->rc_allow_data_af_clo = 1; else rack->rc_allow_data_af_clo = 0; break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } /* tcp_log_socket_option(tp, sopt->sopt_name, optval, error);*/ INP_WUNLOCK(inp); return (error); } static int rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { int32_t error, optval; /* * Because all our options are either boolean or an int, we can just * pull everything into optval and then unlock and copy. If we ever * add a option that is not a int, then this will have quite an * impact to this routine. */ switch (sopt->sopt_name) { case TCP_RACK_PROP_RATE: optval = rack->r_ctl.rc_prop_rate; break; case TCP_RACK_PROP: /* RACK proportional rate reduction (bool) */ optval = rack->r_ctl.rc_prop_reduce; break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ optval = rack->r_ctl.rc_tlp_cwnd_reduce; break; case TCP_RACK_EARLY_RECOV: /* Should recovery happen early (bool) */ optval = rack->r_ctl.rc_early_recovery; break; case TCP_RACK_PACE_REDUCE: /* RACK Hptsi reduction factor (divisor) */ optval = rack->rc_pace_reduce; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ optval = rack->rc_pace_max_segs; break; case TCP_RACK_PACE_ALWAYS: /* Use the always pace method */ optval = rack->rc_always_pace; break; case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ optval = rack->r_ctl.rc_prr_sendalot; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ optval = rack->r_ctl.rc_min_to; break; case TCP_RACK_EARLY_SEG: /* If early recovery max segments */ optval = rack->r_ctl.rc_early_recovery_segs; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ optval = rack->r_ctl.rc_reorder_shift; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ optval = rack->r_ctl.rc_reorder_fade; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ optval = rack->r_ctl.rc_tlp_threshold; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. 
rack-rtt + reord + N */ optval = rack->r_ctl.rc_pkt_delay; break; case TCP_RACK_TLP_USE: optval = rack->rack_tlp_threshold_use; break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ optval = rack->r_ctl.rc_prr_inc_var; break; case TCP_RACK_IDLE_REDUCE_HIGH: optval = rack->r_idle_reduce_largest; break; case TCP_RACK_MIN_PACE: optval = rack->r_enforce_min_pace; break; case TCP_RACK_MIN_PACE_SEG: optval = rack->r_min_pace_seg_thresh; break; case TCP_BBR_RACK_RTT_USE: optval = rack->r_ctl.rc_rate_sample_method; break; case TCP_DELACK: optval = tp->t_delayed_ack; break; case TCP_DATA_AFTER_CLOSE: optval = rack->rc_allow_data_af_clo; break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); return (error); } static int rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) { int32_t error = EINVAL; struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack == NULL) { /* Huh? */ goto out; } if (sopt->sopt_dir == SOPT_SET) { return (rack_set_sockopt(so, sopt, inp, tp, rack)); } else if (sopt->sopt_dir == SOPT_GET) { return (rack_get_sockopt(so, sopt, inp, tp, rack)); } out: INP_WUNLOCK(inp); return (error); } struct tcp_function_block __tcp_rack = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = rack_output, .tfb_tcp_do_segment = rack_do_segment, .tfb_tcp_ctloutput = rack_ctloutput, .tfb_tcp_fb_init = rack_init, .tfb_tcp_fb_fini = rack_fini, .tfb_tcp_timer_stop_all = rack_stopall, .tfb_tcp_timer_activate = rack_timer_activate, .tfb_tcp_timer_active = rack_timer_active, .tfb_tcp_timer_stop = rack_timer_stop, .tfb_tcp_rexmit_tmr = rack_remxt_tmr, .tfb_tcp_handoff_ok = rack_handoff_ok }; static const char *rack_stack_names[] = { __XSTRING(STACKNAME), #ifdef STACKALIAS __XSTRING(STACKALIAS), #endif }; static int rack_ctor(void *mem, int32_t size, void *arg, int32_t how) { memset(mem, 0, size); return (0); } static void rack_dtor(void *mem, int32_t size, void *arg) { } static bool rack_mod_inited = false; static int tcp_addrack(module_t mod, int32_t type, void *data) { int32_t err = 0; int num_stacks; switch (type) { case MOD_LOAD: rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", sizeof(struct rack_sendmap), rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", sizeof(struct tcp_rack), rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); sysctl_ctx_init(&rack_sysctl_ctx); rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp), OID_AUTO, __XSTRING(STACKNAME), CTLFLAG_RW, 0, ""); if (rack_sysctl_root == NULL) { printf("Failed to add sysctl node\n"); err = EFAULT; goto free_uma; } rack_init_sysctls(); num_stacks = nitems(rack_stack_names); err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, rack_stack_names, &num_stacks); if (err) { printf("Failed to register %s stack name for " "%s module\n", rack_stack_names[num_stacks], __XSTRING(MODNAME)); sysctl_ctx_free(&rack_sysctl_ctx); free_uma: uma_zdestroy(rack_zone); uma_zdestroy(rack_pcb_zone); rack_counter_destroy(); printf("Failed to register rack module -- err:%d\n", err); return (err); } rack_mod_inited = true; break; case MOD_QUIESCE: err = deregister_tcp_functions(&__tcp_rack, true, false); break; case MOD_UNLOAD: err = deregister_tcp_functions(&__tcp_rack, false, true); if (err == EBUSY) break; if (rack_mod_inited) { uma_zdestroy(rack_zone); uma_zdestroy(rack_pcb_zone); 
sysctl_ctx_free(&rack_sysctl_ctx); rack_counter_destroy(); rack_mod_inited = false; } err = 0; break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t tcp_rack = { .name = __XSTRING(MODNAME), .evhand = tcp_addrack, .priv = 0 }; MODULE_VERSION(MODNAME, 1); DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); Index: projects/clang900-import/sys/netinet6/ip6_input.c =================================================================== --- projects/clang900-import/sys/netinet6/ip6_input.c (revision 352536) +++ projects/clang900-import/sys/netinet6/ip6_input.c (revision 352537) @@ -1,1861 +1,1863 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #include #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DEFINE(u_long, in6_ifaddrhmask); static struct netisr_handler ip6_nh = { .nh_name = "ip6", .nh_handler = ip6_input, .nh_proto = NETISR_IPV6, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; static int sysctl_netinet6_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip6_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip6_nh, qlimit)); } SYSCTL_DECL(_net_inet6_ip6); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet6_intr_queue_maxlen, "I", "Maximum size of the IPv6 input queue"); #ifdef RSS static struct netisr_handler ip6_direct_nh = { .nh_name = "ip6_direct", .nh_handler = ip6_direct_input, .nh_proto = NETISR_IPV6_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; static int sysctl_netinet6_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip6_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip6_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRDQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet6_intr_direct_queue_maxlen, "I", "Maximum size of the IPv6 direct input queue"); #endif VNET_DEFINE(pfil_head_t, inet6_pfil_head); VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat); VNET_PCPUSTAT_SYSINIT(ip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ip6stat); #endif /* VIMAGE */ struct rmlock in6_ifaddr_lock; RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock"); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. 
*/ void ip6_init(void) { struct pfil_head_args args; struct protosw *pr; int i; TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal", &V_ip6_auto_linklocal); TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv); TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr); CK_STAILQ_INIT(&V_in6_ifaddrhead); V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR, &V_in6_ifaddrhmask); /* Initialize packet filter hooks. */ args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_IP6; args.pa_headname = PFIL_INET6_NAME; V_inet6_pfil_head = pfil_head_register(&args); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6, &V_ipsec_hhh_in[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6, &V_ipsec_hhh_out[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; /* Skip global initialization stuff for non-default instances. */ #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) { netisr_register_vnet(&ip6_nh); #ifdef RSS netisr_register_vnet(&ip6_direct_nh); #endif return; } #endif pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip6_init"); /* Initialize the entire ip6_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; /* * Cycle through IP protocols and put them into the appropriate place * in ip6_protox[]. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip6_protox[pr->pr_protocol] = pr - inet6sw; } netisr_register(&ip6_nh); #ifdef RSS netisr_register(&ip6_direct_nh); #endif } /* * The protocol to be inserted into ip6_protox[] must be already registered * in inet6sw[], either statically or through pf_proto_register(). */ int ip6proto_register(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] != pr - inet6sw) /* IPPROTO_RAW */ return (EEXIST); /* * Find the protocol position in inet6sw[] and set the index. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol == ip6proto) { ip6_protox[pr->pr_protocol] = pr - inet6sw; return (0); } } return (EPROTONOSUPPORT); } int ip6proto_unregister(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] == pr - inet6sw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. 
*/ ip6_protox[ip6proto] = pr - inet6sw; return (0); } #ifdef VIMAGE static void ip6_destroy(void *unused __unused) { struct ifaddr *ifa, *nifa; struct ifnet *ifp; int error; #ifdef RSS netisr_unregister_vnet(&ip6_direct_nh); #endif netisr_unregister_vnet(&ip6_nh); pfil_head_unregister(V_inet6_pfil_head); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } /* Cleanup addresses. */ IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* IF_ADDR_LOCK(ifp); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } /* IF_ADDR_UNLOCK(ifp); */ in6_ifdetach_destroy(ifp); mld_domifdetach(ifp); /* Make sure any routes are gone as well. */ rt_flushifroutes_af(ifp, AF_INET6); } IFNET_RUNLOCK(); nd6_destroy(); in6_ifattach_destroy(); hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask); } VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL); #endif static int -ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off, +ip6_input_hbh(struct mbuf **mp, uint32_t *plen, uint32_t *rtalert, int *off, int *nxt, int *ours) { + struct mbuf *m; struct ip6_hdr *ip6; struct ip6_hbh *hbh; - if (ip6_hopopts_input(plen, rtalert, &m, off)) { + if (ip6_hopopts_input(plen, rtalert, mp, off)) { #if 0 /*touches NULL pointer*/ - in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); + in6_ifstat_inc((*mp)->m_pkthdr.rcvif, ifs6_in_discard); #endif goto out; /* m have already been freed */ } /* adjust pointer */ + m = *mp; ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && *plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ IP6STAT_INC(ip6s_badoptions); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); goto out; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); goto out; } #endif *nxt = hbh->ip6h_nxt; /* * If we are acting as a router and the packet contains a * router alert option, see if we know the option value. * Currently, we only support the option value for MLD, in which * case we should pass the packet to the multicast routing * daemon. */ if (*rtalert != ~0) { switch (*rtalert) { case IP6OPT_RTALERT_MLD: if (V_ip6_forwarding) *ours = 1; break; default: /* * RFC2711 requires unrecognized values must be * silently ignored. */ break; } } return (0); out: return (1); } #ifdef RSS /* * IPv6 direct input routine. 
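The signature change to ip6_input_hbh() in this revision exists because ip6_hopopts_input() may replace the mbuf it is handed (for example by pulling header bytes into a new chain); with a plain struct mbuf * the caller in ip6_input() could keep dereferencing a stale pointer after the helper returns. The general shape of the fix, reduced to a hypothetical buffer type with libc allocation so it is self-contained:

#include <stdlib.h>
#include <string.h>

/*
 * Sketch of why the callee takes a double pointer: a helper that may
 * replace the buffer it is given must hand the new pointer back, or
 * the caller keeps using freed memory.  'struct buf' is a hypothetical
 * stand-in for struct mbuf.
 */
struct buf {
	unsigned char	*data;
	size_t		 len;
};

static int
pullup_sketch(struct buf **bp, size_t need)
{
	struct buf *old = *bp, *n;

	if (old->len >= need)
		return (0);			/* already contiguous enough */

	/* Build a replacement buffer; the old head is freed below. */
	n = malloc(sizeof(*n));
	if (n == NULL)
		return (-1);
	n->data = calloc(1, need);
	if (n->data == NULL) {
		free(n);
		return (-1);
	}
	memcpy(n->data, old->data, old->len);
	n->len = need;
	free(old->data);
	free(old);
	*bp = n;	/* caller must reload its local pointer after this */
	return (0);
}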
* * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip6_direct_input(struct mbuf *m) { int off, nxt; int nest; struct m_tag *mtag; struct ip6_direct_ctx *ip6dc; mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL); KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!")); ip6dc = (struct ip6_direct_ctx *)(mtag + 1); nxt = ip6dc->ip6dc_nxt; off = ip6dc->ip6dc_off; nest = 0; m_tag_delete(m, mtag); while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv6)) { if (IPSEC_INPUT(ipv6, m, off, nxt) != 0) return; } #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } #endif void ip6_input(struct mbuf *m) { struct in6_addr odst; struct ip6_hdr *ip6; struct in6_ifaddr *ia; struct ifnet *rcvif; u_int32_t plen; u_int32_t rtalert = ~0; int off = sizeof(struct ip6_hdr), nest; int nxt, ours = 0; int srcrt = 0; /* * Drop the packet if IPv6 operation is disabled on the interface. */ rcvif = m->m_pkthdr.rcvif; if ((ND_IFINFO(rcvif)->flags & ND6_IFF_IFDISABLED)) goto bad; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * should the inner packet be considered authentic? * see comment in ah4_input(). * NB: m cannot be NULL when passed to the input routine */ m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; #endif /* IPSEC */ if (m->m_flags & M_FASTFWD_OURS) { /* * Firewall changed destination to local. */ ip6 = mtod(m, struct ip6_hdr *); goto passin; } /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) IP6STAT_INC(ip6s_mext2m); else IP6STAT_INC(ip6s_mext1); } else { if (m->m_next) { if (m->m_flags & M_LOOP) { IP6STAT_INC(ip6s_m2m[V_loif->if_index]); } else if (rcvif->if_index < IP6S_M2MMAX) IP6STAT_INC(ip6s_m2m[rcvif->if_index]); else IP6STAT_INC(ip6s_m2m[0]); } else IP6STAT_INC(ip6s_m1); } in6_ifstat_inc(rcvif, ifs6_in_receive); IP6STAT_INC(ip6s_total); #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; if (m->m_pkthdr.len > MHLEN) n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) goto bad; m_move_pkthdr(n, m); m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */); #endif if (m->m_len < sizeof(struct ip6_hdr)) { if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]); IP_PROBE(receive, NULL, NULL, ip6, rcvif, NULL, ip6); /* * Check against address spoofing/corruption. 
*/ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) { /* * RFC4291 2.7: * Nodes must not originate a packet to a multicast address * whose scop field contains the reserved value 0; if such * a packet is received, it must be silently dropped. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { /* packet is dropped by traffic conditioner */ return; } #endif /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif /* * Try to forward the packet, but if we fail continue. * ip6_tryforward() does not generate redirects, so fall * through to normal processing if redirects are required. * ip6_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip6 pointer. */ if (V_ip6_forwarding != 0 && V_ip6_sendredirects == 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv6) || IPSEC_CAPS(ipv6, m, IPSEC_CAP_OPERABLE) == 0) #endif ) { if ((m = ip6_tryforward(m)) == NULL) return; if (m->m_flags & M_FASTFWD_OURS) { ip6 = mtod(m, struct ip6_hdr *); goto passin; } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Bypass packet filtering for packets previously handled by IPsec. */ if (IPSEC_ENABLED(ipv6) && IPSEC_CAPS(ipv6, m, IPSEC_CAP_BYPASS_FILTER) != 0) goto passin; #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. 
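The V4-mapped rejection above matters because a source of ::ffff:127.0.0.1 could otherwise masquerade as local IPv4 traffic and slip past address-based checks. The test itself is a simple prefix comparison; here is a standalone sketch (the in-tree IN6_IS_ADDR_V4MAPPED macro works on struct in6_addr, this hypothetical helper on a raw 16-byte array):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/*
 * Sketch of the IPv4-mapped IPv6 address test: the address has the
 * form ::ffff:a.b.c.d, i.e. 80 zero bits, 16 one bits, then the IPv4
 * address.  Takes the address as 16 bytes in network order.
 */
static bool
is_v4mapped_sketch(const uint8_t addr[16])
{
	static const uint8_t prefix[12] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff
	};

	return (memcmp(addr, prefix, sizeof(prefix)) == 0);
}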
*/ if (!PFIL_HOOKED_IN(V_inet6_pfil_head)) goto passin; odst = ip6->ip6_dst; if (pfil_run_hooks(V_inet6_pfil_head, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) != PFIL_PASS) return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); if ((m->m_flags & (M_IP6_NEXTHOP | M_FASTFWD_OURS)) == M_IP6_NEXTHOP && m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows forwarding * packets originally destined to us to some other directly * connected host. */ ip6_forward(m, 1); return; } passin: /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, rcvif, NULL) || in6_setscope(&ip6->ip6_dst, rcvif, NULL)) { IP6STAT_INC(ip6s_badscope); goto bad; } if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ours = 1; goto hbhcheck; } /* * Multicast check. Assume packet is for us to avoid * prematurely taking locks. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ours = 1; in6_ifstat_inc(rcvif, ifs6_in_mcast); goto hbhcheck; } /* * Unicast check * XXX: For now we keep link-local IPv6 addresses with embedded * scope zone id, therefore we use zero zoneid here. */ ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL) { if (ia->ia6_flags & IN6_IFF_NOTREADY) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst))); ifa_free(&ia->ia_ifa); goto bad; } /* Count the packet in the ip address stats */ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); ifa_free(&ia->ia_ifa); ours = 1; goto hbhcheck; } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!V_ip6_forwarding) { IP6STAT_INC(ip6s_cantforward); goto bad; } hbhcheck: /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { - if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0) + if (ip6_input_hbh(&m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; /* * Use mbuf flags to propagate Router Alert option to * ICMPv6 layer, as hop-by-hop options have been stripped. */ if (rtalert != ~0) m->m_flags |= M_RTALERT_MLD; /* * Check that the amount of data in the buffers * is as at least much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. 
*/ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (V_ip6_mrouter && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. * * XXX TODO: Check hlim and multicast scope here to avoid * unnecessarily calling into ip6_mforward(). */ if (ip6_mforward && ip6_mforward(ip6, rcvif, m)) { IP6STAT_INC(ip6s_cantforward); goto bad; } } else if (!ours) { ip6_forward(m, srcrt); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ IP6STAT_INC(ip6s_delivered); in6_ifstat_inc(rcvif, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv6)) { if (IPSEC_INPUT(ipv6, m, off, nxt) != 0) return; } #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: in6_ifstat_inc(rcvif, ifs6_in_discard); if (m != NULL) m_freem(m); } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. 
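The length reconciliation above (drop if the chain is shorter than the IPv6 payload length claims, trim if it is longer) is a small but security-relevant step, because later header parsing trusts plen rather than the real chain length. A sketch of the same decision on plain integers (hypothetical helper; it assumes the packet length is at least the fixed header size, which earlier validation guarantees):

#include <stdint.h>

#define IP6_HDR_LEN_SK	40U	/* fixed IPv6 header size */

enum trim_action { PKT_DROP, PKT_KEEP, PKT_TRIM };

/*
 * Sketch: compare the actual packet length against the length the
 * IPv6 header (or a jumbo option) advertises.  Returns what to do
 * and, for PKT_TRIM, how many trailing bytes to cut off.
 */
static enum trim_action
reconcile_len_sketch(uint32_t pkt_len, uint32_t plen, uint32_t *trim)
{
	*trim = 0;
	/* pkt_len >= IP6_HDR_LEN_SK is assumed to hold here. */
	if (pkt_len - IP6_HDR_LEN_SK < plen)
		return (PKT_DROP);		/* shorter than advertised */
	if (pkt_len > IP6_HDR_LEN_SK + plen) {
		*trim = pkt_len - (IP6_HDR_LEN_SK + plen);
		return (PKT_TRIM);		/* trailing junk to chop off */
	}
	return (PKT_KEEP);
}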
* * rtalertp - XXX: should be stored more smart way */ static int ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return (-1); *offp = off; *mp = m; return (0); } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself process its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that hbh header is located right after the IPv6 header * (RFC2460 p7), opthead is pointer into data content in m, and opthead to * opthead + hbhlen is located in contiguous memory region. */ int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options does not make sense, however, * there's no explicit mention in specification. 
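Hop-by-hop option processing is a TLV walk in which only Pad1 lacks a length byte; every other option is type, length, then data, and the loop must bound-check the remaining space before reading the length. A reduced sketch of that walk, handling just the padding cases and treating everything else as an opaque option (hypothetical helper; the real loop above also dispatches the router-alert and jumbo-payload options):

#include <stddef.h>
#include <stdint.h>

#define OPT_PAD1	0x00	/* single zero byte, no length field */
#define OPT_PADN	0x01	/* type, length, then padding bytes */

/*
 * Sketch of the hop-by-hop TLV walk: returns 0 if the option area
 * parses cleanly, -1 on a malformed (truncated) option.
 */
static int
walk_hbh_options_sketch(const uint8_t *opt, size_t len)
{
	while (len > 0) {
		if (opt[0] == OPT_PAD1) {	/* only option without a length */
			opt++;
			len--;
			continue;
		}
		if (len < 2)			/* no room for type + length */
			return (-1);
		size_t optlen = (size_t)opt[1] + 2;
		if (optlen > len)		/* option runs past the header */
			return (-1);
		/*
		 * PadN and unrecognized options are simply skipped here;
		 * the kernel dispatches on the type (router alert, jumbo).
		 */
		opt += optlen;
		len -= optlen;
	}
	return (0);
}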
*/ if (*plenp != 0) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header the and option header and IPv6 header * is not contiguous in order to return an ICMPv6 error. */ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ IP6STAT_INC(ip6s_badoptions); ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } /* * Create the "control" list for this pcb. * These functions will not modify mbuf chain at all. * * With KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. * * ip6_savecontrol_v4 will handle those options that are possible to be * set on a v4-mapped socket. * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those * options and handle the v6-only ones itself. 
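ip6_unknown_opt() relies on the fact that the two high-order bits of an option type encode how an unrecognized option must be handled: skip it, discard silently, discard and always send an ICMPv6 Parameter Problem, or discard and send the ICMP error only for non-multicast destinations (RFC 2460 / RFC 8200, section 4.2). A small sketch of that decode; the enum names are my own:

#include <stdint.h>

enum unknown_opt_action {
	OPT_SKIP,		/* 00: skip over, keep processing */
	OPT_DISCARD,		/* 01: discard silently */
	OPT_DISCARD_ICMP,	/* 10: discard, always send ICMP param problem */
	OPT_DISCARD_ICMP_UCAST	/* 11: discard, ICMP only if not multicast */
};

/*
 * Sketch of the unrecognized-option action decode: the action lives
 * in the two most significant bits of the option type.
 */
static enum unknown_opt_action
unknown_opt_action_sketch(uint8_t opt_type)
{
	switch (opt_type >> 6) {
	case 0:
		return (OPT_SKIP);
	case 1:
		return (OPT_DISCARD);
	case 2:
		return (OPT_DISCARD_ICMP);
	default:
		return (OPT_DISCARD_ICMP_UCAST);
	}
}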
*/ struct mbuf ** ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, int *v4only) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #ifdef SO_TIMESTAMP if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) { union { struct timeval tv; struct bintime bt; struct timespec ts; } t; struct bintime boottimebin, bt1; struct timespec ts1; bool stamped; stamped = false; switch (inp->inp_socket->so_ts_clock) { case SO_TS_REALTIME_MICRO: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt1); getboottimebin(&boottimebin); bintime_add(&bt1, &boottimebin); bintime2timeval(&bt1, &t.tv); } else { microtime(&t.tv); } *mp = sbcreatecontrol((caddr_t) &t.tv, sizeof(t.tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_BINTIME: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &t.bt); getboottimebin(&boottimebin); bintime_add(&t.bt, &boottimebin); } else { bintime(&t.bt); } *mp = sbcreatecontrol((caddr_t)&t.bt, sizeof(t.bt), SCM_BINTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_REALTIME: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &t.ts); getboottimebin(&boottimebin); bintime2timespec(&boottimebin, &ts1); timespecadd(&t.ts, &ts1, &t.ts); } else { nanotime(&t.ts); } *mp = sbcreatecontrol((caddr_t)&t.ts, sizeof(t.ts), SCM_REALTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_MONOTONIC: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) mbuf_tstmp2timespec(m, &t.ts); else nanouptime(&t.ts); *mp = sbcreatecontrol((caddr_t)&t.ts, sizeof(t.ts), SCM_MONOTONIC, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; default: panic("unknown (corrupted) so_ts_clock"); } if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { struct sock_timestamp_info sti; bzero(&sti, sizeof(sti)); sti.st_info_flags = ST_INFO_HW; if ((m->m_flags & M_TSTMP_HPREC) != 0) sti.st_info_flags |= ST_INFO_HW_HPREC; *mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti), SCM_TIME_INFO, SOL_SOCKET); if (*mp != NULL) mp = &(*mp)->m_next; } } #endif #define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 5 */ if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); pi6.ipi6_addr.s6_addr32[0] = 0; pi6.ipi6_addr.s6_addr32[1] = 0; pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr; #else /* We won't hit this code */ bzero(&pi6.ipi6_addr, sizeof(struct in6_addr)); #endif } else { bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); in6_clearscope(&pi6.ipi6_addr); /* XXX */ } pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? 
m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); hlim = ip->ip_ttl; #else /* We won't hit this code */ hlim = 0; #endif } else { hlim = ip6->ip6_hlim & 0xff; } *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_TCLASS) != 0) { int tclass; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); tclass = ip->ip_tos; #else /* We won't hit this code */ tclass = 0; #endif } else { u_int32_t flowinfo; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; } *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int), IPV6_TCLASS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if (v4only != NULL) { if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { *v4only = 1; } else { *v4only = 0; } } return (mp); } void ip6_savecontrol(struct inpcb *inp, struct mbuf *m, struct mbuf **mp) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int v4only = 0; mp = ip6_savecontrol_v4(inp, m, mp, &v4only); if (v4only) return; /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ if ((inp->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through the * IPv6 input processing. */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif /* * XXX: We copy the whole header even if a * jumbo payload option is included, the option which * is to be removed before returning according to * RFC2292. * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IS2292(inp, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } if ((inp->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and stores each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain. 
*/ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: goto loopend; } #ifndef PULLDOWN_TEST if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; #else ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif switch (nxt) { case IPPROTO_DSTOPTS: if (!(inp->inp_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(inp, IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if (!(inp->inp_flags & IN6P_RTHDR)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(inp, IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ #ifdef PULLDOWN_TEST m_freem(ext); #endif goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; #ifdef PULLDOWN_TEST m_freem(ext); ext = NULL; #endif } loopend: ; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IPV6_FLOWID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IPV6_FLOWTYPE, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IPV6_RSSBUCKETID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } } #endif } #undef IS2292 void ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu) { struct socket *so; struct mbuf *m_mtu; struct ip6_mtuinfo mtuctl; KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); /* * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. * NOTE: we notify disconnected sockets, because some udp * applications keep sending sockets disconnected. * NOTE: our implementation doesn't notify connected sockets that has * foreign address that is different than given destination addresses * (this is permitted by RFC 3542). 
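/*
 * Editor's note -- illustrative only: ip6_notify_pmtu() below surfaces to
 * applications as an IPV6_PATHMTU control message once IPV6_RECVPATHMTU has
 * been enabled on the socket; the payload is the same struct ip6_mtuinfo the
 * kernel fills in.  Per RFC 3542 the structure comes with <netinet/in.h>
 * (header location assumed here).  Receive-side check, reusing the
 * recvmsg()/CMSG loop shown earlier:
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

static void
demo_check_pathmtu(const struct cmsghdr *cm)
{
	struct ip6_mtuinfo mtuinfo;

	if (cm->cmsg_level == IPPROTO_IPV6 && cm->cmsg_type == IPV6_PATHMTU) {
		memcpy(&mtuinfo, CMSG_DATA(cm), sizeof(mtuinfo));
		/* mtuinfo.ip6m_addr: destination; mtuinfo.ip6m_mtu: new MTU */
	}
}

/*
 * Enable delivery beforehand:
 *	int on = 1;
 *	setsockopt(s, IPPROTO_IPV6, IPV6_RECVPATHMTU, &on, sizeof(on));
 */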
*/ if ((inp->inp_flags & IN6P_MTU) == 0 || ( !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr))) return; mtuctl.ip6m_mtu = mtu; mtuctl.ip6m_addr = *dst; if (sa6_recoverscope(&mtuctl.ip6m_addr)) return; if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl), IPV6_PATHMTU, IPPROTO_IPV6)) == NULL) return; so = inp->inp_socket; if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu) == 0) { m_freem(m_mtu); /* XXX: should count statistics */ } else sorwakeup(so); } #ifdef PULLDOWN_TEST /* * pull single extension header from mbuf chain. returns single mbuf that * contains the result, or NULL on error. */ static struct mbuf * ip6_pullexthdr(struct mbuf *m, size_t off, int nxt) { struct ip6_ext ip6e; size_t elen; struct mbuf *n; #ifdef DIAGNOSTIC switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: printf("ip6_pullexthdr: invalid nxt=%d\n", nxt); } #endif m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxt == IPPROTO_AH) elen = (ip6e.ip6e_len + 2) << 2; else elen = (ip6e.ip6e_len + 1) << 3; if (elen > MLEN) n = m_getcl(M_NOWAIT, MT_DATA, 0); else n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) return NULL; m_copydata(m, off, elen, mtod(n, caddr_t)); n->m_len = elen; return n; } #endif /* * Get pointer to the previous header followed by the header * currently processed. */ int ip6_get_prevhdr(const struct mbuf *m, int off) { struct ip6_ext ip6e; struct ip6_hdr *ip6; int len, nlen, nxt; if (off == sizeof(struct ip6_hdr)) return (offsetof(struct ip6_hdr, ip6_nxt)); if (off < sizeof(struct ip6_hdr)) panic("%s: off < sizeof(struct ip6_hdr)", __func__); ip6 = mtod(m, struct ip6_hdr *); nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); nlen = 0; while (len < off) { m_copydata(m, len, sizeof(ip6e), (caddr_t)&ip6e); switch (nxt) { case IPPROTO_FRAGMENT: nlen = sizeof(struct ip6_frag); break; case IPPROTO_AH: nlen = (ip6e.ip6e_len + 2) << 2; break; default: nlen = (ip6e.ip6e_len + 1) << 3; } len += nlen; nxt = ip6e.ip6e_nxt; } return (len - nlen); } /* * get next header offset. m will be retained. */ int ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp) { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. 
*/ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); /* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */ if (fh.ip6f_offlg & IP6F_OFF_MASK) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } /* NOTREACHED */ } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp) { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } /* * System control for IP6 */ u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; Index: projects/clang900-import/sys/sys/pmc.h =================================================================== --- projects/clang900-import/sys/sys/pmc.h (revision 352536) +++ projects/clang900-import/sys/sys/pmc.h (revision 352537) @@ -1,1230 +1,1232 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2008, Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _SYS_PMC_H_ #define _SYS_PMC_H_ #include #include #include #include #include #ifdef _KERNEL #include #include #endif #define PMC_MODULE_NAME "hwpmc" #define PMC_NAME_MAX 64 /* HW counter name size */ #define PMC_CLASS_MAX 8 /* max #classes of PMCs per-system */ /* * Kernel<->userland API version number [MMmmpppp] * * Major numbers are to be incremented when an incompatible change to * the ABI occurs that older clients will not be able to handle. * * Minor numbers are incremented when a backwards compatible change * occurs that allows older correct programs to run unchanged. For * example, when support for a new PMC type is added. * * The patch version is incremented for every bug fix. */ #define PMC_VERSION_MAJOR 0x09 #define PMC_VERSION_MINOR 0x03 #define PMC_VERSION_PATCH 0x0000 #define PMC_VERSION (PMC_VERSION_MAJOR << 24 | \ PMC_VERSION_MINOR << 16 | PMC_VERSION_PATCH) #define PMC_CPUID_LEN 64 /* cpu model name for pmu lookup */ extern char pmc_cpuid[PMC_CPUID_LEN]; /* * Kinds of CPUs known. * * We keep track of CPU variants that need to be distinguished in * some way for PMC operations. CPU names are grouped by manufacturer * and numbered sparsely in order to minimize changes to the ABI involved * when new CPUs are added. */ #define __PMC_CPUS() \ __PMC_CPU(AMD_K7, 0x00, "AMD K7") \ __PMC_CPU(AMD_K8, 0x01, "AMD K8") \ __PMC_CPU(INTEL_P5, 0x80, "Intel Pentium") \ __PMC_CPU(INTEL_P6, 0x81, "Intel Pentium Pro") \ __PMC_CPU(INTEL_CL, 0x82, "Intel Celeron") \ __PMC_CPU(INTEL_PII, 0x83, "Intel Pentium II") \ __PMC_CPU(INTEL_PIII, 0x84, "Intel Pentium III") \ __PMC_CPU(INTEL_PM, 0x85, "Intel Pentium M") \ __PMC_CPU(INTEL_PIV, 0x86, "Intel Pentium IV") \ __PMC_CPU(INTEL_CORE, 0x87, "Intel Core Solo/Duo") \ __PMC_CPU(INTEL_CORE2, 0x88, "Intel Core2") \ __PMC_CPU(INTEL_CORE2EXTREME, 0x89, "Intel Core2 Extreme") \ __PMC_CPU(INTEL_ATOM, 0x8A, "Intel Atom") \ __PMC_CPU(INTEL_COREI7, 0x8B, "Intel Core i7") \ __PMC_CPU(INTEL_WESTMERE, 0x8C, "Intel Westmere") \ __PMC_CPU(INTEL_SANDYBRIDGE, 0x8D, "Intel Sandy Bridge") \ __PMC_CPU(INTEL_IVYBRIDGE, 0x8E, "Intel Ivy Bridge") \ __PMC_CPU(INTEL_SANDYBRIDGE_XEON, 0x8F, "Intel Sandy Bridge Xeon") \ __PMC_CPU(INTEL_IVYBRIDGE_XEON, 0x90, "Intel Ivy Bridge Xeon") \ __PMC_CPU(INTEL_HASWELL, 0x91, "Intel Haswell") \ __PMC_CPU(INTEL_ATOM_SILVERMONT, 0x92, "Intel Atom Silvermont") \ __PMC_CPU(INTEL_NEHALEM_EX, 0x93, "Intel Nehalem Xeon 7500") \ __PMC_CPU(INTEL_WESTMERE_EX, 0x94, "Intel Westmere Xeon E7") \ __PMC_CPU(INTEL_HASWELL_XEON, 0x95, "Intel Haswell Xeon E5 v3") \ __PMC_CPU(INTEL_BROADWELL, 0x96, "Intel Broadwell") \ __PMC_CPU(INTEL_BROADWELL_XEON, 0x97, "Intel Broadwell Xeon") \ __PMC_CPU(INTEL_SKYLAKE, 0x98, "Intel Skylake") \ __PMC_CPU(INTEL_SKYLAKE_XEON, 0x99, "Intel Skylake Xeon") \ __PMC_CPU(INTEL_XSCALE, 0x100, "Intel XScale") \ __PMC_CPU(MIPS_24K, 0x200, "MIPS 24K") \ __PMC_CPU(MIPS_OCTEON, 0x201, "Cavium Octeon") \ __PMC_CPU(MIPS_74K, 0x202, "MIPS 74K") \ + __PMC_CPU(MIPS_BERI, 0x203, "BERI") \ __PMC_CPU(PPC_7450, 0x300, "PowerPC MPC7450") \ __PMC_CPU(PPC_E500, 0x340, "PowerPC e500 Core") \ __PMC_CPU(PPC_970, 0x380, "IBM PowerPC 970") \ __PMC_CPU(GENERIC, 0x400, "Generic") \ __PMC_CPU(ARMV7_CORTEX_A5, 0x500, "ARMv7 Cortex A5") \ __PMC_CPU(ARMV7_CORTEX_A7, 0x501, "ARMv7 Cortex A7") \ __PMC_CPU(ARMV7_CORTEX_A8, 0x502, "ARMv7 Cortex A8") \ __PMC_CPU(ARMV7_CORTEX_A9, 0x503, "ARMv7 Cortex A9") \ __PMC_CPU(ARMV7_CORTEX_A15, 0x504, "ARMv7 Cortex A15") \ __PMC_CPU(ARMV7_CORTEX_A17, 0x505, "ARMv7 Cortex A17") \ 
__PMC_CPU(ARMV8_CORTEX_A53, 0x600, "ARMv8 Cortex A53") \ __PMC_CPU(ARMV8_CORTEX_A57, 0x601, "ARMv8 Cortex A57") enum pmc_cputype { #undef __PMC_CPU #define __PMC_CPU(S,V,D) PMC_CPU_##S = V, __PMC_CPUS() }; #define PMC_CPU_FIRST PMC_CPU_AMD_K7 #define PMC_CPU_LAST PMC_CPU_GENERIC /* * Classes of PMCs */ #define __PMC_CLASSES() \ __PMC_CLASS(TSC, 0x00, "CPU Timestamp counter") \ __PMC_CLASS(K7, 0x01, "AMD K7 performance counters") \ __PMC_CLASS(K8, 0x02, "AMD K8 performance counters") \ __PMC_CLASS(P5, 0x03, "Intel Pentium counters") \ __PMC_CLASS(P6, 0x04, "Intel Pentium Pro counters") \ __PMC_CLASS(P4, 0x05, "Intel Pentium-IV counters") \ __PMC_CLASS(IAF, 0x06, "Intel Core2/Atom, fixed function") \ __PMC_CLASS(IAP, 0x07, "Intel Core...Atom, programmable") \ __PMC_CLASS(UCF, 0x08, "Intel Uncore fixed function") \ __PMC_CLASS(UCP, 0x09, "Intel Uncore programmable") \ __PMC_CLASS(XSCALE, 0x0A, "Intel XScale counters") \ __PMC_CLASS(MIPS24K, 0x0B, "MIPS 24K") \ __PMC_CLASS(OCTEON, 0x0C, "Cavium Octeon") \ __PMC_CLASS(PPC7450, 0x0D, "Motorola MPC7450 class") \ __PMC_CLASS(PPC970, 0x0E, "IBM PowerPC 970 class") \ __PMC_CLASS(SOFT, 0x0F, "Software events") \ __PMC_CLASS(ARMV7, 0x10, "ARMv7") \ __PMC_CLASS(ARMV8, 0x11, "ARMv8") \ __PMC_CLASS(MIPS74K, 0x12, "MIPS 74K") \ - __PMC_CLASS(E500, 0x13, "Freescale e500 class") + __PMC_CLASS(E500, 0x13, "Freescale e500 class") \ + __PMC_CLASS(BERI, 0x14, "MIPS BERI") enum pmc_class { #undef __PMC_CLASS #define __PMC_CLASS(S,V,D) PMC_CLASS_##S = V, __PMC_CLASSES() }; #define PMC_CLASS_FIRST PMC_CLASS_TSC #define PMC_CLASS_LAST PMC_CLASS_E500 /* * A PMC can be in the following states: * * Hardware states: * DISABLED -- administratively prohibited from being used. * FREE -- HW available for use * Software states: * ALLOCATED -- allocated * STOPPED -- allocated, but not counting events * RUNNING -- allocated, and in operation; 'pm_runcount' * holds the number of CPUs using this PMC at * a given instant * DELETED -- being destroyed */ #define __PMC_HWSTATES() \ __PMC_STATE(DISABLED) \ __PMC_STATE(FREE) #define __PMC_SWSTATES() \ __PMC_STATE(ALLOCATED) \ __PMC_STATE(STOPPED) \ __PMC_STATE(RUNNING) \ __PMC_STATE(DELETED) #define __PMC_STATES() \ __PMC_HWSTATES() \ __PMC_SWSTATES() enum pmc_state { #undef __PMC_STATE #define __PMC_STATE(S) PMC_STATE_##S, __PMC_STATES() __PMC_STATE(MAX) }; #define PMC_STATE_FIRST PMC_STATE_DISABLED #define PMC_STATE_LAST PMC_STATE_DELETED /* * An allocated PMC may used as a 'global' counter or as a * 'thread-private' one. Each such mode of use can be in either * statistical sampling mode or in counting mode. Thus a PMC in use * * SS i.e., SYSTEM STATISTICAL -- system-wide statistical profiling * SC i.e., SYSTEM COUNTER -- system-wide counting mode * TS i.e., THREAD STATISTICAL -- thread virtual, statistical profiling * TC i.e., THREAD COUNTER -- thread virtual, counting mode * * Statistical profiling modes rely on the PMC periodically delivering * a interrupt to the CPU (when the configured number of events have * been measured), so the PMC must have the ability to generate * interrupts. * * In counting modes, the PMC counts its configured events, with the * value of the PMC being read whenever needed by its owner process. * * The thread specific modes "virtualize" the PMCs -- the PMCs appear * to be thread private and count events only when the profiled thread * actually executes on the CPU. * * The system-wide "global" modes keep the PMCs running all the time * and are used to measure the behaviour of the whole system. 
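/*
 * Editor's note -- illustrative only, and the libpmc prototypes below are
 * quoted from memory for roughly this vintage of the tree; consult pmc(3)
 * for the authoritative interface, and note that the event name string is
 * platform-dependent.  The four modes above map directly onto what userland
 * asks for, e.g. a thread-virtual counting PMC (PMC_MODE_TC) measuring the
 * calling process:
 */
#include <pmc.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static void
demo_count_instructions(void)
{
	pmc_id_t pmcid;
	pmc_value_t v;

	if (pmc_init() < 0)
		return;
	if (pmc_allocate("instructions", PMC_MODE_TC, 0, PMC_CPU_ANY,
	    &pmcid, 0) < 0)
		return;
	pmc_attach(pmcid, 0);		/* pid 0: attach to ourselves */
	pmc_start(pmcid);
	/* ... workload being measured ... */
	pmc_stop(pmcid);
	pmc_read(pmcid, &v);
	printf("instructions: %ju\n", (uintmax_t)v);
	pmc_release(pmcid);
}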
*/ #define __PMC_MODES() \ __PMC_MODE(SS, 0) \ __PMC_MODE(SC, 1) \ __PMC_MODE(TS, 2) \ __PMC_MODE(TC, 3) enum pmc_mode { #undef __PMC_MODE #define __PMC_MODE(M,N) PMC_MODE_##M = N, __PMC_MODES() }; #define PMC_MODE_FIRST PMC_MODE_SS #define PMC_MODE_LAST PMC_MODE_TC #define PMC_IS_COUNTING_MODE(mode) \ ((mode) == PMC_MODE_SC || (mode) == PMC_MODE_TC) #define PMC_IS_SYSTEM_MODE(mode) \ ((mode) == PMC_MODE_SS || (mode) == PMC_MODE_SC) #define PMC_IS_SAMPLING_MODE(mode) \ ((mode) == PMC_MODE_SS || (mode) == PMC_MODE_TS) #define PMC_IS_VIRTUAL_MODE(mode) \ ((mode) == PMC_MODE_TS || (mode) == PMC_MODE_TC) /* * PMC row disposition */ #define __PMC_DISPOSITIONS(N) \ __PMC_DISP(STANDALONE) /* global/disabled counters */ \ __PMC_DISP(FREE) /* free/available */ \ __PMC_DISP(THREAD) /* thread-virtual PMCs */ \ __PMC_DISP(UNKNOWN) /* sentinel */ enum pmc_disp { #undef __PMC_DISP #define __PMC_DISP(D) PMC_DISP_##D , __PMC_DISPOSITIONS() }; #define PMC_DISP_FIRST PMC_DISP_STANDALONE #define PMC_DISP_LAST PMC_DISP_THREAD /* * Counter capabilities * * __PMC_CAPS(NAME, VALUE, DESCRIPTION) */ #define __PMC_CAPS() \ __PMC_CAP(INTERRUPT, 0, "generate interrupts") \ __PMC_CAP(USER, 1, "count user-mode events") \ __PMC_CAP(SYSTEM, 2, "count system-mode events") \ __PMC_CAP(EDGE, 3, "do edge detection of events") \ __PMC_CAP(THRESHOLD, 4, "ignore events below a threshold") \ __PMC_CAP(READ, 5, "read PMC counter") \ __PMC_CAP(WRITE, 6, "reprogram PMC counter") \ __PMC_CAP(INVERT, 7, "invert comparison sense") \ __PMC_CAP(QUALIFIER, 8, "further qualify monitored events") \ __PMC_CAP(PRECISE, 9, "perform precise sampling") \ __PMC_CAP(TAGGING, 10, "tag upstream events") \ __PMC_CAP(CASCADE, 11, "cascade counters") enum pmc_caps { #undef __PMC_CAP #define __PMC_CAP(NAME, VALUE, DESCR) PMC_CAP_##NAME = (1 << VALUE) , __PMC_CAPS() }; #define PMC_CAP_FIRST PMC_CAP_INTERRUPT #define PMC_CAP_LAST PMC_CAP_CASCADE /* * PMC Event Numbers * * These are generated from the definitions in "dev/hwpmc/pmc_events.h". */ enum pmc_event { #undef __PMC_EV #undef __PMC_EV_BLOCK #define __PMC_EV_BLOCK(C,V) PMC_EV_ ## C ## __BLOCK_START = (V) - 1 , #define __PMC_EV(C,N) PMC_EV_ ## C ## _ ## N , __PMC_EVENTS() }; /* * PMC SYSCALL INTERFACE */ /* * "PMC_OPS" -- these are the commands recognized by the kernel * module, and are used when performing a system call from userland. */ #define __PMC_OPS() \ __PMC_OP(CONFIGURELOG, "Set log file") \ __PMC_OP(FLUSHLOG, "Flush log file") \ __PMC_OP(GETCPUINFO, "Get system CPU information") \ __PMC_OP(GETDRIVERSTATS, "Get driver statistics") \ __PMC_OP(GETMODULEVERSION, "Get module version") \ __PMC_OP(GETPMCINFO, "Get per-cpu PMC information") \ __PMC_OP(PMCADMIN, "Set PMC state") \ __PMC_OP(PMCALLOCATE, "Allocate and configure a PMC") \ __PMC_OP(PMCATTACH, "Attach a PMC to a process") \ __PMC_OP(PMCDETACH, "Detach a PMC from a process") \ __PMC_OP(PMCGETMSR, "Get a PMC's hardware address") \ __PMC_OP(PMCRELEASE, "Release a PMC") \ __PMC_OP(PMCRW, "Read/Set a PMC") \ __PMC_OP(PMCSETCOUNT, "Set initial count/sampling rate") \ __PMC_OP(PMCSTART, "Start a PMC") \ __PMC_OP(PMCSTOP, "Stop a PMC") \ __PMC_OP(WRITELOG, "Write a cookie to the log file") \ __PMC_OP(CLOSELOG, "Close log file") \ __PMC_OP(GETDYNEVENTINFO, "Get dynamic events list") enum pmc_ops { #undef __PMC_OP #define __PMC_OP(N, D) PMC_OP_##N, __PMC_OPS() }; /* * Flags used in operations on PMCs. 
*/ #define PMC_F_UNUSED1 0x00000001 /* unused */ #define PMC_F_DESCENDANTS 0x00000002 /*OP ALLOCATE track descendants */ #define PMC_F_LOG_PROCCSW 0x00000004 /*OP ALLOCATE track ctx switches */ #define PMC_F_LOG_PROCEXIT 0x00000008 /*OP ALLOCATE log proc exits */ #define PMC_F_NEWVALUE 0x00000010 /*OP RW write new value */ #define PMC_F_OLDVALUE 0x00000020 /*OP RW get old value */ /* V2 API */ #define PMC_F_CALLCHAIN 0x00000080 /*OP ALLOCATE capture callchains */ #define PMC_F_USERCALLCHAIN 0x00000100 /*OP ALLOCATE use userspace stack */ /* internal flags */ #define PMC_F_ATTACHED_TO_OWNER 0x00010000 /*attached to owner*/ #define PMC_F_NEEDS_LOGFILE 0x00020000 /*needs log file */ #define PMC_F_ATTACH_DONE 0x00040000 /*attached at least once */ #define PMC_CALLCHAIN_DEPTH_MAX 512 #define PMC_CC_F_USERSPACE 0x01 /*userspace callchain*/ /* * Cookies used to denote allocated PMCs, and the values of PMCs. */ typedef uint32_t pmc_id_t; typedef uint64_t pmc_value_t; #define PMC_ID_INVALID (~ (pmc_id_t) 0) /* * PMC IDs have the following format: * * +-----------------------+-------+-----------+ * | CPU | PMC MODE | CLASS | ROW INDEX | * +-----------------------+-------+-----------+ * * where CPU is 12 bits, MODE 8, CLASS 4, and ROW INDEX 8 Field 'CPU' * is set to the requested CPU for system-wide PMCs or PMC_CPU_ANY for * process-mode PMCs. Field 'PMC MODE' is the allocated PMC mode. * Field 'PMC CLASS' is the class of the PMC. Field 'ROW INDEX' is the * row index for the PMC. * * The 'ROW INDEX' ranges over 0..NWPMCS where NHWPMCS is the total * number of hardware PMCs on this cpu. */ #define PMC_ID_TO_ROWINDEX(ID) ((ID) & 0xFF) #define PMC_ID_TO_CLASS(ID) (((ID) & 0xF00) >> 8) #define PMC_ID_TO_MODE(ID) (((ID) & 0xFF000) >> 12) #define PMC_ID_TO_CPU(ID) (((ID) & 0xFFF00000) >> 20) #define PMC_ID_MAKE_ID(CPU,MODE,CLASS,ROWINDEX) \ ((((CPU) & 0xFFF) << 20) | (((MODE) & 0xFF) << 12) | \ (((CLASS) & 0xF) << 8) | ((ROWINDEX) & 0xFF)) /* * Data structures for system calls supported by the pmc driver. */ /* * OP PMCALLOCATE * * Allocate a PMC on the named CPU. */ #define PMC_CPU_ANY ~0 struct pmc_op_pmcallocate { uint32_t pm_caps; /* PMC_CAP_* */ uint32_t pm_cpu; /* CPU number or PMC_CPU_ANY */ enum pmc_class pm_class; /* class of PMC desired */ enum pmc_event pm_ev; /* [enum pmc_event] desired */ uint32_t pm_flags; /* additional modifiers PMC_F_* */ enum pmc_mode pm_mode; /* desired mode */ pmc_id_t pm_pmcid; /* [return] process pmc id */ pmc_value_t pm_count; /* initial/sample count */ union pmc_md_op_pmcallocate pm_md; /* MD layer extensions */ }; /* * OP PMCADMIN * * Set the administrative state (i.e., whether enabled or disabled) of * a PMC 'pm_pmc' on CPU 'pm_cpu'. Note that 'pm_pmc' specifies an * absolute PMC number and need not have been first allocated by the * calling process. */ struct pmc_op_pmcadmin { int pm_cpu; /* CPU# */ uint32_t pm_flags; /* flags */ int pm_pmc; /* PMC# */ enum pmc_state pm_state; /* desired state */ }; /* * OP PMCATTACH / OP PMCDETACH * * Attach/detach a PMC and a process. */ struct pmc_op_pmcattach { pmc_id_t pm_pmc; /* PMC to attach to */ pid_t pm_pid; /* target process */ }; /* * OP PMCSETCOUNT * * Set the sampling rate (i.e., the reload count) for statistical counters. * 'pm_pmcid' need to have been previously allocated using PMCALLOCATE. */ struct pmc_op_pmcsetcount { pmc_value_t pm_count; /* initial/sample count */ pmc_id_t pm_pmcid; /* PMC id to set */ }; /* * OP PMCRW * * Read the value of a PMC named by 'pm_pmcid'. 
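/*
 * Editor's note -- illustrative only: a round trip through the PMC ID
 * macros defined above, showing how CPU, mode, class and row index are
 * packed into the 32-bit pmc_id_t (12/8/4/8 bits respectively).
 */
static void
demo_pmc_id_roundtrip(void)
{
	pmc_id_t id;

	id = PMC_ID_MAKE_ID(/* cpu */ 2, PMC_MODE_SS, PMC_CLASS_IAP,
	    /* rowindex */ 3);

	/*
	 * PMC_ID_TO_CPU(id)      == 2
	 * PMC_ID_TO_MODE(id)     == PMC_MODE_SS
	 * PMC_ID_TO_CLASS(id)    == PMC_CLASS_IAP
	 * PMC_ID_TO_ROWINDEX(id) == 3
	 */
	(void)id;
}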
'pm_pmcid' needs * to have been previously allocated using PMCALLOCATE. */ struct pmc_op_pmcrw { uint32_t pm_flags; /* PMC_F_{OLD,NEW}VALUE*/ pmc_id_t pm_pmcid; /* pmc id */ pmc_value_t pm_value; /* new&returned value */ }; /* * OP GETPMCINFO * * retrieve PMC state for a named CPU. The caller is expected to * allocate 'npmc' * 'struct pmc_info' bytes of space for the return * values. */ struct pmc_info { char pm_name[PMC_NAME_MAX]; /* pmc name */ enum pmc_class pm_class; /* enum pmc_class */ int pm_enabled; /* whether enabled */ enum pmc_disp pm_rowdisp; /* FREE, THREAD or STANDLONE */ pid_t pm_ownerpid; /* owner, or -1 */ enum pmc_mode pm_mode; /* current mode [enum pmc_mode] */ enum pmc_event pm_event; /* current event */ uint32_t pm_flags; /* current flags */ pmc_value_t pm_reloadcount; /* sampling counters only */ }; struct pmc_op_getpmcinfo { int32_t pm_cpu; /* 0 <= cpu < mp_maxid */ struct pmc_info pm_pmcs[]; /* space for 'npmc' structures */ }; /* * OP GETCPUINFO * * Retrieve system CPU information. */ struct pmc_classinfo { enum pmc_class pm_class; /* class id */ uint32_t pm_caps; /* counter capabilities */ uint32_t pm_width; /* width of the PMC */ uint32_t pm_num; /* number of PMCs in class */ }; struct pmc_op_getcpuinfo { enum pmc_cputype pm_cputype; /* what kind of CPU */ uint32_t pm_ncpu; /* max CPU number */ uint32_t pm_npmc; /* #PMCs per CPU */ uint32_t pm_nclass; /* #classes of PMCs */ struct pmc_classinfo pm_classes[PMC_CLASS_MAX]; }; /* * OP CONFIGURELOG * * Configure a log file for writing system-wide statistics to. */ struct pmc_op_configurelog { int pm_flags; int pm_logfd; /* logfile fd (or -1) */ }; /* * OP GETDRIVERSTATS * * Retrieve pmc(4) driver-wide statistics. */ #ifdef _KERNEL struct pmc_driverstats { counter_u64_t pm_intr_ignored; /* #interrupts ignored */ counter_u64_t pm_intr_processed; /* #interrupts processed */ counter_u64_t pm_intr_bufferfull; /* #interrupts with ENOSPC */ counter_u64_t pm_syscalls; /* #syscalls */ counter_u64_t pm_syscall_errors; /* #syscalls with errors */ counter_u64_t pm_buffer_requests; /* #buffer requests */ counter_u64_t pm_buffer_requests_failed; /* #failed buffer requests */ counter_u64_t pm_log_sweeps; /* #sample buffer processing passes */ counter_u64_t pm_merges; /* merged k+u */ counter_u64_t pm_overwrites; /* UR overwrites */ }; #endif struct pmc_op_getdriverstats { unsigned int pm_intr_ignored; /* #interrupts ignored */ unsigned int pm_intr_processed; /* #interrupts processed */ unsigned int pm_intr_bufferfull; /* #interrupts with ENOSPC */ unsigned int pm_syscalls; /* #syscalls */ unsigned int pm_syscall_errors; /* #syscalls with errors */ unsigned int pm_buffer_requests; /* #buffer requests */ unsigned int pm_buffer_requests_failed; /* #failed buffer requests */ unsigned int pm_log_sweeps; /* #sample buffer processing passes */ }; /* * OP RELEASE / OP START / OP STOP * * Simple operations on a PMC id. */ struct pmc_op_simple { pmc_id_t pm_pmcid; }; /* * OP WRITELOG * * Flush the current log buffer and write 4 bytes of user data to it. */ struct pmc_op_writelog { uint32_t pm_userdata; }; /* * OP GETMSR * * Retrieve the machine specific address associated with the allocated * PMC. This number can be used subsequently with a read-performance-counter * instruction. */ struct pmc_op_getmsr { uint32_t pm_msr; /* machine specific address */ pmc_id_t pm_pmcid; /* allocated pmc id */ }; /* * OP GETDYNEVENTINFO * * Retrieve a PMC dynamic class events list. 
*/ struct pmc_dyn_event_descr { char pm_ev_name[PMC_NAME_MAX]; enum pmc_event pm_ev_code; }; struct pmc_op_getdyneventinfo { enum pmc_class pm_class; unsigned int pm_nevent; struct pmc_dyn_event_descr pm_events[PMC_EV_DYN_COUNT]; }; #ifdef _KERNEL #include #include #include #include #define PMC_HASH_SIZE 1024 #define PMC_MTXPOOL_SIZE 2048 #define PMC_LOG_BUFFER_SIZE 256 #define PMC_NLOGBUFFERS_PCPU 32 #define PMC_NSAMPLES 256 #define PMC_CALLCHAIN_DEPTH 128 #define PMC_THREADLIST_MAX 128 #define PMC_SYSCTL_NAME_PREFIX "kern." PMC_MODULE_NAME "." /* * Locking keys * * (b) - pmc_bufferlist_mtx (spin lock) * (k) - pmc_kthread_mtx (sleep lock) * (o) - po->po_mtx (spin lock) * (g) - global_epoch_preempt (epoch) * (p) - pmc_sx (sx) */ /* * PMC commands */ struct pmc_syscall_args { register_t pmop_code; /* one of PMC_OP_* */ void *pmop_data; /* syscall parameter */ }; /* * Interface to processor specific s1tuff */ /* * struct pmc_descr * * Machine independent (i.e., the common parts) of a human readable * PMC description. */ struct pmc_descr { char pd_name[PMC_NAME_MAX]; /* name */ uint32_t pd_caps; /* capabilities */ enum pmc_class pd_class; /* class of the PMC */ uint32_t pd_width; /* width in bits */ }; /* * struct pmc_target * * This structure records all the target processes associated with a * PMC. */ struct pmc_target { LIST_ENTRY(pmc_target) pt_next; struct pmc_process *pt_process; /* target descriptor */ }; /* * struct pmc * * Describes each allocated PMC. * * Each PMC has precisely one owner, namely the process that allocated * the PMC. * * A PMC may be attached to multiple target processes. The * 'pm_targets' field links all the target processes being monitored * by this PMC. * * The 'pm_savedvalue' field is protected by a mutex. * * On a multi-cpu machine, multiple target threads associated with a * process-virtual PMC could be concurrently executing on different * CPUs. The 'pm_runcount' field is atomically incremented every time * the PMC gets scheduled on a CPU and atomically decremented when it * get descheduled. Deletion of a PMC is only permitted when this * field is '0'. * */ struct pmc_pcpu_state { uint8_t pps_stalled; uint8_t pps_cpustate; } __aligned(CACHE_LINE_SIZE); struct pmc { LIST_HEAD(,pmc_target) pm_targets; /* list of target processes */ LIST_ENTRY(pmc) pm_next; /* owner's list */ /* * System-wide PMCs are allocated on a CPU and are not moved * around. For system-wide PMCs we record the CPU the PMC was * allocated on in the 'CPU' field of the pmc ID. * * Virtual PMCs run on whichever CPU is currently executing * their targets' threads. For these PMCs we need to save * their current PMC counter values when they are taken off * CPU. */ union { pmc_value_t pm_savedvalue; /* Virtual PMCS */ } pm_gv; /* * For sampling mode PMCs, we keep track of the PMC's "reload * count", which is the counter value to be loaded in when * arming the PMC for the next counting session. For counting * modes on PMCs that are read-only (e.g., the x86 TSC), we * keep track of the initial value at the start of * counting-mode operation. */ union { pmc_value_t pm_reloadcount; /* sampling PMC modes */ pmc_value_t pm_initial; /* counting PMC modes */ } pm_sc; struct pmc_pcpu_state *pm_pcpu_state; volatile cpuset_t pm_cpustate; /* CPUs where PMC should be active */ uint32_t pm_caps; /* PMC capabilities */ enum pmc_event pm_event; /* event being measured */ uint32_t pm_flags; /* additional flags PMC_F_... 
*/ struct pmc_owner *pm_owner; /* owner thread state */ counter_u64_t pm_runcount; /* #cpus currently on */ enum pmc_state pm_state; /* current PMC state */ uint32_t pm_overflowcnt; /* count overflow interrupts */ /* * The PMC ID field encodes the row-index for the PMC, its * mode, class and the CPU# associated with the PMC. */ pmc_id_t pm_id; /* allocated PMC id */ enum pmc_class pm_class; /* md extensions */ union pmc_md_pmc pm_md; }; /* * Accessor macros for 'struct pmc' */ #define PMC_TO_MODE(P) PMC_ID_TO_MODE((P)->pm_id) #define PMC_TO_CLASS(P) PMC_ID_TO_CLASS((P)->pm_id) #define PMC_TO_ROWINDEX(P) PMC_ID_TO_ROWINDEX((P)->pm_id) #define PMC_TO_CPU(P) PMC_ID_TO_CPU((P)->pm_id) /* * struct pmc_threadpmcstate * * Record per-PMC, per-thread state. */ struct pmc_threadpmcstate { pmc_value_t pt_pmcval; /* per-thread reload count */ }; /* * struct pmc_thread * * Record a 'target' thread being profiled. */ struct pmc_thread { LIST_ENTRY(pmc_thread) pt_next; /* linked list */ struct thread *pt_td; /* target thread */ struct pmc_threadpmcstate pt_pmcs[]; /* per-PMC state */ }; /* * struct pmc_process * * Record a 'target' process being profiled. * * The target process being profiled could be different from the owner * process which allocated the PMCs. Each target process descriptor * is associated with NHWPMC 'struct pmc *' pointers. Each PMC at a * given hardware row-index 'n' will use slot 'n' of the 'pp_pmcs[]' * array. The size of this structure is thus PMC architecture * dependent. * */ struct pmc_targetstate { struct pmc *pp_pmc; /* target PMC */ pmc_value_t pp_pmcval; /* per-process value */ }; struct pmc_process { LIST_ENTRY(pmc_process) pp_next; /* hash chain */ LIST_HEAD(,pmc_thread) pp_tds; /* list of threads */ struct mtx *pp_tdslock; /* lock on pp_tds thread list */ int pp_refcnt; /* reference count */ uint32_t pp_flags; /* flags PMC_PP_* */ struct proc *pp_proc; /* target process */ struct pmc_targetstate pp_pmcs[]; /* NHWPMCs */ }; #define PMC_PP_ENABLE_MSR_ACCESS 0x00000001 /* * struct pmc_owner * * We associate a PMC with an 'owner' process. * * A process can be associated with 0..NCPUS*NHWPMC PMCs during its * lifetime, where NCPUS is the numbers of CPUS in the system and * NHWPMC is the number of hardware PMCs per CPU. These are * maintained in the list headed by the 'po_pmcs' to save on space. * */ struct pmc_owner { LIST_ENTRY(pmc_owner) po_next; /* hash chain */ CK_LIST_ENTRY(pmc_owner) po_ssnext; /* (g/p) list of SS PMC owners */ LIST_HEAD(, pmc) po_pmcs; /* owned PMC list */ TAILQ_HEAD(, pmclog_buffer) po_logbuffers; /* (o) logbuffer list */ struct mtx po_mtx; /* spin lock for (o) */ struct proc *po_owner; /* owner proc */ uint32_t po_flags; /* (k) flags PMC_PO_* */ struct proc *po_kthread; /* (k) helper kthread */ struct file *po_file; /* file reference */ int po_error; /* recorded error */ short po_sscount; /* # SS PMCs owned */ short po_logprocmaps; /* global mappings done */ struct pmclog_buffer *po_curbuf[MAXCPU]; /* current log buffer */ }; #define PMC_PO_OWNS_LOGFILE 0x00000001 /* has a log file */ #define PMC_PO_SHUTDOWN 0x00000010 /* in the process of shutdown */ #define PMC_PO_INITIAL_MAPPINGS_DONE 0x00000020 /* * struct pmc_hw -- describe the state of the PMC hardware * * When in use, a HW PMC is associated with one allocated 'struct pmc' * pointed to by field 'phw_pmc'. When inactive, this field is NULL. * * On an SMP box, one or more HW PMC's in process virtual mode with * the same 'phw_pmc' could be executing on different CPUs. 
In order * to handle this case correctly, we need to ensure that only * incremental counts get added to the saved value in the associated * 'struct pmc'. The 'phw_save' field is used to keep the saved PMC * value at the time the hardware is started during this context * switch (i.e., the difference between the new (hardware) count and * the saved count is atomically added to the count field in 'struct * pmc' at context switch time). * */ struct pmc_hw { uint32_t phw_state; /* see PHW_* macros below */ struct pmc *phw_pmc; /* current thread PMC */ }; #define PMC_PHW_RI_MASK 0x000000FF #define PMC_PHW_CPU_SHIFT 8 #define PMC_PHW_CPU_MASK 0x0000FF00 #define PMC_PHW_FLAGS_SHIFT 16 #define PMC_PHW_FLAGS_MASK 0xFFFF0000 #define PMC_PHW_INDEX_TO_STATE(ri) ((ri) & PMC_PHW_RI_MASK) #define PMC_PHW_STATE_TO_INDEX(state) ((state) & PMC_PHW_RI_MASK) #define PMC_PHW_CPU_TO_STATE(cpu) (((cpu) << PMC_PHW_CPU_SHIFT) & \ PMC_PHW_CPU_MASK) #define PMC_PHW_STATE_TO_CPU(state) (((state) & PMC_PHW_CPU_MASK) >> \ PMC_PHW_CPU_SHIFT) #define PMC_PHW_FLAGS_TO_STATE(flags) (((flags) << PMC_PHW_FLAGS_SHIFT) & \ PMC_PHW_FLAGS_MASK) #define PMC_PHW_STATE_TO_FLAGS(state) (((state) & PMC_PHW_FLAGS_MASK) >> \ PMC_PHW_FLAGS_SHIFT) #define PMC_PHW_FLAG_IS_ENABLED (PMC_PHW_FLAGS_TO_STATE(0x01)) #define PMC_PHW_FLAG_IS_SHAREABLE (PMC_PHW_FLAGS_TO_STATE(0x02)) /* * struct pmc_sample * * Space for N (tunable) PC samples and associated control data. */ struct pmc_sample { uint16_t ps_nsamples; /* callchain depth */ uint16_t ps_nsamples_actual; uint16_t ps_cpu; /* cpu number */ uint16_t ps_flags; /* other flags */ lwpid_t ps_tid; /* thread id */ pid_t ps_pid; /* process PID or -1 */ int ps_ticks; /* ticks at sample time */ /* pad */ struct thread *ps_td; /* which thread */ struct pmc *ps_pmc; /* interrupting PMC */ uintptr_t *ps_pc; /* (const) callchain start */ uint64_t ps_tsc; /* tsc value */ }; #define PMC_SAMPLE_FREE ((uint16_t) 0) #define PMC_USER_CALLCHAIN_PENDING ((uint16_t) 0xFFFF) struct pmc_samplebuffer { volatile uint64_t ps_prodidx; /* producer index */ volatile uint64_t ps_considx; /* consumer index */ uintptr_t *ps_callchains; /* all saved call chains */ struct pmc_sample ps_samples[]; /* array of sample entries */ }; #define PMC_CONS_SAMPLE(psb) \ (&(psb)->ps_samples[(psb)->ps_considx & pmc_sample_mask]) #define PMC_CONS_SAMPLE_OFF(psb, off) \ (&(psb)->ps_samples[(off) & pmc_sample_mask]) #define PMC_PROD_SAMPLE(psb) \ (&(psb)->ps_samples[(psb)->ps_prodidx & pmc_sample_mask]) /* * struct pmc_cpustate * * A CPU is modelled as a collection of HW PMCs with space for additional * flags. */ struct pmc_cpu { uint32_t pc_state; /* physical cpu number + flags */ struct pmc_samplebuffer *pc_sb[3]; /* space for samples */ struct pmc_hw *pc_hwpmcs[]; /* 'npmc' pointers */ }; #define PMC_PCPU_CPU_MASK 0x000000FF #define PMC_PCPU_FLAGS_MASK 0xFFFFFF00 #define PMC_PCPU_FLAGS_SHIFT 8 #define PMC_PCPU_STATE_TO_CPU(S) ((S) & PMC_PCPU_CPU_MASK) #define PMC_PCPU_STATE_TO_FLAGS(S) (((S) & PMC_PCPU_FLAGS_MASK) >> PMC_PCPU_FLAGS_SHIFT) #define PMC_PCPU_FLAGS_TO_STATE(F) (((F) << PMC_PCPU_FLAGS_SHIFT) & PMC_PCPU_FLAGS_MASK) #define PMC_PCPU_CPU_TO_STATE(C) ((C) & PMC_PCPU_CPU_MASK) #define PMC_PCPU_FLAG_HTT (PMC_PCPU_FLAGS_TO_STATE(0x1)) /* * struct pmc_binding * * CPU binding information. */ struct pmc_binding { int pb_bound; /* is bound? */ int pb_cpu; /* if so, to which CPU */ }; struct pmc_mdep; /* * struct pmc_classdep * * PMC class-dependent operations. 
*/ struct pmc_classdep { uint32_t pcd_caps; /* class capabilities */ enum pmc_class pcd_class; /* class id */ int pcd_num; /* number of PMCs */ int pcd_ri; /* row index of the first PMC in class */ int pcd_width; /* width of the PMC */ /* configuring/reading/writing the hardware PMCs */ int (*pcd_config_pmc)(int _cpu, int _ri, struct pmc *_pm); int (*pcd_get_config)(int _cpu, int _ri, struct pmc **_ppm); int (*pcd_read_pmc)(int _cpu, int _ri, pmc_value_t *_value); int (*pcd_write_pmc)(int _cpu, int _ri, pmc_value_t _value); /* pmc allocation/release */ int (*pcd_allocate_pmc)(int _cpu, int _ri, struct pmc *_t, const struct pmc_op_pmcallocate *_a); int (*pcd_release_pmc)(int _cpu, int _ri, struct pmc *_pm); /* starting and stopping PMCs */ int (*pcd_start_pmc)(int _cpu, int _ri); int (*pcd_stop_pmc)(int _cpu, int _ri); /* description */ int (*pcd_describe)(int _cpu, int _ri, struct pmc_info *_pi, struct pmc **_ppmc); /* class-dependent initialization & finalization */ int (*pcd_pcpu_init)(struct pmc_mdep *_md, int _cpu); int (*pcd_pcpu_fini)(struct pmc_mdep *_md, int _cpu); /* machine-specific interface */ int (*pcd_get_msr)(int _ri, uint32_t *_msr); }; /* * struct pmc_mdep * * Machine dependent bits needed per CPU type. */ struct pmc_mdep { uint32_t pmd_cputype; /* from enum pmc_cputype */ uint32_t pmd_npmc; /* number of PMCs per CPU */ uint32_t pmd_nclass; /* number of PMC classes present */ /* * Machine dependent methods. */ /* per-cpu initialization and finalization */ int (*pmd_pcpu_init)(struct pmc_mdep *_md, int _cpu); int (*pmd_pcpu_fini)(struct pmc_mdep *_md, int _cpu); /* thread context switch in/out */ int (*pmd_switch_in)(struct pmc_cpu *_p, struct pmc_process *_pp); int (*pmd_switch_out)(struct pmc_cpu *_p, struct pmc_process *_pp); /* handle a PMC interrupt */ int (*pmd_intr)(struct trapframe *_tf); /* * PMC class dependent information. */ struct pmc_classdep pmd_classdep[]; }; /* * Per-CPU state. This is an array of 'mp_ncpu' pointers * to struct pmc_cpu descriptors. 
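/*
 * Editor's note -- a purely hypothetical sketch, unrelated to any real
 * in-tree driver: a class driver typically fills one pmd_classdep[] slot of
 * its pmc_mdep with the class capabilities and method table, using the
 * function-pointer signatures declared above.
 */
static int example_read_pmc(int cpu, int ri, pmc_value_t *v);
static int example_write_pmc(int cpu, int ri, pmc_value_t v);
static int example_start_pmc(int cpu, int ri);
static int example_stop_pmc(int cpu, int ri);

static void
example_classdep_init(struct pmc_mdep *md, int ri)
{
	struct pmc_classdep *pcd;

	pcd = &md->pmd_classdep[0];	/* slot chosen by the MD code */
	pcd->pcd_caps  = PMC_CAP_READ | PMC_CAP_WRITE | PMC_CAP_INTERRUPT;
	pcd->pcd_class = PMC_CLASS_SOFT;	/* stand-in class id */
	pcd->pcd_num   = 2;			/* PMCs in this class */
	pcd->pcd_ri    = ri;			/* first row index */
	pcd->pcd_width = 64;
	pcd->pcd_read_pmc  = example_read_pmc;
	pcd->pcd_write_pmc = example_write_pmc;
	pcd->pcd_start_pmc = example_start_pmc;
	pcd->pcd_stop_pmc  = example_stop_pmc;
}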
*/ extern struct pmc_cpu **pmc_pcpu; /* driver statistics */ extern struct pmc_driverstats pmc_stats; #if defined(HWPMC_DEBUG) #include /* debug flags, major flag groups */ struct pmc_debugflags { int pdb_CPU; int pdb_CSW; int pdb_LOG; int pdb_MDP; int pdb_MOD; int pdb_OWN; int pdb_PMC; int pdb_PRC; int pdb_SAM; }; extern struct pmc_debugflags pmc_debugflags; #define KTR_PMC KTR_SUBSYS #define PMC_DEBUG_STRSIZE 128 #define PMC_DEBUG_DEFAULT_FLAGS { 0, 0, 0, 0, 0, 0, 0, 0, 0 } #define PMCDBG0(M, N, L, F) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR0(KTR_PMC, #M ":" #N ":" #L ": " F); \ } while (0) #define PMCDBG1(M, N, L, F, p1) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR1(KTR_PMC, #M ":" #N ":" #L ": " F, p1); \ } while (0) #define PMCDBG2(M, N, L, F, p1, p2) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR2(KTR_PMC, #M ":" #N ":" #L ": " F, p1, p2); \ } while (0) #define PMCDBG3(M, N, L, F, p1, p2, p3) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR3(KTR_PMC, #M ":" #N ":" #L ": " F, p1, p2, p3); \ } while (0) #define PMCDBG4(M, N, L, F, p1, p2, p3, p4) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR4(KTR_PMC, #M ":" #N ":" #L ": " F, p1, p2, p3, p4);\ } while (0) #define PMCDBG5(M, N, L, F, p1, p2, p3, p4, p5) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR5(KTR_PMC, #M ":" #N ":" #L ": " F, p1, p2, p3, p4, \ p5); \ } while (0) #define PMCDBG6(M, N, L, F, p1, p2, p3, p4, p5, p6) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR6(KTR_PMC, #M ":" #N ":" #L ": " F, p1, p2, p3, p4, \ p5, p6); \ } while (0) /* Major numbers */ #define PMC_DEBUG_MAJ_CPU 0 /* cpu switches */ #define PMC_DEBUG_MAJ_CSW 1 /* context switches */ #define PMC_DEBUG_MAJ_LOG 2 /* logging */ #define PMC_DEBUG_MAJ_MDP 3 /* machine dependent */ #define PMC_DEBUG_MAJ_MOD 4 /* misc module infrastructure */ #define PMC_DEBUG_MAJ_OWN 5 /* owner */ #define PMC_DEBUG_MAJ_PMC 6 /* pmc management */ #define PMC_DEBUG_MAJ_PRC 7 /* processes */ #define PMC_DEBUG_MAJ_SAM 8 /* sampling */ /* Minor numbers */ /* Common (8 bits) */ #define PMC_DEBUG_MIN_ALL 0 /* allocation */ #define PMC_DEBUG_MIN_REL 1 /* release */ #define PMC_DEBUG_MIN_OPS 2 /* ops: start, stop, ... 
*/ #define PMC_DEBUG_MIN_INI 3 /* init */ #define PMC_DEBUG_MIN_FND 4 /* find */ /* MODULE */ #define PMC_DEBUG_MIN_PMH 14 /* pmc_hook */ #define PMC_DEBUG_MIN_PMS 15 /* pmc_syscall */ /* OWN */ #define PMC_DEBUG_MIN_ORM 8 /* owner remove */ #define PMC_DEBUG_MIN_OMR 9 /* owner maybe remove */ /* PROCESSES */ #define PMC_DEBUG_MIN_TLK 8 /* link target */ #define PMC_DEBUG_MIN_TUL 9 /* unlink target */ #define PMC_DEBUG_MIN_EXT 10 /* process exit */ #define PMC_DEBUG_MIN_EXC 11 /* process exec */ #define PMC_DEBUG_MIN_FRK 12 /* process fork */ #define PMC_DEBUG_MIN_ATT 13 /* attach/detach */ #define PMC_DEBUG_MIN_SIG 14 /* signalling */ /* CONTEXT SWITCHES */ #define PMC_DEBUG_MIN_SWI 8 /* switch in */ #define PMC_DEBUG_MIN_SWO 9 /* switch out */ /* PMC */ #define PMC_DEBUG_MIN_REG 8 /* pmc register */ #define PMC_DEBUG_MIN_ALR 9 /* allocate row */ /* MACHINE DEPENDENT LAYER */ #define PMC_DEBUG_MIN_REA 8 /* read */ #define PMC_DEBUG_MIN_WRI 9 /* write */ #define PMC_DEBUG_MIN_CFG 10 /* config */ #define PMC_DEBUG_MIN_STA 11 /* start */ #define PMC_DEBUG_MIN_STO 12 /* stop */ #define PMC_DEBUG_MIN_INT 13 /* interrupts */ /* CPU */ #define PMC_DEBUG_MIN_BND 8 /* bind */ #define PMC_DEBUG_MIN_SEL 9 /* select */ /* LOG */ #define PMC_DEBUG_MIN_GTB 8 /* get buf */ #define PMC_DEBUG_MIN_SIO 9 /* schedule i/o */ #define PMC_DEBUG_MIN_FLS 10 /* flush */ #define PMC_DEBUG_MIN_SAM 11 /* sample */ #define PMC_DEBUG_MIN_CLO 12 /* close */ #else #define PMCDBG0(M, N, L, F) /* nothing */ #define PMCDBG1(M, N, L, F, p1) #define PMCDBG2(M, N, L, F, p1, p2) #define PMCDBG3(M, N, L, F, p1, p2, p3) #define PMCDBG4(M, N, L, F, p1, p2, p3, p4) #define PMCDBG5(M, N, L, F, p1, p2, p3, p4, p5) #define PMCDBG6(M, N, L, F, p1, p2, p3, p4, p5, p6) #endif /* declare a dedicated memory pool */ MALLOC_DECLARE(M_PMC); /* * Functions */ struct pmc_mdep *pmc_md_initialize(void); /* MD init function */ void pmc_md_finalize(struct pmc_mdep *_md); /* MD fini function */ int pmc_getrowdisp(int _ri); int pmc_process_interrupt(int _ring, struct pmc *_pm, struct trapframe *_tf); int pmc_save_kernel_callchain(uintptr_t *_cc, int _maxsamples, struct trapframe *_tf); int pmc_save_user_callchain(uintptr_t *_cc, int _maxsamples, struct trapframe *_tf); struct pmc_mdep *pmc_mdep_alloc(int nclasses); void pmc_mdep_free(struct pmc_mdep *md); uint64_t pmc_rdtsc(void); #endif /* _KERNEL */ #endif /* _SYS_PMC_H_ */ Index: projects/clang900-import/sys/sys/sockio.h =================================================================== --- projects/clang900-import/sys/sys/sockio.h (revision 352536) +++ projects/clang900-import/sys/sys/sockio.h (revision 352537) @@ -1,146 +1,148 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sockio.h 8.1 (Berkeley) 3/28/94 * $FreeBSD$ */ #ifndef _SYS_SOCKIO_H_ #define _SYS_SOCKIO_H_ #include /* Socket ioctl's. */ #define SIOCSHIWAT _IOW('s', 0, int) /* set high watermark */ #define SIOCGHIWAT _IOR('s', 1, int) /* get high watermark */ #define SIOCSLOWAT _IOW('s', 2, int) /* set low watermark */ #define SIOCGLOWAT _IOR('s', 3, int) /* get low watermark */ #define SIOCATMARK _IOR('s', 7, int) /* at oob mark? */ #define SIOCSPGRP _IOW('s', 8, int) /* set process group */ #define SIOCGPGRP _IOR('s', 9, int) /* get process group */ /* SIOCADDRT _IOW('r', 10, struct ortentry) 4.3BSD */ /* SIOCDELRT _IOW('r', 11, struct ortentry) 4.3BSD */ #define SIOCGETVIFCNT _IOWR('r', 15, struct sioc_vif_req)/* get vif pkt cnt */ #define SIOCGETSGCNT _IOWR('r', 16, struct sioc_sg_req) /* get s,g pkt cnt */ #define SIOCSIFADDR _IOW('i', 12, struct ifreq) /* set ifnet address */ /* OSIOCGIFADDR _IOWR('i', 13, struct ifreq) 4.3BSD */ #define SIOCGIFADDR _IOWR('i', 33, struct ifreq) /* get ifnet address */ #define SIOCSIFDSTADDR _IOW('i', 14, struct ifreq) /* set p-p address */ /* OSIOCGIFDSTADDR _IOWR('i', 15, struct ifreq) 4.3BSD */ #define SIOCGIFDSTADDR _IOWR('i', 34, struct ifreq) /* get p-p address */ #define SIOCSIFFLAGS _IOW('i', 16, struct ifreq) /* set ifnet flags */ #define SIOCGIFFLAGS _IOWR('i', 17, struct ifreq) /* get ifnet flags */ /* OSIOCGIFBRDADDR _IOWR('i', 18, struct ifreq) 4.3BSD */ #define SIOCGIFBRDADDR _IOWR('i', 35, struct ifreq) /* get broadcast addr */ #define SIOCSIFBRDADDR _IOW('i', 19, struct ifreq) /* set broadcast addr */ /* OSIOCGIFCONF _IOWR('i', 20, struct ifconf) 4.3BSD */ #define SIOCGIFCONF _IOWR('i', 36, struct ifconf) /* get ifnet list */ /* OSIOCGIFNETMASK _IOWR('i', 21, struct ifreq) 4.3BSD */ #define SIOCGIFNETMASK _IOWR('i', 37, struct ifreq) /* get net addr mask */ #define SIOCSIFNETMASK _IOW('i', 22, struct ifreq) /* set net addr mask */ #define SIOCGIFMETRIC _IOWR('i', 23, struct ifreq) /* get IF metric */ #define SIOCSIFMETRIC _IOW('i', 24, struct ifreq) /* set IF metric */ #define SIOCDIFADDR _IOW('i', 25, struct ifreq) /* delete IF addr */ #define OSIOCAIFADDR _IOW('i', 26, struct oifaliasreq) /* FreeBSD 9.x */ /* SIOCALIFADDR _IOW('i', 27, struct if_laddrreq) KAME */ /* SIOCGLIFADDR _IOWR('i', 28, struct if_laddrreq) KAME */ /* SIOCDLIFADDR _IOW('i', 29, struct if_laddrreq) KAME */ #define SIOCSIFCAP _IOW('i', 30, struct ifreq) /* set IF features */ #define SIOCGIFCAP _IOWR('i', 31, struct ifreq) /* get IF features */ #define SIOCGIFINDEX _IOWR('i', 32, struct ifreq) /* get IF index */ #define SIOCGIFMAC _IOWR('i', 38, struct ifreq) /* get IF MAC label */ #define SIOCSIFMAC _IOW('i', 39, struct ifreq) /* set IF MAC label */ #define SIOCSIFNAME _IOW('i', 40, struct ifreq) /* set IF name */ 
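/*
 * Editor's note -- illustrative only: the SIOC* requests in this list are
 * issued with ioctl(2) on an ordinary socket, passing a struct ifreq (or
 * the other structure noted for the request).  Minimal sketch reading an
 * interface MTU with SIOCGIFMTU:
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>

static int
demo_get_mtu(const char *ifname, int *mtu)
{
	struct ifreq ifr;
	int s, error;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
		return (-1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
	error = ioctl(s, SIOCGIFMTU, &ifr);
	if (error == 0)
		*mtu = ifr.ifr_mtu;
	close(s);
	return (error);
}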
#define SIOCSIFDESCR _IOW('i', 41, struct ifreq) /* set ifnet descr */ #define SIOCGIFDESCR _IOWR('i', 42, struct ifreq) /* get ifnet descr */ #define SIOCAIFADDR _IOW('i', 43, struct ifaliasreq)/* add/chg IF alias */ #define SIOCADDMULTI _IOW('i', 49, struct ifreq) /* add m'cast addr */ #define SIOCDELMULTI _IOW('i', 50, struct ifreq) /* del m'cast addr */ #define SIOCGIFMTU _IOWR('i', 51, struct ifreq) /* get IF mtu */ #define SIOCSIFMTU _IOW('i', 52, struct ifreq) /* set IF mtu */ #define SIOCGIFPHYS _IOWR('i', 53, struct ifreq) /* get IF wire */ #define SIOCSIFPHYS _IOW('i', 54, struct ifreq) /* set IF wire */ #define SIOCSIFMEDIA _IOWR('i', 55, struct ifreq) /* set net media */ #define SIOCGIFMEDIA _IOWR('i', 56, struct ifmediareq) /* get net media */ #define SIOCSIFGENERIC _IOW('i', 57, struct ifreq) /* generic IF set op */ #define SIOCGIFGENERIC _IOWR('i', 58, struct ifreq) /* generic IF get op */ #define SIOCGIFSTATUS _IOWR('i', 59, struct ifstat) /* get IF status */ #define SIOCSIFLLADDR _IOW('i', 60, struct ifreq) /* set linklevel addr */ #define SIOCGI2C _IOWR('i', 61, struct ifreq) /* get I2C data */ #define SIOCGHWADDR _IOWR('i', 62, struct ifreq) /* get hardware lladdr */ #define SIOCSIFPHYADDR _IOW('i', 70, struct ifaliasreq) /* set gif address */ #define SIOCGIFPSRCADDR _IOWR('i', 71, struct ifreq) /* get gif psrc addr */ #define SIOCGIFPDSTADDR _IOWR('i', 72, struct ifreq) /* get gif pdst addr */ #define SIOCDIFPHYADDR _IOW('i', 73, struct ifreq) /* delete gif addrs */ /* SIOCSLIFPHYADDR _IOW('i', 74, struct if_laddrreq) KAME */ /* SIOCGLIFPHYADDR _IOWR('i', 75, struct if_laddrreq) KAME */ #define SIOCGPRIVATE_0 _IOWR('i', 80, struct ifreq) /* device private 0 */ #define SIOCGPRIVATE_1 _IOWR('i', 81, struct ifreq) /* device private 1 */ #define SIOCSIFVNET _IOWR('i', 90, struct ifreq) /* move IF jail/vnet */ #define SIOCSIFRVNET _IOWR('i', 91, struct ifreq) /* reclaim vnet IF */ #define SIOCGIFFIB _IOWR('i', 92, struct ifreq) /* get IF fib */ #define SIOCSIFFIB _IOW('i', 93, struct ifreq) /* set IF fib */ #define SIOCGTUNFIB _IOWR('i', 94, struct ifreq) /* get tunnel fib */ #define SIOCSTUNFIB _IOW('i', 95, struct ifreq) /* set tunnel fib */ #define SIOCSDRVSPEC _IOW('i', 123, struct ifdrv) /* set driver-specific parameters */ #define SIOCGDRVSPEC _IOWR('i', 123, struct ifdrv) /* get driver-specific parameters */ #define SIOCIFCREATE _IOWR('i', 122, struct ifreq) /* create clone if */ #define SIOCIFCREATE2 _IOWR('i', 124, struct ifreq) /* create clone if */ #define SIOCIFDESTROY _IOW('i', 121, struct ifreq) /* destroy clone if */ #define SIOCIFGCLONERS _IOWR('i', 120, struct if_clonereq) /* get cloners */ #define SIOCAIFGROUP _IOW('i', 135, struct ifgroupreq) /* add an ifgroup */ #define SIOCGIFGROUP _IOWR('i', 136, struct ifgroupreq) /* get ifgroups */ #define SIOCDIFGROUP _IOW('i', 137, struct ifgroupreq) /* delete ifgroup */ #define SIOCGIFGMEMB _IOWR('i', 138, struct ifgroupreq) /* get members */ #define SIOCGIFXMEDIA _IOWR('i', 139, struct ifmediareq) /* get net xmedia */ #define SIOCGIFRSSKEY _IOWR('i', 150, struct ifrsskey)/* get RSS key */ #define SIOCGIFRSSHASH _IOWR('i', 151, struct ifrsshash)/* get the current RSS type/func settings */ #define SIOCGLANPCP _IOWR('i', 152, struct ifreq) /* Get (V)LAN PCP */ #define SIOCSLANPCP _IOW('i', 153, struct ifreq) /* Set (V)LAN PCP */ +#define SIOCGIFDOWNREASON _IOWR('i', 154, struct ifdownreason) + #endif /* !_SYS_SOCKIO_H_ */ Index: projects/clang900-import/sys/sys/sysctl.h 
=================================================================== --- projects/clang900-import/sys/sys/sysctl.h (revision 352536) +++ projects/clang900-import/sys/sys/sysctl.h (revision 352537) @@ -1,1148 +1,1159 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sysctl.h 8.1 (Berkeley) 6/2/93 * $FreeBSD$ */ #ifndef _SYS_SYSCTL_H_ #define _SYS_SYSCTL_H_ #ifdef _KERNEL #include #endif struct thread; /* * Definitions for sysctl call. The sysctl call uses a hierarchical name * for objects that can be examined or modified. The name is expressed as * a sequence of integers. Like a file path name, the meaning of each * component depends on its place in the hierarchy. The top-level and kern * identifiers are defined here, and other identifiers are defined in the * respective subsystem header files. */ #define CTL_MAXNAME 24 /* largest number of components supported */ /* * Each subsystem defined by sysctl defines a list of variables * for that subsystem. Each name is either a node with further * levels defined below it, or it is a leaf of some particular * type given below. Each sysctl level defines a set of name/type * pairs to be used by sysctl(8) in manipulating the subsystem. 
*/ struct ctlname { char *ctl_name; /* subsystem name */ int ctl_type; /* type of name */ }; #define CTLTYPE 0xf /* mask for the type */ #define CTLTYPE_NODE 1 /* name is a node */ #define CTLTYPE_INT 2 /* name describes an integer */ #define CTLTYPE_STRING 3 /* name describes a string */ #define CTLTYPE_S64 4 /* name describes a signed 64-bit number */ #define CTLTYPE_OPAQUE 5 /* name describes a structure */ #define CTLTYPE_STRUCT CTLTYPE_OPAQUE /* name describes a structure */ #define CTLTYPE_UINT 6 /* name describes an unsigned integer */ #define CTLTYPE_LONG 7 /* name describes a long */ #define CTLTYPE_ULONG 8 /* name describes an unsigned long */ #define CTLTYPE_U64 9 /* name describes an unsigned 64-bit number */ #define CTLTYPE_U8 0xa /* name describes an unsigned 8-bit number */ #define CTLTYPE_U16 0xb /* name describes an unsigned 16-bit number */ #define CTLTYPE_S8 0xc /* name describes a signed 8-bit number */ #define CTLTYPE_S16 0xd /* name describes a signed 16-bit number */ #define CTLTYPE_S32 0xe /* name describes a signed 32-bit number */ #define CTLTYPE_U32 0xf /* name describes an unsigned 32-bit number */ #define CTLFLAG_RD 0x80000000 /* Allow reads of variable */ #define CTLFLAG_WR 0x40000000 /* Allow writes to the variable */ #define CTLFLAG_RW (CTLFLAG_RD|CTLFLAG_WR) #define CTLFLAG_DORMANT 0x20000000 /* This sysctl is not active yet */ #define CTLFLAG_ANYBODY 0x10000000 /* All users can set this var */ #define CTLFLAG_SECURE 0x08000000 /* Permit set only if securelevel<=0 */ #define CTLFLAG_PRISON 0x04000000 /* Prisoned roots can fiddle */ #define CTLFLAG_DYN 0x02000000 /* Dynamic oid - can be freed */ #define CTLFLAG_SKIP 0x01000000 /* Skip this sysctl when listing */ #define CTLMASK_SECURE 0x00F00000 /* Secure level */ #define CTLFLAG_TUN 0x00080000 /* Default value is loaded from getenv() */ #define CTLFLAG_RDTUN (CTLFLAG_RD|CTLFLAG_TUN) #define CTLFLAG_RWTUN (CTLFLAG_RW|CTLFLAG_TUN) #define CTLFLAG_MPSAFE 0x00040000 /* Handler is MP safe */ #define CTLFLAG_VNET 0x00020000 /* Prisons with vnet can fiddle */ #define CTLFLAG_DYING 0x00010000 /* Oid is being removed */ #define CTLFLAG_CAPRD 0x00008000 /* Can be read in capability mode */ #define CTLFLAG_CAPWR 0x00004000 /* Can be written in capability mode */ #define CTLFLAG_STATS 0x00002000 /* Statistics, not a tuneable */ #define CTLFLAG_NOFETCH 0x00001000 /* Don't fetch tunable from getenv() */ #define CTLFLAG_CAPRW (CTLFLAG_CAPRD|CTLFLAG_CAPWR) /* * Secure level. Note that CTLFLAG_SECURE == CTLFLAG_SECURE1. * * Secure when the securelevel is raised to at least N. */ #define CTLSHIFT_SECURE 20 #define CTLFLAG_SECURE1 (CTLFLAG_SECURE | (0 << CTLSHIFT_SECURE)) #define CTLFLAG_SECURE2 (CTLFLAG_SECURE | (1 << CTLSHIFT_SECURE)) #define CTLFLAG_SECURE3 (CTLFLAG_SECURE | (2 << CTLSHIFT_SECURE)) /* * USE THIS instead of a hardwired number from the categories below * to get dynamically assigned sysctl entries using the linker-set * technology. This is the way nearly all new sysctl variables should * be implemented. * e.g. SYSCTL_INT(_parent, OID_AUTO, name, CTLFLAG_RW, &variable, 0, ""); */ #define OID_AUTO (-1) /* * The starting number for dynamically-assigned entries. WARNING! * ALL static sysctl entries should have numbers LESS than this! 
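/*
 * Minimal illustrative sketch, not part of this change: a static integer
 * sysctl declared with OID_AUTO and the flag macros above.  The parent
 * node _debug exists in the tree; the variable and oid name are
 * assumptions.  CTLFLAG_RWTUN makes the knob writable at run time and
 * also loadable from loader.conf as a tunable.
 */
static int example_verbose = 0;
SYSCTL_INT(_debug, OID_AUTO, example_verbose, CTLFLAG_RWTUN,
    &example_verbose, 0, "Example knob, settable as debug.example_verbose");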
*/ #define CTL_AUTO_START 0x100 #ifdef _KERNEL #include #ifdef KLD_MODULE /* XXX allow overspecification of type in external kernel modules */ #define SYSCTL_CT_ASSERT_MASK CTLTYPE #else #define SYSCTL_CT_ASSERT_MASK 0 #endif #define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, \ intmax_t arg2, struct sysctl_req *req /* definitions for sysctl_req 'lock' member */ #define REQ_UNWIRED 1 #define REQ_WIRED 2 /* definitions for sysctl_req 'flags' member */ #if defined(__aarch64__) || defined(__amd64__) || defined(__powerpc64__) ||\ (defined(__mips__) && defined(__mips_n64)) #define SCTL_MASK32 1 /* 32 bit emulation */ #endif /* * This describes the access space for a sysctl request. This is needed * so that we can use the interface from the kernel or from user-space. */ struct sysctl_req { struct thread *td; /* used for access checking */ int lock; /* wiring state */ void *oldptr; size_t oldlen; size_t oldidx; int (*oldfunc)(struct sysctl_req *, const void *, size_t); const void *newptr; size_t newlen; size_t newidx; int (*newfunc)(struct sysctl_req *, void *, size_t); size_t validlen; int flags; }; SLIST_HEAD(sysctl_oid_list, sysctl_oid); /* * This describes one "oid" in the MIB tree. Potentially more nodes can * be hidden behind it, expanded by the handler. */ struct sysctl_oid { struct sysctl_oid_list oid_children; struct sysctl_oid_list *oid_parent; SLIST_ENTRY(sysctl_oid) oid_link; int oid_number; u_int oid_kind; void *oid_arg1; intmax_t oid_arg2; const char *oid_name; int (*oid_handler)(SYSCTL_HANDLER_ARGS); const char *oid_fmt; int oid_refcnt; u_int oid_running; const char *oid_descr; const char *oid_label; }; #define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l) #define SYSCTL_OUT(r, p, l) (r->oldfunc)(r, p, l) #define SYSCTL_OUT_STR(r, p) (r->oldfunc)(r, p, strlen(p) + 1) int sysctl_handle_bool(SYSCTL_HANDLER_ARGS); int sysctl_handle_8(SYSCTL_HANDLER_ARGS); int sysctl_handle_16(SYSCTL_HANDLER_ARGS); int sysctl_handle_32(SYSCTL_HANDLER_ARGS); int sysctl_handle_64(SYSCTL_HANDLER_ARGS); int sysctl_handle_int(SYSCTL_HANDLER_ARGS); int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS); int sysctl_handle_long(SYSCTL_HANDLER_ARGS); int sysctl_handle_string(SYSCTL_HANDLER_ARGS); int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS); int sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS); int sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS); int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS); int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS); int sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS); int sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS); int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS); int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS); int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS); int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS); /* * These functions are used to add/remove an oid from the mib. */ void sysctl_register_oid(struct sysctl_oid *oidp); void sysctl_register_disabled_oid(struct sysctl_oid *oidp); void sysctl_enable_oid(struct sysctl_oid *oidp); void sysctl_unregister_oid(struct sysctl_oid *oidp); /* Declare a static oid to allow child oids to be added to it. */ #define SYSCTL_DECL(name) \ extern struct sysctl_oid sysctl__##name /* Hide these in macros. */ #define SYSCTL_CHILDREN(oid_ptr) (&(oid_ptr)->oid_children) #define SYSCTL_PARENT(oid_ptr) \ (((oid_ptr)->oid_parent != &sysctl__children) ? 
\ __containerof((oid_ptr)->oid_parent, struct sysctl_oid, \ oid_children) : (struct sysctl_oid *)NULL) #define SYSCTL_STATIC_CHILDREN(oid_name) (&sysctl__##oid_name.oid_children) /* === Structs and macros related to context handling. === */ /* All dynamically created sysctls can be tracked in a context list. */ struct sysctl_ctx_entry { struct sysctl_oid *entry; TAILQ_ENTRY(sysctl_ctx_entry) link; }; TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); #define SYSCTL_NODE_CHILDREN(parent, name) \ sysctl__##parent##_##name.oid_children #ifndef NO_SYSCTL_DESCR #define __DESCR(d) d #else #define __DESCR(d) "" #endif /* This macro is only for internal use */ #define SYSCTL_OID_RAW(id, parent_child_head, nbr, name, kind, a1, a2, handler, fmt, descr, label) \ struct sysctl_oid id = { \ .oid_parent = (parent_child_head), \ .oid_children = SLIST_HEAD_INITIALIZER(&id.oid_children), \ .oid_number = (nbr), \ .oid_kind = (kind), \ .oid_arg1 = (a1), \ .oid_arg2 = (a2), \ .oid_name = (name), \ .oid_handler = (handler), \ .oid_fmt = (fmt), \ .oid_descr = __DESCR(descr), \ .oid_label = (label), \ }; \ DATA_SET(sysctl_set, id) /* This constructs a static "raw" MIB oid. */ #define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ SYSCTL_OID_WITH_LABEL(parent, nbr, name, kind, a1, a2, \ handler, fmt, descr, NULL) #define SYSCTL_OID_WITH_LABEL(parent, nbr, name, kind, a1, a2, handler, fmt, descr, label) \ static SYSCTL_OID_RAW(sysctl__##parent##_##name, \ SYSCTL_CHILDREN(&sysctl__##parent), \ nbr, #name, kind, a1, a2, handler, fmt, descr, label) /* This constructs a global "raw" MIB oid. */ #define SYSCTL_OID_GLOBAL(parent, nbr, name, kind, a1, a2, handler, fmt, descr, label) \ SYSCTL_OID_RAW(sysctl__##parent##_##name, \ SYSCTL_CHILDREN(&sysctl__##parent), \ nbr, #name, kind, a1, a2, handler, fmt, descr, label) #define SYSCTL_ADD_OID(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ sysctl_add_oid(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, __DESCR(descr), NULL) /* This constructs a root node from which other nodes can hang. */ #define SYSCTL_ROOT_NODE(nbr, name, access, handler, descr) \ SYSCTL_OID_RAW(sysctl___##name, &sysctl__children, \ nbr, #name, CTLTYPE_NODE|(access), NULL, 0, \ handler, "N", descr, NULL); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE) /* This constructs a node from which other oids can hang. 
*/ #define SYSCTL_NODE(parent, nbr, name, access, handler, descr) \ SYSCTL_NODE_WITH_LABEL(parent, nbr, name, access, handler, descr, NULL) #define SYSCTL_NODE_WITH_LABEL(parent, nbr, name, access, handler, descr, label) \ SYSCTL_OID_GLOBAL(parent, nbr, name, CTLTYPE_NODE|(access), \ NULL, 0, handler, "N", descr, label); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE) #define SYSCTL_ADD_NODE(ctx, parent, nbr, name, access, handler, descr) \ SYSCTL_ADD_NODE_WITH_LABEL(ctx, parent, nbr, name, access, \ handler, descr, NULL) #define SYSCTL_ADD_NODE_WITH_LABEL(ctx, parent, nbr, name, access, handler, descr, label) \ ({ \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE); \ sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_NODE|(access), \ NULL, 0, handler, "N", __DESCR(descr), label); \ }) #define SYSCTL_ADD_ROOT_NODE(ctx, nbr, name, access, handler, descr) \ ({ \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE); \ sysctl_add_oid(ctx, &sysctl__children, nbr, name, \ CTLTYPE_NODE|(access), \ NULL, 0, handler, "N", __DESCR(descr), NULL); \ }) /* Oid for a string. len can be 0 to indicate '\0' termination. */ #define SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access), \ arg, len, sysctl_handle_string, "A", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING) #define SYSCTL_ADD_STRING(ctx, parent, nbr, name, access, arg, len, descr) \ ({ \ char *__arg = (arg); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING); \ sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING|(access), \ __arg, len, sysctl_handle_string, "A", __DESCR(descr), \ NULL); \ }) /* Oid for a constant '\0' terminated string. */ #define SYSCTL_CONST_STRING(parent, nbr, name, access, arg, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access), \ __DECONST(char *, arg), 0, sysctl_handle_string, "A", descr); \ CTASSERT(!(access & CTLFLAG_WR)); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING) #define SYSCTL_ADD_CONST_STRING(ctx, parent, nbr, name, access, arg, descr) \ ({ \ char *__arg = __DECONST(char *, arg); \ CTASSERT(!(access & CTLFLAG_WR)); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING); \ sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING|(access), \ __arg, 0, sysctl_handle_string, "A", __DESCR(descr), \ NULL); \ }) /* Oid for a bool. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_BOOL_PTR ((bool *)NULL) #define SYSCTL_BOOL(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_bool, "CU", descr); \ CTASSERT(((access) & CTLTYPE) == 0 && \ sizeof(bool) == sizeof(*(ptr))) #define SYSCTL_ADD_BOOL(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ bool *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_bool, "CU", __DESCR(descr), \ NULL); \ }) /* Oid for a signed 8-bit int. If ptr is NULL, val is returned. 
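/*
 * Illustrative sketch, not part of this change: creating a node with
 * SYSCTL_NODE() and hanging a read-only string oid under it with
 * SYSCTL_STRING() from above.  All names are assumptions for the example.
 */
SYSCTL_NODE(_hw, OID_AUTO, example, CTLFLAG_RD, NULL,
    "Example device parameters");
static char example_version[] = "1.0";
SYSCTL_STRING(_hw_example, OID_AUTO, version, CTLFLAG_RD,
    example_version, 0, "Example firmware revision string");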
*/ #define SYSCTL_NULL_S8_PTR ((int8_t *)NULL) #define SYSCTL_S8(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S8 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_8, "C", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8) && \ sizeof(int8_t) == sizeof(*(ptr))) #define SYSCTL_ADD_S8(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ int8_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S8 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_8, "C", __DESCR(descr), NULL); \ }) /* Oid for an unsigned 8-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_U8_PTR ((uint8_t *)NULL) #define SYSCTL_U8(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_8, "CU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8) && \ sizeof(uint8_t) == sizeof(*(ptr))) #define SYSCTL_ADD_U8(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ uint8_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_8, "CU", __DESCR(descr), NULL); \ }) /* Oid for a signed 16-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_S16_PTR ((int16_t *)NULL) #define SYSCTL_S16(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S16 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_16, "S", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16) && \ sizeof(int16_t) == sizeof(*(ptr))) #define SYSCTL_ADD_S16(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ int16_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S16 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_16, "S", __DESCR(descr), NULL); \ }) /* Oid for an unsigned 16-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_U16_PTR ((uint16_t *)NULL) #define SYSCTL_U16(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U16 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_16, "SU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16) && \ sizeof(uint16_t) == sizeof(*(ptr))) #define SYSCTL_ADD_U16(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ uint16_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U16 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_16, "SU", __DESCR(descr), NULL); \ }) /* Oid for a signed 32-bit int. If ptr is NULL, val is returned. 
*/ #define SYSCTL_NULL_S32_PTR ((int32_t *)NULL) #define SYSCTL_S32(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S32 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_32, "I", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32) && \ sizeof(int32_t) == sizeof(*(ptr))) #define SYSCTL_ADD_S32(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ int32_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S32 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_32, "I", __DESCR(descr), NULL); \ }) /* Oid for an unsigned 32-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_U32_PTR ((uint32_t *)NULL) #define SYSCTL_U32(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U32 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_32, "IU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32) && \ sizeof(uint32_t) == sizeof(*(ptr))) #define SYSCTL_ADD_U32(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ uint32_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U32 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_32, "IU", __DESCR(descr), NULL); \ }) /* Oid for a signed 64-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_S64_PTR ((int64_t *)NULL) #define SYSCTL_S64(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_64, "Q", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \ sizeof(int64_t) == sizeof(*(ptr))) #define SYSCTL_ADD_S64(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ int64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_64, "Q", __DESCR(descr), NULL); \ }) /* Oid for an unsigned 64-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_U64_PTR ((uint64_t *)NULL) #define SYSCTL_U64(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_64, "QU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \ sizeof(uint64_t) == sizeof(*(ptr))) #define SYSCTL_ADD_U64(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ uint64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_64, "QU", __DESCR(descr), NULL); \ }) /* Oid for an int. If ptr is SYSCTL_NULL_INT_PTR, val is returned. 
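/*
 * Illustrative sketch, not part of this change: a read-only 64-bit
 * statistic exported with SYSCTL_U64() from above; the variable and oid
 * name are assumptions.  CTLFLAG_STATS marks it as a statistic rather
 * than a tuneable.
 */
static uint64_t example_rx_bytes;
SYSCTL_U64(_debug, OID_AUTO, example_rx_bytes,
    CTLFLAG_RD | CTLFLAG_STATS, &example_rx_bytes, 0,
    "Bytes received on the example path");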
*/ #define SYSCTL_NULL_INT_PTR ((int *)NULL) #define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_INT_WITH_LABEL(parent, nbr, name, access, ptr, val, descr, NULL) #define SYSCTL_INT_WITH_LABEL(parent, nbr, name, access, ptr, val, descr, label) \ SYSCTL_OID_WITH_LABEL(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_int, "I", descr, label); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) && \ sizeof(int) == sizeof(*(ptr))) #define SYSCTL_ADD_INT(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ int *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_int, "I", __DESCR(descr), NULL); \ }) /* Oid for an unsigned int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_UINT_PTR ((unsigned *)NULL) #define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_UINT | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_int, "IU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT) && \ sizeof(unsigned) == sizeof(*(ptr))) #define SYSCTL_ADD_UINT(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ unsigned *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_UINT | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_int, "IU", __DESCR(descr), NULL); \ }) /* Oid for a long. The pointer must be non NULL. */ #define SYSCTL_NULL_LONG_PTR ((long *)NULL) #define SYSCTL_LONG(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_LONG | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_long, "L", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG) && \ sizeof(long) == sizeof(*(ptr))) #define SYSCTL_ADD_LONG(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ long *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_LONG | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_long, "L", __DESCR(descr), NULL); \ }) /* Oid for an unsigned long. The pointer must be non NULL. */ #define SYSCTL_NULL_ULONG_PTR ((unsigned long *)NULL) #define SYSCTL_ULONG(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_ULONG | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_long, "LU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG) && \ sizeof(unsigned long) == sizeof(*(ptr))) #define SYSCTL_ADD_ULONG(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ unsigned long *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_ULONG | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_long, "LU", __DESCR(descr), NULL); \ }) /* Oid for a quad. The pointer must be non NULL. 
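/*
 * Illustrative sketch, not part of this change: dynamic oids created
 * through a sysctl context (the SYSCTL_ADD_* macros above) are all torn
 * down again by a single sysctl_ctx_free(), which is the usual pattern
 * for loadable code.  The parent (_debug), names and variables here are
 * assumptions.
 */
static struct sysctl_ctx_list example_ctx;
static int example_counter;

static void
example_sysctl_attach(void)
{
	sysctl_ctx_init(&example_ctx);
	(void)SYSCTL_ADD_INT(&example_ctx, SYSCTL_STATIC_CHILDREN(_debug),
	    OID_AUTO, "example_counter", CTLFLAG_RD,
	    &example_counter, 0, "Example dynamically created oid");
}

static void
example_sysctl_detach(void)
{
	(void)sysctl_ctx_free(&example_ctx);	/* removes the oids added above */
}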
*/ #define SYSCTL_NULL_QUAD_PTR ((int64_t *)NULL) #define SYSCTL_QUAD(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_64, "Q", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \ sizeof(int64_t) == sizeof(*(ptr))) #define SYSCTL_ADD_QUAD(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ int64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_64, "Q", __DESCR(descr), NULL); \ }) #define SYSCTL_NULL_UQUAD_PTR ((uint64_t *)NULL) #define SYSCTL_UQUAD(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_64, "QU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \ sizeof(uint64_t) == sizeof(*(ptr))) #define SYSCTL_ADD_UQUAD(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ uint64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_64, "QU", __DESCR(descr), NULL); \ }) /* Oid for a CPU dependent variable */ #define SYSCTL_ADD_UAUTO(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ struct sysctl_oid *__ret; \ CTASSERT((sizeof(uint64_t) == sizeof(*(ptr)) || \ sizeof(unsigned) == sizeof(*(ptr))) && \ ((access) & CTLTYPE) == 0); \ if (sizeof(uint64_t) == sizeof(*(ptr))) { \ __ret = sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ (ptr), 0, sysctl_handle_64, "QU", \ __DESCR(descr), NULL); \ } else { \ __ret = sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_UINT | CTLFLAG_MPSAFE | (access), \ (ptr), 0, sysctl_handle_int, "IU", \ __DESCR(descr), NULL); \ } \ __ret; \ }) /* Oid for a 64-bit unsigned counter(9). The pointer must be non NULL. */ #define SYSCTL_COUNTER_U64(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ (ptr), 0, sysctl_handle_counter_u64, "QU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \ sizeof(counter_u64_t) == sizeof(*(ptr)) && \ sizeof(uint64_t) == sizeof(**(ptr))) #define SYSCTL_ADD_COUNTER_U64(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ counter_u64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_counter_u64, "QU", __DESCR(descr), \ NULL); \ }) /* Oid for an array of counter(9)s. The pointer and length must be non zero. 
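/*
 * Illustrative sketch, not part of this change: exporting a counter(9)
 * with SYSCTL_COUNTER_U64() from above.  The counter itself would be
 * allocated with counter_u64_alloc(M_WAITOK) (see <sys/counter.h>) at
 * initialization time; the parent and names are assumptions.
 */
static counter_u64_t example_drops;
SYSCTL_COUNTER_U64(_debug, OID_AUTO, example_drops, CTLFLAG_RD,
    &example_drops, "Packets dropped on the example path");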
*/ #define SYSCTL_COUNTER_U64_ARRAY(parent, nbr, name, access, ptr, len, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access), \ (ptr), (len), sysctl_handle_counter_u64_array, "S", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE) && \ sizeof(counter_u64_t) == sizeof(*(ptr)) && \ sizeof(uint64_t) == sizeof(**(ptr))) #define SYSCTL_ADD_COUNTER_U64_ARRAY(ctx, parent, nbr, name, access, \ ptr, len, descr) \ ({ \ counter_u64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access), \ __ptr, len, sysctl_handle_counter_u64_array, "S", \ __DESCR(descr), NULL); \ }) /* Oid for an opaque object. Specified by a pointer and a length. */ #define SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|(access), \ ptr, len, sysctl_handle_opaque, fmt, descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE) #define SYSCTL_ADD_OPAQUE(ctx, parent, nbr, name, access, ptr, len, fmt, descr) \ ({ \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE); \ sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_OPAQUE|(access), \ ptr, len, sysctl_handle_opaque, fmt, __DESCR(descr), NULL); \ }) /* Oid for a struct. Specified by a pointer and a type. */ #define SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|(access), \ ptr, sizeof(struct type), sysctl_handle_opaque, \ "S," #type, descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE) #define SYSCTL_ADD_STRUCT(ctx, parent, nbr, name, access, ptr, type, descr) \ ({ \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE); \ sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_OPAQUE|(access), \ (ptr), sizeof(struct type), \ sysctl_handle_opaque, "S," #type, __DESCR(descr), NULL); \ }) /* Oid for a procedure. Specified by a pointer and an arg. */ #define SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, descr) \ SYSCTL_OID(parent, nbr, name, (access), \ ptr, arg, handler, fmt, descr); \ CTASSERT(((access) & CTLTYPE) != 0) #define SYSCTL_ADD_PROC(ctx, parent, nbr, name, access, ptr, arg, handler, fmt, descr) \ ({ \ CTASSERT(((access) & CTLTYPE) != 0); \ sysctl_add_oid(ctx, parent, nbr, name, (access), \ (ptr), (arg), (handler), (fmt), __DESCR(descr), NULL); \ }) /* Oid to handle limits on uma(9) zone specified by pointer. */ #define SYSCTL_UMA_MAX(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | (access), \ (ptr), 0, sysctl_handle_uma_zone_max, "I", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) #define SYSCTL_ADD_UMA_MAX(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ uma_zone_t __ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_uma_zone_max, "I", __DESCR(descr), \ NULL); \ }) /* Oid to obtain current use of uma(9) zone specified by pointer. 
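/*
 * Illustrative sketch, not part of this change: a SYSCTL_PROC() handler
 * that validates writes before accepting them, wrapping
 * sysctl_handle_int() in the usual way.  The parent, names and the
 * validation rule are assumptions for the example.
 */
static int example_limit = 16;

static int
sysctl_example_limit(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = example_limit;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);		/* read-only access, or copyin failed */
	if (val < 1 || val > 1024)
		return (EINVAL);	/* reject out-of-range writes */
	example_limit = val;
	return (0);
}
SYSCTL_PROC(_debug, OID_AUTO, example_limit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_example_limit, "I", "Example limit with write validation");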
*/ #define SYSCTL_UMA_CUR(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ (ptr), 0, sysctl_handle_uma_zone_cur, "I", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) #define SYSCTL_ADD_UMA_CUR(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ uma_zone_t __ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ __ptr, 0, sysctl_handle_uma_zone_cur, "I", __DESCR(descr), \ NULL); \ }) /* OID expressing a sbintime_t as microseconds */ #define SYSCTL_SBINTIME_USEC(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ (ptr), 0, sysctl_usec_to_sbintime, "Q", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) #define SYSCTL_ADD_SBINTIME_USEC(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ sbintime_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ __ptr, 0, sysctl_usec_to_sbintime, "Q", __DESCR(descr), \ NULL); \ }) /* OID expressing a sbintime_t as milliseconds */ #define SYSCTL_SBINTIME_MSEC(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ (ptr), 0, sysctl_msec_to_sbintime, "Q", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) #define SYSCTL_ADD_SBINTIME_MSEC(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ sbintime_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ __ptr, 0, sysctl_msec_to_sbintime, "Q", __DESCR(descr), \ NULL); \ }) /* OID expressing a struct timeval as seconds */ #define SYSCTL_TIMEVAL_SEC(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ (ptr), 0, sysctl_sec_to_timeval, "I", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) #define SYSCTL_ADD_TIMEVAL_SEC(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ struct timeval *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ __ptr, 0, sysctl_sec_to_timeval, "I", __DESCR(descr), \ NULL); \ }) /* * A macro to generate a read-only sysctl to indicate the presence of optional * kernel features. 
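/*
 * Illustrative sketch, not part of this change: exposing an sbintime_t
 * interval in milliseconds with SYSCTL_SBINTIME_MSEC() from above.  The
 * variable and names are assumptions; SBT_1S comes from <sys/time.h>.
 */
static sbintime_t example_interval = SBT_1S;
SYSCTL_SBINTIME_MSEC(_debug, OID_AUTO, example_interval_ms, 0,
    &example_interval, "Example interval, reported in milliseconds");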
*/ #define FEATURE(name, desc) \ SYSCTL_INT_WITH_LABEL(_kern_features, OID_AUTO, name, \ CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1, desc, "feature") #endif /* _KERNEL */ /* * Top-level identifiers */ -#define CTL_UNSPEC 0 /* unused */ +#define CTL_SYSCTL 0 /* "magic" numbers */ #define CTL_KERN 1 /* "high kernel": proc, limits */ #define CTL_VM 2 /* virtual memory */ #define CTL_VFS 3 /* filesystem, mount type is next */ #define CTL_NET 4 /* network, see socket.h */ #define CTL_DEBUG 5 /* debugging parameters */ #define CTL_HW 6 /* generic cpu/io */ #define CTL_MACHDEP 7 /* machine dependent */ #define CTL_USER 8 /* user-level */ #define CTL_P1003_1B 9 /* POSIX 1003.1B */ + +/* + * CTL_SYSCTL identifiers + */ +#define CTL_SYSCTL_DEBUG 0 /* printf all nodes */ +#define CTL_SYSCTL_NAME 1 /* string name of OID */ +#define CTL_SYSCTL_NEXT 2 /* next OID */ +#define CTL_SYSCTL_NAME2OID 3 /* int array of name */ +#define CTL_SYSCTL_OIDFMT 4 /* OID's kind and format */ +#define CTL_SYSCTL_OIDDESCR 5 /* OID's description */ +#define CTL_SYSCTL_OIDLABEL 6 /* aggregation label */ /* * CTL_KERN identifiers */ #define KERN_OSTYPE 1 /* string: system version */ #define KERN_OSRELEASE 2 /* string: system release */ #define KERN_OSREV 3 /* int: system revision */ #define KERN_VERSION 4 /* string: compile time info */ #define KERN_MAXVNODES 5 /* int: max vnodes */ #define KERN_MAXPROC 6 /* int: max processes */ #define KERN_MAXFILES 7 /* int: max open files */ #define KERN_ARGMAX 8 /* int: max arguments to exec */ #define KERN_SECURELVL 9 /* int: system security level */ #define KERN_HOSTNAME 10 /* string: hostname */ #define KERN_HOSTID 11 /* int: host identifier */ #define KERN_CLOCKRATE 12 /* struct: struct clockrate */ #define KERN_VNODE 13 /* struct: vnode structures */ #define KERN_PROC 14 /* struct: process entries */ #define KERN_FILE 15 /* struct: file entries */ #define KERN_PROF 16 /* node: kernel profiling info */ #define KERN_POSIX1 17 /* int: POSIX.1 version */ #define KERN_NGROUPS 18 /* int: # of supplemental group ids */ #define KERN_JOB_CONTROL 19 /* int: is job control available */ #define KERN_SAVED_IDS 20 /* int: saved set-user/group-ID */ #define KERN_BOOTTIME 21 /* struct: time kernel was booted */ #define KERN_NISDOMAINNAME 22 /* string: YP domain name */ #define KERN_UPDATEINTERVAL 23 /* int: update process sleep time */ #define KERN_OSRELDATE 24 /* int: kernel release date */ #define KERN_NTP_PLL 25 /* node: NTP PLL control */ #define KERN_BOOTFILE 26 /* string: name of booted kernel */ #define KERN_MAXFILESPERPROC 27 /* int: max open files per proc */ #define KERN_MAXPROCPERUID 28 /* int: max processes per uid */ #define KERN_DUMPDEV 29 /* struct cdev *: device to dump on */ #define KERN_IPC 30 /* node: anything related to IPC */ #define KERN_DUMMY 31 /* unused */ #define KERN_PS_STRINGS 32 /* int: address of PS_STRINGS */ #define KERN_USRSTACK 33 /* int: address of USRSTACK */ #define KERN_LOGSIGEXIT 34 /* int: do we log sigexit procs? 
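/*
 * Userland sketch, not part of this change: the new CTL_SYSCTL_* names
 * label the "magic" meta-oids that previously had to be written as bare
 * numbers.  For example, translating a name to its numeric OID (what
 * sysctlnametomib(3) does internally) goes through CTL_SYSCTL_NAME2OID:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	int qoid[2] = { CTL_SYSCTL, CTL_SYSCTL_NAME2OID };
	int oid[CTL_MAXNAME];
	size_t len = sizeof(oid);
	const char *name = "kern.ostype";
	u_int i, n;

	if (sysctl(qoid, 2, oid, &len, name, strlen(name)) == -1)
		return (1);
	n = len / sizeof(int);
	for (i = 0; i < n; i++)
		printf("%d%s", oid[i], i + 1 < n ? "." : "\n");
	return (0);
}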
*/ #define KERN_IOV_MAX 35 /* int: value of UIO_MAXIOV */ #define KERN_HOSTUUID 36 /* string: host UUID identifier */ #define KERN_ARND 37 /* int: from arc4rand() */ #define KERN_MAXPHYS 38 /* int: MAXPHYS value */ /* * KERN_PROC subtypes */ #define KERN_PROC_ALL 0 /* everything */ #define KERN_PROC_PID 1 /* by process id */ #define KERN_PROC_PGRP 2 /* by process group id */ #define KERN_PROC_SESSION 3 /* by session of pid */ #define KERN_PROC_TTY 4 /* by controlling tty */ #define KERN_PROC_UID 5 /* by effective uid */ #define KERN_PROC_RUID 6 /* by real uid */ #define KERN_PROC_ARGS 7 /* get/set arguments/proctitle */ #define KERN_PROC_PROC 8 /* only return procs */ #define KERN_PROC_SV_NAME 9 /* get syscall vector name */ #define KERN_PROC_RGID 10 /* by real group id */ #define KERN_PROC_GID 11 /* by effective group id */ #define KERN_PROC_PATHNAME 12 /* path to executable */ #define KERN_PROC_OVMMAP 13 /* Old VM map entries for process */ #define KERN_PROC_OFILEDESC 14 /* Old file descriptors for process */ #define KERN_PROC_KSTACK 15 /* Kernel stacks for process */ #define KERN_PROC_INC_THREAD 0x10 /* * modifier for pid, pgrp, tty, * uid, ruid, gid, rgid and proc * This effectively uses 16-31 */ #define KERN_PROC_VMMAP 32 /* VM map entries for process */ #define KERN_PROC_FILEDESC 33 /* File descriptors for process */ #define KERN_PROC_GROUPS 34 /* process groups */ #define KERN_PROC_ENV 35 /* get environment */ #define KERN_PROC_AUXV 36 /* get ELF auxiliary vector */ #define KERN_PROC_RLIMIT 37 /* process resource limits */ #define KERN_PROC_PS_STRINGS 38 /* get ps_strings location */ #define KERN_PROC_UMASK 39 /* process umask */ #define KERN_PROC_OSREL 40 /* osreldate for process binary */ #define KERN_PROC_SIGTRAMP 41 /* signal trampoline location */ #define KERN_PROC_CWD 42 /* process current working directory */ #define KERN_PROC_NFDS 43 /* number of open file descriptors */ /* * KERN_IPC identifiers */ #define KIPC_MAXSOCKBUF 1 /* int: max size of a socket buffer */ #define KIPC_SOCKBUF_WASTE 2 /* int: wastage factor in sockbuf */ #define KIPC_SOMAXCONN 3 /* int: max length of connection q */ #define KIPC_MAX_LINKHDR 4 /* int: max length of link header */ #define KIPC_MAX_PROTOHDR 5 /* int: max length of network header */ #define KIPC_MAX_HDR 6 /* int: max total length of headers */ #define KIPC_MAX_DATALEN 7 /* int: max length of data? */ /* * CTL_HW identifiers */ #define HW_MACHINE 1 /* string: machine class */ #define HW_MODEL 2 /* string: specific machine model */ #define HW_NCPU 3 /* int: number of cpus */ #define HW_BYTEORDER 4 /* int: machine byte order */ #define HW_PHYSMEM 5 /* int: total memory */ #define HW_USERMEM 6 /* int: non-kernel memory */ #define HW_PAGESIZE 7 /* int: software page size */ #define HW_DISKNAMES 8 /* strings: disk drive names */ #define HW_DISKSTATS 9 /* struct: diskstats[] */ #define HW_FLOATINGPT 10 /* int: has HW floating point? 
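/*
 * Userland sketch, not part of this change: fetching the kinfo_proc
 * record for the current process through the KERN_PROC_PID subtype
 * listed above.
 */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/user.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, (int)getpid() };
	struct kinfo_proc kp;
	size_t len = sizeof(kp);

	if (sysctl(mib, 4, &kp, &len, NULL, 0) == -1)
		return (1);
	printf("pid %d comm %s\n", (int)kp.ki_pid, kp.ki_comm);
	return (0);
}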
*/ #define HW_MACHINE_ARCH 11 /* string: machine architecture */ #define HW_REALMEM 12 /* int: 'real' memory */ /* * CTL_USER definitions */ #define USER_CS_PATH 1 /* string: _CS_PATH */ #define USER_BC_BASE_MAX 2 /* int: BC_BASE_MAX */ #define USER_BC_DIM_MAX 3 /* int: BC_DIM_MAX */ #define USER_BC_SCALE_MAX 4 /* int: BC_SCALE_MAX */ #define USER_BC_STRING_MAX 5 /* int: BC_STRING_MAX */ #define USER_COLL_WEIGHTS_MAX 6 /* int: COLL_WEIGHTS_MAX */ #define USER_EXPR_NEST_MAX 7 /* int: EXPR_NEST_MAX */ #define USER_LINE_MAX 8 /* int: LINE_MAX */ #define USER_RE_DUP_MAX 9 /* int: RE_DUP_MAX */ #define USER_POSIX2_VERSION 10 /* int: POSIX2_VERSION */ #define USER_POSIX2_C_BIND 11 /* int: POSIX2_C_BIND */ #define USER_POSIX2_C_DEV 12 /* int: POSIX2_C_DEV */ #define USER_POSIX2_CHAR_TERM 13 /* int: POSIX2_CHAR_TERM */ #define USER_POSIX2_FORT_DEV 14 /* int: POSIX2_FORT_DEV */ #define USER_POSIX2_FORT_RUN 15 /* int: POSIX2_FORT_RUN */ #define USER_POSIX2_LOCALEDEF 16 /* int: POSIX2_LOCALEDEF */ #define USER_POSIX2_SW_DEV 17 /* int: POSIX2_SW_DEV */ #define USER_POSIX2_UPE 18 /* int: POSIX2_UPE */ #define USER_STREAM_MAX 19 /* int: POSIX2_STREAM_MAX */ #define USER_TZNAME_MAX 20 /* int: POSIX2_TZNAME_MAX */ #define CTL_P1003_1B_ASYNCHRONOUS_IO 1 /* boolean */ #define CTL_P1003_1B_MAPPED_FILES 2 /* boolean */ #define CTL_P1003_1B_MEMLOCK 3 /* boolean */ #define CTL_P1003_1B_MEMLOCK_RANGE 4 /* boolean */ #define CTL_P1003_1B_MEMORY_PROTECTION 5 /* boolean */ #define CTL_P1003_1B_MESSAGE_PASSING 6 /* boolean */ #define CTL_P1003_1B_PRIORITIZED_IO 7 /* boolean */ #define CTL_P1003_1B_PRIORITY_SCHEDULING 8 /* boolean */ #define CTL_P1003_1B_REALTIME_SIGNALS 9 /* boolean */ #define CTL_P1003_1B_SEMAPHORES 10 /* boolean */ #define CTL_P1003_1B_FSYNC 11 /* boolean */ #define CTL_P1003_1B_SHARED_MEMORY_OBJECTS 12 /* boolean */ #define CTL_P1003_1B_SYNCHRONIZED_IO 13 /* boolean */ #define CTL_P1003_1B_TIMERS 14 /* boolean */ #define CTL_P1003_1B_AIO_LISTIO_MAX 15 /* int */ #define CTL_P1003_1B_AIO_MAX 16 /* int */ #define CTL_P1003_1B_AIO_PRIO_DELTA_MAX 17 /* int */ #define CTL_P1003_1B_DELAYTIMER_MAX 18 /* int */ #define CTL_P1003_1B_MQ_OPEN_MAX 19 /* int */ #define CTL_P1003_1B_PAGESIZE 20 /* int */ #define CTL_P1003_1B_RTSIG_MAX 21 /* int */ #define CTL_P1003_1B_SEM_NSEMS_MAX 22 /* int */ #define CTL_P1003_1B_SEM_VALUE_MAX 23 /* int */ #define CTL_P1003_1B_SIGQUEUE_MAX 24 /* int */ #define CTL_P1003_1B_TIMER_MAX 25 /* int */ #define CTL_P1003_1B_MAXID 26 #ifdef _KERNEL /* * Declare some common oids. 
*/ extern struct sysctl_oid_list sysctl__children; SYSCTL_DECL(_kern); SYSCTL_DECL(_kern_features); SYSCTL_DECL(_kern_ipc); SYSCTL_DECL(_kern_proc); SYSCTL_DECL(_kern_sched); SYSCTL_DECL(_kern_sched_stats); SYSCTL_DECL(_sysctl); SYSCTL_DECL(_vm); SYSCTL_DECL(_vm_stats); SYSCTL_DECL(_vm_stats_misc); SYSCTL_DECL(_vfs); SYSCTL_DECL(_net); SYSCTL_DECL(_debug); SYSCTL_DECL(_debug_sizeof); SYSCTL_DECL(_dev); SYSCTL_DECL(_hw); SYSCTL_DECL(_hw_bus); SYSCTL_DECL(_hw_bus_devices); SYSCTL_DECL(_hw_bus_info); SYSCTL_DECL(_machdep); SYSCTL_DECL(_user); SYSCTL_DECL(_compat); SYSCTL_DECL(_regression); SYSCTL_DECL(_security); SYSCTL_DECL(_security_bsd); extern char machine[]; extern char osrelease[]; extern char ostype[]; extern char kern_ident[]; /* Dynamic oid handling */ struct sysctl_oid *sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int nbr, const char *name, int kind, void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr, const char *label); int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, int recurse); void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name); int sysctl_move_oid(struct sysctl_oid *oidp, struct sysctl_oid_list *parent); int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse); int sysctl_ctx_init(struct sysctl_ctx_list *clist); int sysctl_ctx_free(struct sysctl_ctx_list *clist); struct sysctl_ctx_entry *sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp); struct sysctl_ctx_entry *sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp); int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp); int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags); int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags); int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, const void *new, size_t newlen, size_t *retval, int flags); int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req); void sysctl_wlock(void); void sysctl_wunlock(void); int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len); int kern___sysctlbyname(struct thread *td, const char *name, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags, bool inkernel); struct sbuf; struct sbuf *sbuf_new_for_sysctl(struct sbuf *, char *, int, struct sysctl_req *); #else /* !_KERNEL */ #include __BEGIN_DECLS int sysctl(const int *, u_int, void *, size_t *, const void *, size_t); int sysctlbyname(const char *, void *, size_t *, const void *, size_t); int sysctlnametomib(const char *, int *, size_t *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_SYSCTL_H_ */ Index: projects/clang900-import/sys/vm/vm_glue.c =================================================================== --- projects/clang900-import/sys/vm/vm_glue.c (revision 352536) +++ projects/clang900-import/sys/vm/vm_glue.c (revision 352537) @@ -1,602 +1,602 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include "opt_kstack_usage_prof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. In most cases * just checking the vm_map_entry is sufficient within the kernel's address * space. 
*/ int kernacc(void *addr, int len, int rw) { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); if ((vm_offset_t)addr + len > vm_map_max(kernel_map) || (vm_offset_t)addr + len < (vm_offset_t)addr) return (FALSE); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. vmapbuf(), * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be * used in conjunction with this call. */ int useracc(void *addr, int len, int rw) { boolean_t rv; vm_prot_t prot; vm_map_t map; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; map = &curproc->p_vmspace->vm_map; if ((vm_offset_t)addr + len > vm_map_max(map) || (vm_offset_t)addr + len < (vm_offset_t)addr) { return (FALSE); } vm_map_lock_read(map); rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); vm_map_unlock_read(map); return (rv == TRUE); } int vslock(void *addr, size_t len) { vm_offset_t end, last, start; vm_size_t npages; int error; last = (vm_offset_t)addr + len; start = trunc_page((vm_offset_t)addr); end = round_page(last); if (last < (vm_offset_t)addr || end < (vm_offset_t)addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_user_wired) return (ENOMEM); error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (error == KERN_SUCCESS) { curthread->td_vslock_sz += len; return (0); } /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ return (EFAULT); } void vsunlock(void *addr, size_t len) { /* Rely on the parameter sanity checks performed by vslock(). */ MPASS(curthread->td_vslock_sz >= len); curthread->td_vslock_sz -= len; (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); } /* * Pin the page contained within the given object at the given offset. If the * page is not resident, allocate and load it using the given object's pager. * Return the pinned page if successful; otherwise, return NULL. */ static vm_page_t vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m; vm_pindex_t pindex; pindex = OFF_TO_IDX(offset); VM_OBJECT_WLOCK(object); (void)vm_page_grab_valid(&m, object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); VM_OBJECT_WUNLOCK(object); return (m); } /* * Return a CPU private mapping to the page at the given offset within the * given object. The page is pinned before it is mapped. */ struct sf_buf * vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m; m = vm_imgact_hold_page(object, offset); if (m == NULL) return (NULL); sched_pin(); return (sf_buf_alloc(m, SFB_CPUPRIVATE)); } /* * Destroy the given CPU private mapping and unpin the page that it mapped. 
*/ void vm_imgact_unmap_page(struct sf_buf *sf) { vm_page_t m; m = sf_buf_page(sf); sf_buf_free(sf); sched_unpin(); vm_page_unwire(m, PQ_ACTIVE); } void vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz) { pmap_sync_icache(map->pmap, va, sz); } static uma_zone_t kstack_cache; static int kstack_cache_size = 128; static int kstack_domain_iter; static int sysctl_kstack_cache_size(SYSCTL_HANDLER_ARGS) { int error, newsize; newsize = kstack_cache_size; error = sysctl_handle_int(oidp, &newsize, 0, req); if (error == 0 && req->newptr && newsize != kstack_cache_size) kstack_cache_size = uma_zone_set_maxcache(kstack_cache, newsize); return (error); } SYSCTL_PROC(_vm, OID_AUTO, kstack_cache_size, CTLTYPE_INT|CTLFLAG_RW, &kstack_cache_size, 0, sysctl_kstack_cache_size, "IU", "Maximum number of cached kernel stacks"); /* * Create the kernel stack (including pcb for i386) for a new thread. * This routine directly affects the fork perf for a process and * create performance for a thread. */ static vm_offset_t vm_thread_stack_create(struct domainset *ds, vm_object_t *ksobjp, int pages) { vm_page_t ma[KSTACK_MAX_PAGES]; vm_object_t ksobj; vm_offset_t ks; int i; /* * Allocate an object for the kstack. */ ksobj = vm_object_allocate(OBJT_DEFAULT, pages); /* * Get a kernel virtual address for this thread's kstack. */ #if defined(__mips__) /* * We need to align the kstack's mapped address to fit within * a single TLB entry. */ if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE, PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_BESTFIT | M_NOWAIT, &ks)) { ks = 0; } #else ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); #endif if (ks == 0) { - printf("vm_thread_new: kstack allocation failed\n"); + printf("%s: kstack allocation failed\n", __func__); vm_object_deallocate(ksobj); return (0); } if (vm_ndomains > 1) { ksobj->domain.dr_policy = ds; ksobj->domain.dr_iter = atomic_fetchadd_int(&kstack_domain_iter, 1); } if (KSTACK_GUARD_PAGES != 0) { pmap_qremove(ks, KSTACK_GUARD_PAGES); ks += KSTACK_GUARD_PAGES * PAGE_SIZE; } /* * For the length of the stack, link in a real page of ram for each * page of stack. */ VM_OBJECT_WLOCK(ksobj); (void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, ma, pages); for (i = 0; i < pages; i++) ma[i]->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(ksobj); pmap_qenter(ks, ma, pages); *ksobjp = ksobj; return (ks); } static void vm_thread_stack_dispose(vm_object_t ksobj, vm_offset_t ks, int pages) { vm_page_t m; int i; pmap_qremove(ks, pages); VM_OBJECT_WLOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) - panic("vm_thread_dispose: kstack already missing?"); + panic("%s: kstack already missing?", __func__); vm_page_unwire_noq(m); vm_page_free(m); } VM_OBJECT_WUNLOCK(ksobj); vm_object_deallocate(ksobj); kva_free(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE), (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); } /* * Allocate the kernel stack for a new thread. */ int vm_thread_new(struct thread *td, int pages) { vm_object_t ksobj; vm_offset_t ks; /* Bounds check */ if (pages <= 1) pages = kstack_pages; else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; ks = 0; ksobj = NULL; if (pages == kstack_pages && kstack_cache != NULL) { ks = (vm_offset_t)uma_zalloc(kstack_cache, M_NOWAIT); if (ks != 0) ksobj = PHYS_TO_VM_PAGE(pmap_kextract(ks))->object; } /* * Ensure that kstack objects can draw pages from any memory * domain. Otherwise a local memory shortage can block a process * swap-in. 
*/ if (ks == 0) ks = vm_thread_stack_create(DOMAINSET_PREF(PCPU_GET(domain)), &ksobj, pages); if (ks == 0) return (0); td->td_kstack_obj = ksobj; td->td_kstack = ks; td->td_kstack_pages = pages; return (1); } /* * Dispose of a thread's kernel stack. */ void vm_thread_dispose(struct thread *td) { vm_object_t ksobj; vm_offset_t ks; int pages; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; ks = td->td_kstack; td->td_kstack = 0; td->td_kstack_pages = 0; if (pages == kstack_pages) uma_zfree(kstack_cache, (void *)ks); else vm_thread_stack_dispose(ksobj, ks, pages); } static int kstack_import(void *arg, void **store, int cnt, int domain, int flags) { struct domainset *ds; vm_object_t ksobj; int i; if (domain == UMA_ANYDOMAIN) ds = DOMAINSET_RR(); else ds = DOMAINSET_PREF(domain); for (i = 0; i < cnt; i++) { store[i] = (void *)vm_thread_stack_create(ds, &ksobj, kstack_pages); if (store[i] == NULL) break; } return (i); } static void kstack_release(void *arg, void **store, int cnt) { vm_offset_t ks; int i; for (i = 0; i < cnt; i++) { ks = (vm_offset_t)store[i]; vm_thread_stack_dispose( PHYS_TO_VM_PAGE(pmap_kextract(ks))->object, ks, kstack_pages); } } static void kstack_cache_init(void *null) { kstack_cache = uma_zcache_create("kstack_cache", kstack_pages * PAGE_SIZE, NULL, NULL, NULL, NULL, kstack_import, kstack_release, NULL, UMA_ZONE_NUMA|UMA_ZONE_MINBUCKET); uma_zone_set_maxcache(kstack_cache, kstack_cache_size); } SYSINIT(vm_kstacks, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, kstack_cache_init, NULL); #ifdef KSTACK_USAGE_PROF /* * Track maximum stack used by a thread in kernel. */ static int max_kstack_used; SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD, &max_kstack_used, 0, "Maxiumum stack depth used by a thread in kernel"); void intr_prof_stack_use(struct thread *td, struct trapframe *frame) { vm_offset_t stack_top; vm_offset_t current; int used, prev_used; /* * Testing for interrupted kernel mode isn't strictly * needed. It optimizes the execution, since interrupts from * usermode will have only the trap frame on the stack. */ if (TRAPF_USERMODE(frame)) return; stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE; current = (vm_offset_t)(uintptr_t)&stack_top; /* * Try to detect if interrupt is using kernel thread stack. * Hardware could use a dedicated stack for interrupt handling. */ if (stack_top <= current || current < td->td_kstack) return; used = stack_top - current; for (;;) { prev_used = max_kstack_used; if (prev_used >= used) break; if (atomic_cmpset_int(&max_kstack_used, prev_used, used)) break; } } #endif /* KSTACK_USAGE_PROF */ /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ int vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2, struct vmspace *vm2, int flags) { struct proc *p1 = td->td_proc; struct domainset *dset; int error; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. 
*/ if ((flags & RFMEM) == 0) { if (p1->p_vmspace->vm_refcnt > 1) { error = vmspace_unshare(p1); if (error) return (error); } } cpu_fork(td, p2, td2, flags); return (0); } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; atomic_add_int(&p1->p_vmspace->vm_refcnt, 1); } dset = td2->td_domain.dr_policy; while (vm_page_count_severe_set(&dset->ds_mask)) { vm_wait_doms(&dset->ds_mask); } if ((flags & RFMEM) == 0) { p2->p_vmspace = vm2; if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(td, p2, td2, flags); return (0); } /* * Called after process has been wait(2)'ed upon and is being reaped. * The idea is to reclaim resources that we could not reclaim while * the process was still executing. */ void vm_waitproc(p) struct proc *p; { vmspace_exitfree(p); /* and clean-out the vmspace */ } void kick_proc0(void) { wakeup(&proc0); } Index: projects/clang900-import/tests/sys/netpfil/common/forward.sh =================================================================== --- projects/clang900-import/tests/sys/netpfil/common/forward.sh (revision 352536) +++ projects/clang900-import/tests/sys/netpfil/common/forward.sh (revision 352537) @@ -1,101 +1,105 @@ #- # SPDX-License-Identifier: BSD-2-Clause-FreeBSD # # Copyright (c) 2019 Ahsan Barkati # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # $FreeBSD$ # . $(atf_get_srcdir)/utils.subr . 
$(atf_get_srcdir)/runner.subr v4_head() { atf_set descr 'Basic forwarding test' atf_set require.user root atf_set require.progs scapy } v4_body() { firewall=$1 + if [ "$(atf_config_get ci false)" = "true" ] && \ + [ "$(uname -p)" = "i386" ] && [ "${firewall}" = "pf" ]; then + atf_skip "https://bugs.freebsd.org/240085" + fi firewall_init $firewall epair_send=$(vnet_mkepair) ifconfig ${epair_send}a 192.0.2.1/24 up epair_recv=$(vnet_mkepair) ifconfig ${epair_recv}a up vnet_mkjail iron ${epair_send}b ${epair_recv}b jexec iron ifconfig ${epair_send}b 192.0.2.2/24 up jexec iron ifconfig ${epair_recv}b 198.51.100.2/24 up jexec iron sysctl net.inet.ip.forwarding=1 jexec iron arp -s 198.51.100.3 00:01:02:03:04:05 route add -net 198.51.100.0/24 192.0.2.2 atf_check -s exit:0 $(atf_get_srcdir)/pft_ping.py \ --sendif ${epair_send}a \ --to 198.51.100.3 \ --recvif ${epair_recv}a firewall_config "iron" ${firewall} \ "pf" \ "block in" \ "ipfw" \ "ipfw -q add 100 deny all from any to any in" \ "ipf" \ "block in all" \ atf_check -s exit:1 $(atf_get_srcdir)/pft_ping.py \ --sendif ${epair_send}a \ --to 198.51.100.3 \ --recvif ${epair_recv}a firewall_config "iron" ${firewall} \ "pf" \ "block out" \ "ipfw" \ "ipfw -q add 100 deny all from any to any out" \ "ipf" \ "block out all" \ atf_check -s exit:1 $(atf_get_srcdir)/pft_ping.py \ --sendif ${epair_send}a \ --to 198.51.100.3 \ --recv ${epair_recv}a } v4_cleanup() { firewall=$1 firewall_cleanup $firewall } setup_tests \ v4 \ pf \ ipfw \ ipf Index: projects/clang900-import/tests/sys/netpfil/common/tos.sh =================================================================== --- projects/clang900-import/tests/sys/netpfil/common/tos.sh (revision 352536) +++ projects/clang900-import/tests/sys/netpfil/common/tos.sh (revision 352537) @@ -1,118 +1,122 @@ #- # SPDX-License-Identifier: BSD-2-Clause-FreeBSD # # Copyright (c) 2019 Ahsan Barkati # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # $FreeBSD$ # . $(atf_get_srcdir)/utils.subr . 
$(atf_get_srcdir)/runner.subr tos_head() { atf_set descr 'set-tos test' atf_set require.user root atf_set require.progs scapy } tos_body() { firewall=$1 + if [ "$(atf_config_get ci false)" = "true" ] && \ + [ "$(uname -p)" = "i386" ] && [ "${firewall}" = "pf" ]; then + atf_skip "https://bugs.freebsd.org/240086" + fi firewall_init $firewall epair_send=$(vnet_mkepair) ifconfig ${epair_send}a 192.0.2.1/24 up epair_recv=$(vnet_mkepair) ifconfig ${epair_recv}a up vnet_mkjail iron ${epair_send}b ${epair_recv}b jexec iron ifconfig ${epair_send}b 192.0.2.2/24 up jexec iron ifconfig ${epair_recv}b 198.51.100.2/24 up jexec iron sysctl net.inet.ip.forwarding=1 jexec iron arp -s 198.51.100.3 00:01:02:03:04:05 route add -net 198.51.100.0/24 192.0.2.2 # Check if the firewall is able to set the ToS bits firewall_config "iron" ${firewall} \ "pf" \ "scrub out proto icmp set-tos 36" \ "ipfw" \ "ipfw -q add 100 setdscp 9 ip from any to any" # dscp is set to 9 because last two bits are for # EN and hence tos would be 36 atf_check -s exit:0 $(atf_get_srcdir)/pft_ping.py \ --sendif ${epair_send}a \ --to 198.51.100.3 \ --recvif ${epair_recv}a \ --expect-tos 36 # Check if the firewall is able to set the ToS bits # and persists the EN bits (if already set) firewall_config "iron" ${firewall} \ "pf" \ "scrub out proto icmp set-tos 36" \ "ipfw" \ "ipfw -q add 100 setdscp 9 ip from any to any" atf_check -s exit:0 $(atf_get_srcdir)/pft_ping.py \ --sendif ${epair_send}a \ --to 198.51.100.3 \ --recvif ${epair_recv}a \ --send-tos 3 \ --expect-tos 39 # Check if the firewall is able to filter the # packets based on the ToS value firewall_config "iron" ${firewall} \ "pf" \ "block all tos 36" \ "ipfw" \ "ipfw -q add 100 deny all from any to any dscp 9" atf_check -s exit:1 $(atf_get_srcdir)/pft_ping.py \ --sendif ${epair_send}a \ --to 198.51.100.3 \ --recvif ${epair_recv}a \ --send-tos 36 atf_check -s exit:0 $(atf_get_srcdir)/pft_ping.py \ --sendif ${epair_send}a \ --to 198.51.100.3 \ --recvif ${epair_recv}a \ --send-tos 32 } tos_cleanup() { firewall=$1 firewall_cleanup $firewall } setup_tests \ "tos" \ "pf" \ "ipfw" Index: projects/clang900-import/tests/sys/vm/Makefile =================================================================== --- projects/clang900-import/tests/sys/vm/Makefile (revision 352536) +++ projects/clang900-import/tests/sys/vm/Makefile (revision 352537) @@ -1,10 +1,11 @@ # $FreeBSD$ PACKAGE= tests TESTSDIR= ${TESTSBASE}/sys/vm ATF_TESTS_C+= mlock_test \ - mmap_test + mmap_test \ + page_fault_signal .include Index: projects/clang900-import/tests/sys/vm/page_fault_signal.c =================================================================== --- projects/clang900-import/tests/sys/vm/page_fault_signal.c (nonexistent) +++ projects/clang900-import/tests/sys/vm/page_fault_signal.c (revision 352537) @@ -0,0 +1,184 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2019 Jilles Tjoelker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include + +#include +#include +#include +#include +#include + +static sigjmp_buf sig_env; +static volatile int last_sig, last_code; + +static void +sighandler(int sig, siginfo_t *info, void *context __unused) +{ + + last_sig = sig; + last_code = info->si_code; + siglongjmp(sig_env, 1); +} + +static void +setup_signals(void) +{ + struct sigaction sa; + int r; + + sa.sa_sigaction = sighandler; + sa.sa_flags = SA_RESTART | SA_RESETHAND | SA_SIGINFO; + r = sigfillset(&sa.sa_mask); + ATF_REQUIRE(r != -1); + r = sigaction(SIGILL, &sa, NULL); + ATF_REQUIRE(r != -1); + r = sigaction(SIGBUS, &sa, NULL); + ATF_REQUIRE(r != -1); + r = sigaction(SIGSEGV, &sa, NULL); + ATF_REQUIRE(r != -1); +} + +ATF_TC_WITHOUT_HEAD(page_fault_signal__segv_maperr_1); +ATF_TC_BODY(page_fault_signal__segv_maperr_1, tc) +{ + int *p; + int r; + int sz; + + sz = getpagesize(); + p = mmap(NULL, sz, PROT_READ, MAP_ANON, -1, 0); + ATF_REQUIRE(p != MAP_FAILED); + r = munmap(p, sz); + ATF_REQUIRE(r != -1); + if (sigsetjmp(sig_env, 1) == 0) { + setup_signals(); + *(volatile int *)p = 1; + } + ATF_CHECK_EQ(SIGSEGV, last_sig); + ATF_CHECK_EQ(SEGV_MAPERR, last_code); +} + +ATF_TC_WITHOUT_HEAD(page_fault_signal__segv_accerr_1); +ATF_TC_BODY(page_fault_signal__segv_accerr_1, tc) +{ + int *p; + int sz; + + sz = getpagesize(); + p = mmap(NULL, sz, PROT_READ, MAP_ANON, -1, 0); + ATF_REQUIRE(p != MAP_FAILED); + if (sigsetjmp(sig_env, 1) == 0) { + setup_signals(); + *(volatile int *)p = 1; + } + (void)munmap(p, sz); + ATF_CHECK_EQ(SIGSEGV, last_sig); + ATF_CHECK_EQ(SEGV_ACCERR, last_code); +} + +ATF_TC_WITHOUT_HEAD(page_fault_signal__segv_accerr_2); +ATF_TC_BODY(page_fault_signal__segv_accerr_2, tc) +{ + int *p; + volatile int dummy; + int sz; + + sz = getpagesize(); + p = mmap(NULL, sz, PROT_NONE, MAP_ANON, -1, 0); + ATF_REQUIRE(p != MAP_FAILED); + if (sigsetjmp(sig_env, 1) == 0) { + setup_signals(); + dummy = *p; + } + (void)munmap(p, sz); + ATF_CHECK_EQ(SIGSEGV, last_sig); + ATF_CHECK_EQ(SEGV_ACCERR, last_code); +} + +ATF_TC_WITHOUT_HEAD(page_fault_signal__bus_objerr_1); +ATF_TC_BODY(page_fault_signal__bus_objerr_1, tc) +{ + int *p; + int fd; + int sz; + + atf_tc_expect_fail("bug 211924"); + sz = getpagesize(); + fd = shm_open(SHM_ANON, O_RDWR | O_CREAT, 0600); + ATF_REQUIRE(fd != -1); + p = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + ATF_REQUIRE(p != MAP_FAILED); + if (sigsetjmp(sig_env, 1) == 0) { + setup_signals(); + *(volatile int *)p = 1; + } + (void)munmap(p, sz); + (void)close(fd); + ATF_CHECK_EQ(SIGBUS, last_sig); + ATF_CHECK_EQ(BUS_OBJERR, last_code); +} + +ATF_TC_WITHOUT_HEAD(page_fault_signal__bus_objerr_2); +ATF_TC_BODY(page_fault_signal__bus_objerr_2, tc) +{ + int *p; + int fd; + int r; + int sz; + + 
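+	/*
+	 * Map two pages of a one-page shared memory object and touch the
+	 * second page; the fault past the object's end is expected to
+	 * deliver SIGBUS with si_code BUS_OBJERR.
+	 */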
atf_tc_expect_fail("bug 211924"); + sz = getpagesize(); + fd = shm_open(SHM_ANON, O_RDWR | O_CREAT, 0600); + ATF_REQUIRE(fd != -1); + r = ftruncate(fd, sz); + ATF_REQUIRE(r != -1); + p = mmap(NULL, sz * 2, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + ATF_REQUIRE(p != MAP_FAILED); + if (sigsetjmp(sig_env, 1) == 0) { + setup_signals(); + ((volatile int *)p)[sz / sizeof(int)] = 1; + } + (void)munmap(p, sz * 2); + (void)close(fd); + ATF_CHECK_EQ(SIGBUS, last_sig); + ATF_CHECK_EQ(BUS_OBJERR, last_code); +} + +ATF_TP_ADD_TCS(tp) +{ + + ATF_TP_ADD_TC(tp, page_fault_signal__segv_maperr_1); + ATF_TP_ADD_TC(tp, page_fault_signal__segv_accerr_1); + ATF_TP_ADD_TC(tp, page_fault_signal__segv_accerr_2); + ATF_TP_ADD_TC(tp, page_fault_signal__bus_objerr_1); + ATF_TP_ADD_TC(tp, page_fault_signal__bus_objerr_2); + + return (atf_no_error()); +} Property changes on: projects/clang900-import/tests/sys/vm/page_fault_signal.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang900-import/tools/build/options/WITH_GOOGLETEST =================================================================== --- projects/clang900-import/tools/build/options/WITH_GOOGLETEST (nonexistent) +++ projects/clang900-import/tools/build/options/WITH_GOOGLETEST (revision 352537) @@ -0,0 +1,5 @@ +.\" $FreeBSD$ +Set to build and install +.Lb libgmock , +.Lb libgtest , +and dependent tests. Property changes on: projects/clang900-import/tools/build/options/WITH_GOOGLETEST ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang900-import/usr.bin/truss/syscall.h =================================================================== --- projects/clang900-import/usr.bin/truss/syscall.h (revision 352536) +++ projects/clang900-import/usr.bin/truss/syscall.h (revision 352537) @@ -1,276 +1,277 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright 1997 Sean Eric Fagan * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan * 4. Neither the name of the author may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * System call arguments come in several flavours: * Hex -- values that should be printed in hex (addresses) * Octal -- Same as above, but octal * Int -- normal integer values (file descriptors, for example) * LongHex -- long value that should be printed in hex * Name -- pointer to a NULL-terminated string. * BinString -- pointer to an array of chars, printed via strvisx(). * Ptr -- pointer to some unspecified structure. Just print as hex for now. * Stat -- a pointer to a stat buffer. Prints a couple fields. * Stat11 -- a pointer to a freebsd 11 stat buffer. Prints a couple fields. * StatFs -- a pointer to a statfs buffer. Prints a few fields. * Ioctl -- an ioctl command. Woefully limited. * Quad -- a double-word value. e.g., lseek(int, offset_t, int) * Signal -- a signal number. Prints the signal name (SIGxxx) * Sockaddr -- a pointer to a struct sockaddr. Prints symbolic AF, and IP:Port * StringArray -- a pointer to an array of string pointers. * Timespec -- a pointer to a struct timespec. Prints both elements. * Timeval -- a pointer to a struct timeval. Prints both elements. * Timeval2 -- a pointer to two struct timevals. Prints both elements of both. * Itimerval -- a pointer to a struct itimerval. Prints all elements. * Pollfd -- a pointer to an array of struct pollfd. Prints .fd and .events. * Fd_set -- a pointer to an array of fd_set. Prints the fds that are set. * Sigaction -- a pointer to a struct sigaction. Prints all elements. * Sigset -- a pointer to a sigset_t. Prints the signals that are set. * Sigprocmask -- the first argument to sigprocmask(). Prints the name. * Kevent -- a pointer to an array of struct kevents. Prints all elements. * Pathconf -- the 2nd argument of pathconf(). * Utrace -- utrace(2) buffer. * CapRights -- a pointer to a cap_rights_t. Prints all set capabilities. * * In addition, the pointer types (String, Ptr) may have OUT masked in -- * this means that the data is set on *return* from the system call -- or * IN (meaning that the data is passed *into* the system call). */ enum Argtype { None = 1, /* Scalar integers. */ Socklent, Octal, Int, UInt, Hex, Long, LongHex, Sizet, Quad, QuadHex, /* Encoded scalar values. */ Accessmode, Acltype, Atfd, Atflags, CapFcntlRights, Extattrnamespace, Fadvice, Fcntl, Fcntlflag, FileFlags, Flockop, Getfsstatmode, Idtype, Ioctl, Kldsymcmd, Kldunloadflags, Madvice, Minherit, Msgflags, Mlockall, Mmapflags, Mountflags, Mprot, Msync, Open, Pathconf, Pipe2, Procctl, Priowhich, Ptraceop, Quotactlcmd, Reboothowto, Resource, Rforkflags, Rtpriofunc, RusageWho, Schedpolicy, Shutdown, Signal, Sigprocmask, Sockdomain, Sockoptlevel, Sockoptname, Sockprotocol, Socktype, Sysarch, + Sysctl, Umtxop, Waitoptions, Whence, /* Pointers to non-structures. */ Ptr, BinString, CapRights, ExecArgs, ExecEnv, ExitStatus, Fd_set, IntArray, Iovec, Name, PipeFds, PSig, PQuadHex, PUInt, Readlinkres, ShmName, StringArray, /* Pointers to structures. 
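	 * These are copied out of the traced process with ptrace(PT_IO)
	 * before individual fields are formatted.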
*/ Itimerval, Kevent, Kevent11, LinuxSockArgs, Msghdr, Pollfd, Rlimit, Rusage, Schedparam, Sctpsndrcvinfo, Sigaction, Siginfo, Sigset, Sockaddr, Stat, Stat11, StatFs, Timespec, Timespec2, Timeval, Timeval2, Utrace, CloudABIAdvice, CloudABIClockID, CloudABIFDSFlags, CloudABIFDStat, CloudABIFileStat, CloudABIFileType, CloudABIFSFlags, CloudABILookup, CloudABIMFlags, CloudABIMProt, CloudABIMSFlags, CloudABIOFlags, CloudABISDFlags, CloudABISignal, CloudABISockStat, CloudABISSFlags, CloudABITimestamp, CloudABIULFlags, CloudABIWhence, MAX_ARG_TYPE, }; #define ARG_MASK 0xff #define OUT 0x100 #define IN /*0x20*/0 _Static_assert(ARG_MASK > MAX_ARG_TYPE, "ARG_MASK overlaps with Argtype values"); struct syscall_args { enum Argtype type; int offset; }; struct syscall { STAILQ_ENTRY(syscall) entries; const char *name; u_int ret_type; /* 0, 1, or 2 return values */ u_int nargs; /* actual number of meaningful arguments */ /* Hopefully, no syscalls with > 10 args */ struct syscall_args args[10]; struct timespec time; /* Time spent for this call */ int ncalls; /* Number of calls */ int nerror; /* Number of calls that returned with error */ bool unknown; /* Unknown system call */ }; struct syscall *get_syscall(struct threadinfo *, u_int, u_int); char *print_arg(struct syscall_args *, unsigned long*, register_t *, struct trussinfo *); /* * Linux Socket defines */ #define LINUX_SOCKET 1 #define LINUX_BIND 2 #define LINUX_CONNECT 3 #define LINUX_LISTEN 4 #define LINUX_ACCEPT 5 #define LINUX_GETSOCKNAME 6 #define LINUX_GETPEERNAME 7 #define LINUX_SOCKETPAIR 8 #define LINUX_SEND 9 #define LINUX_RECV 10 #define LINUX_SENDTO 11 #define LINUX_RECVFROM 12 #define LINUX_SHUTDOWN 13 #define LINUX_SETSOCKOPT 14 #define LINUX_GETSOCKOPT 15 #define LINUX_SENDMSG 16 #define LINUX_RECVMSG 17 #define PAD_(t) (sizeof(register_t) <= sizeof(t) ? \ 0 : sizeof(register_t) - sizeof(t)) #if BYTE_ORDER == LITTLE_ENDIAN #define PADL_(t) 0 #define PADR_(t) PAD_(t) #else #define PADL_(t) PAD_(t) #define PADR_(t) 0 #endif typedef int l_int; typedef uint32_t l_ulong; struct linux_socketcall_args { char what_l_[PADL_(l_int)]; l_int what; char what_r_[PADR_(l_int)]; char args_l_[PADL_(l_ulong)]; l_ulong args; char args_r_[PADR_(l_ulong)]; }; void init_syscalls(void); void print_syscall(struct trussinfo *); void print_syscall_ret(struct trussinfo *, int, register_t *); void print_summary(struct trussinfo *trussinfo); Index: projects/clang900-import/usr.bin/truss/syscalls.c =================================================================== --- projects/clang900-import/usr.bin/truss/syscalls.c (revision 352536) +++ projects/clang900-import/usr.bin/truss/syscalls.c (revision 352537) @@ -1,2730 +1,2802 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright 1997 Sean Eric Fagan * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan * 4. 
Neither the name of the author may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * This file has routines used to print out system calls and their * arguments. */ #include #include #define _WANT_FREEBSD11_KEVENT #include #include #include #include #include #include #include #define _WANT_FREEBSD11_STAT #include +#include #include #include #include #include #include #include #include #include #include #define _WANT_KERNEL_ERRNO #include #include #include #include #include #include #include #include #include #include #include #include #include #include "truss.h" #include "extern.h" #include "syscall.h" /* * This should probably be in its own file, sorted alphabetically. */ static struct syscall decoded_syscalls[] = { /* Native ABI */ { .name = "__acl_aclcheck_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_aclcheck_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_aclcheck_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_delete_fd", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Acltype, 1 } } }, { .name = "__acl_delete_file", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Acltype, 1 } } }, { .name = "__acl_delete_link", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Acltype, 1 } } }, { .name = "__acl_get_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_get_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_get_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_set_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_set_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_set_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__cap_rights_get", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { CapRights | OUT, 2 } } }, { .name = "__getcwd", .ret_type = 1, .nargs = 2, .args = { { Name | OUT, 0 }, { Int, 1 } } }, { .name = "_umtx_op", .ret_type = 1, .nargs = 5, .args = { { Ptr, 0 }, { Umtxop, 1 }, { LongHex, 2 }, { Ptr, 3 }, { Ptr, 4 } } }, { .name = "accept", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } }, { .name = "access", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Accessmode, 1 } } }, { .name = "bind", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | IN, 1 }, { 
Socklent, 2 } } }, { .name = "bindat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Int, 1 }, { Sockaddr | IN, 2 }, { Int, 3 } } }, { .name = "break", .ret_type = 1, .nargs = 1, .args = { { Ptr, 0 } } }, { .name = "cap_fcntls_get", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CapFcntlRights | OUT, 1 } } }, { .name = "cap_fcntls_limit", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CapFcntlRights, 1 } } }, { .name = "cap_getmode", .ret_type = 1, .nargs = 1, .args = { { PUInt | OUT, 0 } } }, { .name = "cap_rights_limit", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CapRights, 1 } } }, { .name = "chdir", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "chflags", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { FileFlags, 1 } } }, { .name = "chflagsat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { FileFlags, 2 }, { Atflags, 3 } } }, { .name = "chmod", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "chown", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "chroot", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "clock_gettime", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timespec | OUT, 1 } } }, { .name = "close", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "compat11.fstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Stat11 | OUT, 1 } } }, { .name = "compat11.fstatat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Stat11 | OUT, 2 }, { Atflags, 3 } } }, { .name = "compat11.kevent", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { Kevent11, 1 }, { Int, 2 }, { Kevent11 | OUT, 3 }, { Int, 4 }, { Timespec, 5 } } }, { .name = "compat11.lstat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat11 | OUT, 1 } } }, { .name = "compat11.stat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat11 | OUT, 1 } } }, { .name = "connect", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | IN, 1 }, { Socklent, 2 } } }, { .name = "connectat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Int, 1 }, { Sockaddr | IN, 2 }, { Int, 3 } } }, { .name = "dup", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "dup2", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Int, 1 } } }, { .name = "eaccess", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Accessmode, 1 } } }, { .name = "execve", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { ExecArgs | IN, 1 }, { ExecEnv | IN, 2 } } }, { .name = "exit", .ret_type = 0, .nargs = 1, .args = { { Hex, 0 } } }, { .name = "extattr_delete_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } }, { .name = "extattr_delete_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } }, { .name = "extattr_delete_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } }, { .name = "extattr_get_fd", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | OUT, 3 }, { Sizet, 4 } } }, { .name = "extattr_get_file", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | OUT, 3 }, { Sizet, 4 } } }, { .name = "extattr_get_link", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | OUT, 3 }, { Sizet, 4 } } }, { .name = "extattr_list_fd", .ret_type = 1, .nargs = 4, 
.args = { { Int, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 }, { Sizet, 3 } } }, { .name = "extattr_list_file", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 }, { Sizet, 3 } } }, { .name = "extattr_list_link", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 }, { Sizet, 3 } } }, { .name = "extattr_set_fd", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | IN, 3 }, { Sizet, 4 } } }, { .name = "extattr_set_file", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | IN, 3 }, { Sizet, 4 } } }, { .name = "extattr_set_link", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | IN, 3 }, { Sizet, 4 } } }, { .name = "extattrctl", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Hex, 1 }, { Name, 2 }, { Extattrnamespace, 3 }, { Name, 4 } } }, { .name = "faccessat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Accessmode, 2 }, { Atflags, 3 } } }, { .name = "fchflags", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { FileFlags, 1 } } }, { .name = "fchmod", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Octal, 1 } } }, { .name = "fchmodat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 }, { Atflags, 3 } } }, { .name = "fchown", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "fchownat", .ret_type = 1, .nargs = 5, .args = { { Atfd, 0 }, { Name, 1 }, { Int, 2 }, { Int, 3 }, { Atflags, 4 } } }, { .name = "fcntl", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Fcntl, 1 }, { Fcntlflag, 2 } } }, { .name = "fdatasync", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "flock", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Flockop, 1 } } }, { .name = "fstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Stat | OUT, 1 } } }, { .name = "fstatat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Stat | OUT, 2 }, { Atflags, 3 } } }, { .name = "fstatfs", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { StatFs | OUT, 1 } } }, { .name = "fsync", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "ftruncate", .ret_type = 1, .nargs = 2, .args = { { Int | IN, 0 }, { QuadHex | IN, 1 } } }, { .name = "futimens", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timespec2 | IN, 1 } } }, { .name = "futimes", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timeval2 | IN, 1 } } }, { .name = "futimesat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Timeval2 | IN, 2 } } }, { .name = "getdirentries", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Int, 2 }, { PQuadHex | OUT, 3 } } }, { .name = "getfsstat", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Long, 1 }, { Getfsstatmode, 2 } } }, { .name = "getitimer", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Itimerval | OUT, 2 } } }, { .name = "getpeername", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } }, { .name = "getpgid", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "getpriority", .ret_type = 1, .nargs = 2, .args = { { Priowhich, 0 }, { Int, 1 } } }, { .name = "getrandom", .ret_type = 1, .nargs = 3, .args = { { BinString | OUT, 0 }, { Sizet, 1 }, { UInt, 2 } } }, { .name = "getrlimit", .ret_type = 1, .nargs = 2, .args = { { Resource, 0 }, { 
Rlimit | OUT, 1 } } }, { .name = "getrusage", .ret_type = 1, .nargs = 2, .args = { { RusageWho, 0 }, { Rusage | OUT, 1 } } }, { .name = "getsid", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "getsockname", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } }, { .name = "getsockopt", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Sockoptlevel, 1 }, { Sockoptname, 2 }, { Ptr | OUT, 3 }, { Ptr | OUT, 4 } } }, { .name = "gettimeofday", .ret_type = 1, .nargs = 2, .args = { { Timeval | OUT, 0 }, { Ptr, 1 } } }, { .name = "ioctl", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Ioctl, 1 }, { Ptr, 2 } } }, { .name = "kevent", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { Kevent, 1 }, { Int, 2 }, { Kevent | OUT, 3 }, { Int, 4 }, { Timespec, 5 } } }, { .name = "kill", .ret_type = 1, .nargs = 2, .args = { { Int | IN, 0 }, { Signal | IN, 1 } } }, { .name = "kldfind", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "kldfirstmod", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "kldload", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "kldnext", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "kldstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Ptr, 1 } } }, { .name = "kldsym", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Kldsymcmd, 1 }, { Ptr, 2 } } }, { .name = "kldunload", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "kldunloadf", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Kldunloadflags, 1 } } }, { .name = "kse_release", .ret_type = 0, .nargs = 1, .args = { { Timespec, 0 } } }, { .name = "lchflags", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { FileFlags, 1 } } }, { .name = "lchmod", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "lchown", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "link", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Name, 1 } } }, { .name = "linkat", .ret_type = 1, .nargs = 5, .args = { { Atfd, 0 }, { Name, 1 }, { Atfd, 2 }, { Name, 3 }, { Atflags, 4 } } }, { .name = "listen", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Int, 1 } } }, { .name = "lseek", .ret_type = 2, .nargs = 3, .args = { { Int, 0 }, { QuadHex, 1 }, { Whence, 2 } } }, { .name = "lstat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat | OUT, 1 } } }, { .name = "lutimes", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Timeval2 | IN, 1 } } }, { .name = "madvise", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Madvice, 2 } } }, { .name = "minherit", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Minherit, 2 } } }, { .name = "mkdir", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "mkdirat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 } } }, { .name = "mkfifo", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "mkfifoat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 } } }, { .name = "mknod", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Octal, 1 }, { Int, 2 } } }, { .name = "mknodat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 }, { Int, 3 } } }, { .name = "mlock", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Sizet, 1 } } }, { .name = "mlockall", .ret_type = 1, .nargs = 1, .args = { { Mlockall, 0 } } }, { .name = "mmap", 
.ret_type = 1, .nargs = 6, .args = { { Ptr, 0 }, { Sizet, 1 }, { Mprot, 2 }, { Mmapflags, 3 }, { Int, 4 }, { QuadHex, 5 } } }, { .name = "modfind", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "mount", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Name, 1 }, { Mountflags, 2 }, { Ptr, 3 } } }, { .name = "mprotect", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Mprot, 2 } } }, { .name = "msync", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Msync, 2 } } }, { .name = "munlock", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Sizet, 1 } } }, { .name = "munmap", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Sizet, 1 } } }, { .name = "nanosleep", .ret_type = 1, .nargs = 1, .args = { { Timespec, 0 } } }, { .name = "nmount", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { UInt, 1 }, { Mountflags, 2 } } }, { .name = "open", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { Open, 1 }, { Octal, 2 } } }, { .name = "openat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Open, 2 }, { Octal, 3 } } }, { .name = "pathconf", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Pathconf, 1 } } }, { .name = "pipe", .ret_type = 1, .nargs = 1, .args = { { PipeFds | OUT, 0 } } }, { .name = "pipe2", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Pipe2, 1 } } }, { .name = "poll", .ret_type = 1, .nargs = 3, .args = { { Pollfd, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "posix_fadvise", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { QuadHex, 1 }, { QuadHex, 2 }, { Fadvice, 3 } } }, { .name = "posix_openpt", .ret_type = 1, .nargs = 1, .args = { { Open, 0 } } }, { .name = "pread", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 }, { QuadHex, 3 } } }, { .name = "procctl", .ret_type = 1, .nargs = 4, .args = { { Idtype, 0 }, { Quad, 1 }, { Procctl, 2 }, { Ptr, 3 } } }, { .name = "ptrace", .ret_type = 1, .nargs = 4, .args = { { Ptraceop, 0 }, { Int, 1 }, { Ptr, 2 }, { Int, 3 } } }, { .name = "pwrite", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 }, { QuadHex, 3 } } }, { .name = "quotactl", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Quotactlcmd, 1 }, { Int, 2 }, { Ptr, 3 } } }, { .name = "read", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 } } }, { .name = "readlink", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Readlinkres | OUT, 1 }, { Sizet, 2 } } }, { .name = "readlinkat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Readlinkres | OUT, 2 }, { Sizet, 3 } } }, { .name = "readv", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Iovec | OUT, 1 }, { Int, 2 } } }, { .name = "reboot", .ret_type = 1, .nargs = 1, .args = { { Reboothowto, 0 } } }, { .name = "recvfrom", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 }, { Msgflags, 3 }, { Sockaddr | OUT, 4 }, { Ptr | OUT, 5 } } }, { .name = "recvmsg", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Msghdr | OUT, 1 }, { Msgflags, 2 } } }, { .name = "rename", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Name, 1 } } }, { .name = "renameat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Atfd, 2 }, { Name, 3 } } }, { .name = "rfork", .ret_type = 1, .nargs = 1, .args = { { Rforkflags, 0 } } }, { .name = "rmdir", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "rtprio", .ret_type = 1, .nargs = 3, .args = { { Rtpriofunc, 0 }, { Int, 1 }, { 
Ptr, 2 } } }, { .name = "rtprio_thread", .ret_type = 1, .nargs = 3, .args = { { Rtpriofunc, 0 }, { Int, 1 }, { Ptr, 2 } } }, { .name = "sched_get_priority_max", .ret_type = 1, .nargs = 1, .args = { { Schedpolicy, 0 } } }, { .name = "sched_get_priority_min", .ret_type = 1, .nargs = 1, .args = { { Schedpolicy, 0 } } }, { .name = "sched_getparam", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Schedparam | OUT, 1 } } }, { .name = "sched_getscheduler", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "sched_rr_get_interval", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timespec | OUT, 1 } } }, { .name = "sched_setparam", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Schedparam, 1 } } }, { .name = "sched_setscheduler", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Schedpolicy, 1 }, { Schedparam, 2 } } }, { .name = "sctp_generic_recvmsg", .ret_type = 1, .nargs = 7, .args = { { Int, 0 }, { Iovec | OUT, 1 }, { Int, 2 }, { Sockaddr | OUT, 3 }, { Ptr | OUT, 4 }, { Sctpsndrcvinfo | OUT, 5 }, { Ptr | OUT, 6 } } }, { .name = "sctp_generic_sendmsg", .ret_type = 1, .nargs = 7, .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 2 }, { Sockaddr | IN, 3 }, { Socklent, 4 }, { Sctpsndrcvinfo | IN, 5 }, { Msgflags, 6 } } }, { .name = "sctp_generic_sendmsg_iov", .ret_type = 1, .nargs = 7, .args = { { Int, 0 }, { Iovec | IN, 1 }, { Int, 2 }, { Sockaddr | IN, 3 }, { Socklent, 4 }, { Sctpsndrcvinfo | IN, 5 }, { Msgflags, 6 } } }, { .name = "select", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Fd_set, 1 }, { Fd_set, 2 }, { Fd_set, 3 }, { Timeval, 4 } } }, { .name = "sendmsg", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Msghdr | IN, 1 }, { Msgflags, 2 } } }, { .name = "sendto", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 }, { Msgflags, 3 }, { Sockaddr | IN, 4 }, { Socklent | IN, 5 } } }, { .name = "setitimer", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Itimerval, 1 }, { Itimerval | OUT, 2 } } }, { .name = "setpriority", .ret_type = 1, .nargs = 3, .args = { { Priowhich, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "setrlimit", .ret_type = 1, .nargs = 2, .args = { { Resource, 0 }, { Rlimit | IN, 1 } } }, { .name = "setsockopt", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Sockoptlevel, 1 }, { Sockoptname, 2 }, { Ptr | IN, 3 }, { Socklent, 4 } } }, { .name = "shm_open", .ret_type = 1, .nargs = 3, .args = { { ShmName | IN, 0 }, { Open, 1 }, { Octal, 2 } } }, { .name = "shm_unlink", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "shutdown", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Shutdown, 1 } } }, { .name = "sigaction", .ret_type = 1, .nargs = 3, .args = { { Signal, 0 }, { Sigaction | IN, 1 }, { Sigaction | OUT, 2 } } }, { .name = "sigpending", .ret_type = 1, .nargs = 1, .args = { { Sigset | OUT, 0 } } }, { .name = "sigprocmask", .ret_type = 1, .nargs = 3, .args = { { Sigprocmask, 0 }, { Sigset, 1 }, { Sigset | OUT, 2 } } }, { .name = "sigqueue", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Signal, 1 }, { LongHex, 2 } } }, { .name = "sigreturn", .ret_type = 1, .nargs = 1, .args = { { Ptr, 0 } } }, { .name = "sigsuspend", .ret_type = 1, .nargs = 1, .args = { { Sigset | IN, 0 } } }, { .name = "sigtimedwait", .ret_type = 1, .nargs = 3, .args = { { Sigset | IN, 0 }, { Siginfo | OUT, 1 }, { Timespec | IN, 2 } } }, { .name = "sigwait", .ret_type = 1, .nargs = 2, .args = { { Sigset | IN, 0 }, { PSig | OUT, 1 } } }, { .name = "sigwaitinfo", .ret_type = 1, .nargs = 2, .args = { { Sigset | IN, 
0 }, { Siginfo | OUT, 1 } } }, { .name = "socket", .ret_type = 1, .nargs = 3, .args = { { Sockdomain, 0 }, { Socktype, 1 }, { Sockprotocol, 2 } } }, { .name = "stat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat | OUT, 1 } } }, { .name = "statfs", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { StatFs | OUT, 1 } } }, { .name = "symlink", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Name, 1 } } }, { .name = "symlinkat", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Atfd, 1 }, { Name, 2 } } }, { .name = "sysarch", .ret_type = 1, .nargs = 2, .args = { { Sysarch, 0 }, { Ptr, 1 } } }, + { .name = "__sysctl", .ret_type = 1, .nargs = 6, + .args = { { Sysctl, 0 }, { Sizet, 1 }, { Ptr, 2 }, { Ptr, 3 }, + { Ptr, 4 }, { Sizet, 5 } } }, + { .name = "__sysctlbyname", .ret_type = 1, .nargs = 6, + .args = { { Name, 0 }, { Sizet, 1 }, { Ptr, 2 }, { Ptr, 3 }, + { Ptr, 4}, { Sizet, 5 } } }, { .name = "thr_kill", .ret_type = 1, .nargs = 2, .args = { { Long, 0 }, { Signal, 1 } } }, { .name = "thr_self", .ret_type = 1, .nargs = 1, .args = { { Ptr, 0 } } }, { .name = "thr_set_name", .ret_type = 1, .nargs = 2, .args = { { Long, 0 }, { Name, 1 } } }, { .name = "truncate", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { QuadHex | IN, 1 } } }, #if 0 /* Does not exist */ { .name = "umount", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Int, 2 } } }, #endif { .name = "unlink", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "unlinkat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name, 1 }, { Atflags, 2 } } }, { .name = "unmount", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Mountflags, 1 } } }, { .name = "utimensat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Timespec2 | IN, 2 }, { Atflags, 3 } } }, { .name = "utimes", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Timeval2 | IN, 1 } } }, { .name = "utrace", .ret_type = 1, .nargs = 1, .args = { { Utrace, 0 } } }, { .name = "wait4", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { ExitStatus | OUT, 1 }, { Waitoptions, 2 }, { Rusage | OUT, 3 } } }, { .name = "wait6", .ret_type = 1, .nargs = 6, .args = { { Idtype, 0 }, { Quad, 1 }, { ExitStatus | OUT, 2 }, { Waitoptions, 3 }, { Rusage | OUT, 4 }, { Siginfo | OUT, 5 } } }, { .name = "write", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 } } }, { .name = "writev", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Iovec | IN, 1 }, { Int, 2 } } }, /* Linux ABI */ { .name = "linux_access", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Accessmode, 1 } } }, { .name = "linux_execve", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { ExecArgs | IN, 1 }, { ExecEnv | IN, 2 } } }, { .name = "linux_lseek", .ret_type = 2, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { Whence, 2 } } }, { .name = "linux_mkdir", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Int, 1 } } }, { .name = "linux_newfstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Ptr | OUT, 1 } } }, { .name = "linux_newstat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Ptr | OUT, 1 } } }, { .name = "linux_open", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Hex, 1 }, { Octal, 2 } } }, { .name = "linux_readlink", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Name | OUT, 1 }, { Sizet, 2 } } }, { .name = "linux_socketcall", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { LinuxSockArgs, 1 } } }, { .name = "linux_stat64", .ret_type = 1, .nargs = 2, .args = { { 
Name | IN, 0 }, { Ptr | OUT, 1 } } }, /* CloudABI system calls. */ { .name = "cloudabi_sys_clock_res_get", .ret_type = 1, .nargs = 1, .args = { { CloudABIClockID, 0 } } }, { .name = "cloudabi_sys_clock_time_get", .ret_type = 1, .nargs = 2, .args = { { CloudABIClockID, 0 }, { CloudABITimestamp, 1 } } }, { .name = "cloudabi_sys_condvar_signal", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { CloudABIMFlags, 1 }, { UInt, 2 } } }, { .name = "cloudabi_sys_fd_close", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_fd_create1", .ret_type = 1, .nargs = 1, .args = { { CloudABIFileType, 0 } } }, { .name = "cloudabi_sys_fd_create2", .ret_type = 1, .nargs = 2, .args = { { CloudABIFileType, 0 }, { PipeFds | OUT, 0 } } }, { .name = "cloudabi_sys_fd_datasync", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_fd_dup", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_fd_replace", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Int, 1 } } }, { .name = "cloudabi_sys_fd_seek", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { CloudABIWhence, 2 } } }, { .name = "cloudabi_sys_fd_stat_get", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CloudABIFDStat | OUT, 1 } } }, { .name = "cloudabi_sys_fd_stat_put", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { CloudABIFDStat | IN, 1 }, { CloudABIFDSFlags, 2 } } }, { .name = "cloudabi_sys_fd_sync", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_file_advise", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { Int, 1 }, { Int, 2 }, { CloudABIAdvice, 3 } } }, { .name = "cloudabi_sys_file_allocate", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "cloudabi_sys_file_create", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | IN, 1 }, { CloudABIFileType, 3 } } }, { .name = "cloudabi_sys_file_link", .ret_type = 1, .nargs = 4, .args = { { CloudABILookup, 0 }, { BinString | IN, 1 }, { Int, 3 }, { BinString | IN, 4 } } }, { .name = "cloudabi_sys_file_open", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { CloudABIOFlags, 3 }, { CloudABIFDStat | IN, 4 } } }, { .name = "cloudabi_sys_file_readdir", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Int, 2 }, { Int, 3 } } }, { .name = "cloudabi_sys_file_readlink", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { BinString | OUT, 3 }, { Int, 4 } } }, { .name = "cloudabi_sys_file_rename", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 3 }, { BinString | IN, 4 } } }, { .name = "cloudabi_sys_file_stat_fget", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CloudABIFileStat | OUT, 1 } } }, { .name = "cloudabi_sys_file_stat_fput", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { CloudABIFileStat | IN, 1 }, { CloudABIFSFlags, 2 } } }, { .name = "cloudabi_sys_file_stat_get", .ret_type = 1, .nargs = 3, .args = { { CloudABILookup, 0 }, { BinString | IN, 1 }, { CloudABIFileStat | OUT, 3 } } }, { .name = "cloudabi_sys_file_stat_put", .ret_type = 1, .nargs = 4, .args = { { CloudABILookup, 0 }, { BinString | IN, 1 }, { CloudABIFileStat | IN, 3 }, { CloudABIFSFlags, 4 } } }, { .name = "cloudabi_sys_file_symlink", .ret_type = 1, .nargs = 3, .args = { { BinString | IN, 0 }, { Int, 2 }, { BinString | IN, 3 } } }, { .name = "cloudabi_sys_file_unlink", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | IN, 1 }, { CloudABIULFlags, 3 } } }, { .name 
= "cloudabi_sys_lock_unlock", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { CloudABIMFlags, 1 } } }, { .name = "cloudabi_sys_mem_advise", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIAdvice, 2 } } }, { .name = "cloudabi_sys_mem_map", .ret_type = 1, .nargs = 6, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMProt, 2 }, { CloudABIMFlags, 3 }, { Int, 4 }, { Int, 5 } } }, { .name = "cloudabi_sys_mem_protect", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMProt, 2 } } }, { .name = "cloudabi_sys_mem_sync", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMSFlags, 2 } } }, { .name = "cloudabi_sys_mem_unmap", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Int, 1 } } }, { .name = "cloudabi_sys_proc_exec", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 2 }, { IntArray, 3 }, { Int, 4 } } }, { .name = "cloudabi_sys_proc_exit", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_proc_fork", .ret_type = 1, .nargs = 0 }, { .name = "cloudabi_sys_proc_raise", .ret_type = 1, .nargs = 1, .args = { { CloudABISignal, 0 } } }, { .name = "cloudabi_sys_random_get", .ret_type = 1, .nargs = 2, .args = { { BinString | OUT, 0 }, { Int, 1 } } }, { .name = "cloudabi_sys_sock_shutdown", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CloudABISDFlags, 1 } } }, { .name = "cloudabi_sys_thread_exit", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { CloudABIMFlags, 1 } } }, { .name = "cloudabi_sys_thread_yield", .ret_type = 1, .nargs = 0 }, { .name = 0 }, }; static STAILQ_HEAD(, syscall) syscalls; /* Xlat idea taken from strace */ struct xlat { int val; const char *str; }; #define X(a) { a, #a }, #define XEND { 0, NULL } static struct xlat poll_flags[] = { X(POLLSTANDARD) X(POLLIN) X(POLLPRI) X(POLLOUT) X(POLLERR) X(POLLHUP) X(POLLNVAL) X(POLLRDNORM) X(POLLRDBAND) X(POLLWRBAND) X(POLLINIGNEOF) XEND }; static struct xlat sigaction_flags[] = { X(SA_ONSTACK) X(SA_RESTART) X(SA_RESETHAND) X(SA_NOCLDSTOP) X(SA_NODEFER) X(SA_NOCLDWAIT) X(SA_SIGINFO) XEND }; static struct xlat linux_socketcall_ops[] = { X(LINUX_SOCKET) X(LINUX_BIND) X(LINUX_CONNECT) X(LINUX_LISTEN) X(LINUX_ACCEPT) X(LINUX_GETSOCKNAME) X(LINUX_GETPEERNAME) X(LINUX_SOCKETPAIR) X(LINUX_SEND) X(LINUX_RECV) X(LINUX_SENDTO) X(LINUX_RECVFROM) X(LINUX_SHUTDOWN) X(LINUX_SETSOCKOPT) X(LINUX_GETSOCKOPT) X(LINUX_SENDMSG) X(LINUX_RECVMSG) XEND }; #undef X #define X(a) { CLOUDABI_##a, #a }, static struct xlat cloudabi_advice[] = { X(ADVICE_DONTNEED) X(ADVICE_NOREUSE) X(ADVICE_NORMAL) X(ADVICE_RANDOM) X(ADVICE_SEQUENTIAL) X(ADVICE_WILLNEED) XEND }; static struct xlat cloudabi_clockid[] = { X(CLOCK_MONOTONIC) X(CLOCK_PROCESS_CPUTIME_ID) X(CLOCK_REALTIME) X(CLOCK_THREAD_CPUTIME_ID) XEND }; static struct xlat cloudabi_fdflags[] = { X(FDFLAG_APPEND) X(FDFLAG_DSYNC) X(FDFLAG_NONBLOCK) X(FDFLAG_RSYNC) X(FDFLAG_SYNC) XEND }; static struct xlat cloudabi_fdsflags[] = { X(FDSTAT_FLAGS) X(FDSTAT_RIGHTS) XEND }; static struct xlat cloudabi_filetype[] = { X(FILETYPE_UNKNOWN) X(FILETYPE_BLOCK_DEVICE) X(FILETYPE_CHARACTER_DEVICE) X(FILETYPE_DIRECTORY) X(FILETYPE_PROCESS) X(FILETYPE_REGULAR_FILE) X(FILETYPE_SHARED_MEMORY) X(FILETYPE_SOCKET_DGRAM) X(FILETYPE_SOCKET_STREAM) X(FILETYPE_SYMBOLIC_LINK) XEND }; static struct xlat cloudabi_fsflags[] = { X(FILESTAT_ATIM) X(FILESTAT_ATIM_NOW) X(FILESTAT_MTIM) X(FILESTAT_MTIM_NOW) X(FILESTAT_SIZE) XEND }; static struct xlat cloudabi_mflags[] = { X(MAP_ANON) X(MAP_FIXED) X(MAP_PRIVATE) X(MAP_SHARED) XEND }; static 
struct xlat cloudabi_mprot[] = { X(PROT_EXEC) X(PROT_WRITE) X(PROT_READ) XEND }; static struct xlat cloudabi_msflags[] = { X(MS_ASYNC) X(MS_INVALIDATE) X(MS_SYNC) XEND }; static struct xlat cloudabi_oflags[] = { X(O_CREAT) X(O_DIRECTORY) X(O_EXCL) X(O_TRUNC) XEND }; static struct xlat cloudabi_sdflags[] = { X(SHUT_RD) X(SHUT_WR) XEND }; static struct xlat cloudabi_signal[] = { X(SIGABRT) X(SIGALRM) X(SIGBUS) X(SIGCHLD) X(SIGCONT) X(SIGFPE) X(SIGHUP) X(SIGILL) X(SIGINT) X(SIGKILL) X(SIGPIPE) X(SIGQUIT) X(SIGSEGV) X(SIGSTOP) X(SIGSYS) X(SIGTERM) X(SIGTRAP) X(SIGTSTP) X(SIGTTIN) X(SIGTTOU) X(SIGURG) X(SIGUSR1) X(SIGUSR2) X(SIGVTALRM) X(SIGXCPU) X(SIGXFSZ) XEND }; static struct xlat cloudabi_ulflags[] = { X(UNLINK_REMOVEDIR) XEND }; static struct xlat cloudabi_whence[] = { X(WHENCE_CUR) X(WHENCE_END) X(WHENCE_SET) XEND }; #undef X #undef XEND /* * Searches an xlat array for a value, and returns it if found. Otherwise * return a string representation. */ static const char * lookup(struct xlat *xlat, int val, int base) { static char tmp[16]; for (; xlat->str != NULL; xlat++) if (xlat->val == val) return (xlat->str); switch (base) { case 8: sprintf(tmp, "0%o", val); break; case 16: sprintf(tmp, "0x%x", val); break; case 10: sprintf(tmp, "%u", val); break; default: errx(1,"Unknown lookup base"); break; } return (tmp); } static const char * xlookup(struct xlat *xlat, int val) { return (lookup(xlat, val, 16)); } /* * Searches an xlat array containing bitfield values. Remaining bits * set after removing the known ones are printed at the end: * IN|0x400. */ static char * xlookup_bits(struct xlat *xlat, int val) { int len, rem; static char str[512]; len = 0; rem = val; for (; xlat->str != NULL; xlat++) { if ((xlat->val & rem) == xlat->val) { /* * Don't print the "all-bits-zero" string unless all * bits are really zero. */ if (xlat->val == 0 && val != 0) continue; len += sprintf(str + len, "%s|", xlat->str); rem &= ~(xlat->val); } } /* * If we have leftover bits or didn't match anything, print * the remainder. */ if (rem || len == 0) len += sprintf(str + len, "0x%x", rem); if (len && str[len - 1] == '|') len--; str[len] = 0; return (str); } static void print_integer_arg(const char *(*decoder)(int), FILE *fp, int value) { const char *str; str = decoder(value); if (str != NULL) fputs(str, fp); else fprintf(fp, "%d", value); } static void print_mask_arg(bool (*decoder)(FILE *, int, int *), FILE *fp, int value) { int rem; if (!decoder(fp, value, &rem)) fprintf(fp, "0x%x", rem); else if (rem != 0) fprintf(fp, "|0x%x", rem); } static void print_mask_arg32(bool (*decoder)(FILE *, uint32_t, uint32_t *), FILE *fp, uint32_t value) { uint32_t rem; if (!decoder(fp, value, &rem)) fprintf(fp, "0x%x", rem); else if (rem != 0) fprintf(fp, "|0x%x", rem); } #ifndef __LP64__ /* * Add argument padding to subsequent system calls afater a Quad * syscall arguments as needed. This used to be done by hand in the * decoded_syscalls table which was ugly and error prone. It is * simpler to do the fixup of offsets at initalization time than when * decoding arguments. */ static void quad_fixup(struct syscall *sc) { int offset, prev; u_int i; offset = 0; prev = -1; for (i = 0; i < sc->nargs; i++) { /* This arg type is a dummy that doesn't use offset. 
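		 * PipeFds is filled in from the syscall's return values
		 * rather than from an argument register, so its offset
		 * field is not meaningful and is skipped before the
		 * ordering assertion below.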
*/ if ((sc->args[i].type & ARG_MASK) == PipeFds) continue; assert(prev < sc->args[i].offset); prev = sc->args[i].offset; sc->args[i].offset += offset; switch (sc->args[i].type & ARG_MASK) { case Quad: case QuadHex: #ifdef __powerpc__ /* * 64-bit arguments on 32-bit powerpc must be * 64-bit aligned. If the current offset is * not aligned, the calling convention inserts * a 32-bit pad argument that should be skipped. */ if (sc->args[i].offset % 2 == 1) { sc->args[i].offset++; offset++; } #endif offset++; default: break; } } } #endif void init_syscalls(void) { struct syscall *sc; STAILQ_INIT(&syscalls); for (sc = decoded_syscalls; sc->name != NULL; sc++) { #ifndef __LP64__ quad_fixup(sc); #endif STAILQ_INSERT_HEAD(&syscalls, sc, entries); } } static struct syscall * find_syscall(struct procabi *abi, u_int number) { struct extra_syscall *es; if (number < nitems(abi->syscalls)) return (abi->syscalls[number]); STAILQ_FOREACH(es, &abi->extra_syscalls, entries) { if (es->number == number) return (es->sc); } return (NULL); } static void add_syscall(struct procabi *abi, u_int number, struct syscall *sc) { struct extra_syscall *es; if (number < nitems(abi->syscalls)) { assert(abi->syscalls[number] == NULL); abi->syscalls[number] = sc; } else { es = malloc(sizeof(*es)); es->sc = sc; es->number = number; STAILQ_INSERT_TAIL(&abi->extra_syscalls, es, entries); } } /* * If/when the list gets big, it might be desirable to do it * as a hash table or binary search. */ struct syscall * get_syscall(struct threadinfo *t, u_int number, u_int nargs) { struct syscall *sc; const char *name; char *new_name; u_int i; sc = find_syscall(t->proc->abi, number); if (sc != NULL) return (sc); name = sysdecode_syscallname(t->proc->abi->abi, number); if (name == NULL) { asprintf(&new_name, "#%d", number); name = new_name; } else new_name = NULL; STAILQ_FOREACH(sc, &syscalls, entries) { if (strcmp(name, sc->name) == 0) { add_syscall(t->proc->abi, number, sc); free(new_name); return (sc); } } /* It is unknown. Add it into the list. */ #if DEBUG fprintf(stderr, "unknown syscall %s -- setting args to %d\n", name, nargs); #endif sc = calloc(1, sizeof(struct syscall)); sc->name = name; if (new_name != NULL) sc->unknown = true; sc->ret_type = 1; sc->nargs = nargs; for (i = 0; i < nargs; i++) { sc->args[i].offset = i; /* Treat all unknown arguments as LongHex. */ sc->args[i].type = LongHex; } STAILQ_INSERT_HEAD(&syscalls, sc, entries); add_syscall(t->proc->abi, number, sc); return (sc); } /* * Copy a fixed amount of bytes from the process. */ static int get_struct(pid_t pid, void *offset, void *buf, int len) { struct ptrace_io_desc iorequest; iorequest.piod_op = PIOD_READ_D; iorequest.piod_offs = offset; iorequest.piod_addr = buf; iorequest.piod_len = len; if (ptrace(PT_IO, pid, (caddr_t)&iorequest, 0) < 0) return (-1); return (0); } #define MAXSIZE 4096 /* * Copy a string from the process. Note that it is * expected to be a C string, but if max is set, it will * only get that much. */ static char * get_string(pid_t pid, void *addr, int max) { struct ptrace_io_desc iorequest; char *buf, *nbuf; size_t offset, size, totalsize; offset = 0; if (max) size = max + 1; else { /* Read up to the end of the current page. 
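The string reader get_string() sizes its first PT_IO request so that it stops at the next page boundary of the traced process, so the initial read never touches the following (possibly unmapped) page, and then grows the buffer in page-sized steps up to MAXSIZE. A standalone sketch of just that size computation, with page size and cap assumed here rather than taken from the system headers:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE_DEMO  4096    /* assumed 4 KiB pages; truss uses PAGE_SIZE */
    #define MAXSIZE_DEMO    4096    /* same role as the MAXSIZE cap above */

    /* First read: stop at the end of the page containing 'addr'. */
    static size_t
    first_read_size(uintptr_t addr)
    {
            size_t size;

            size = PAGE_SIZE_DEMO - (addr % PAGE_SIZE_DEMO);
            if (size > MAXSIZE_DEMO)
                    size = MAXSIZE_DEMO;
            return (size);
    }

    int
    main(void)
    {
            /* 100 bytes left on this page, so only 100 bytes are read first. */
            printf("%zu\n", first_read_size(0x1000f9cUL));  /* prints 100 */
            /* A page-aligned address gets a full page (capped at MAXSIZE). */
            printf("%zu\n", first_read_size(0x1001000UL));  /* prints 4096 */
            return (0);
    }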
*/ size = PAGE_SIZE - ((uintptr_t)addr % PAGE_SIZE); if (size > MAXSIZE) size = MAXSIZE; } totalsize = size; buf = malloc(totalsize); if (buf == NULL) return (NULL); for (;;) { iorequest.piod_op = PIOD_READ_D; iorequest.piod_offs = (char *)addr + offset; iorequest.piod_addr = buf + offset; iorequest.piod_len = size; if (ptrace(PT_IO, pid, (caddr_t)&iorequest, 0) < 0) { free(buf); return (NULL); } if (memchr(buf + offset, '\0', size) != NULL) return (buf); offset += size; if (totalsize < MAXSIZE && max == 0) { size = MAXSIZE - totalsize; if (size > PAGE_SIZE) size = PAGE_SIZE; nbuf = realloc(buf, totalsize + size); if (nbuf == NULL) { buf[totalsize - 1] = '\0'; return (buf); } buf = nbuf; totalsize += size; } else { buf[totalsize - 1] = '\0'; return (buf); } } } static const char * strsig2(int sig) { static char tmp[32]; const char *signame; signame = sysdecode_signal(sig); if (signame == NULL) { snprintf(tmp, sizeof(tmp), "%d", sig); signame = tmp; } return (signame); } static void print_kevent(FILE *fp, struct kevent *ke) { switch (ke->filter) { case EVFILT_READ: case EVFILT_WRITE: case EVFILT_VNODE: case EVFILT_PROC: case EVFILT_TIMER: case EVFILT_PROCDESC: case EVFILT_EMPTY: fprintf(fp, "%ju", (uintmax_t)ke->ident); break; case EVFILT_SIGNAL: fputs(strsig2(ke->ident), fp); break; default: fprintf(fp, "%p", (void *)ke->ident); } fprintf(fp, ","); print_integer_arg(sysdecode_kevent_filter, fp, ke->filter); fprintf(fp, ","); print_mask_arg(sysdecode_kevent_flags, fp, ke->flags); fprintf(fp, ","); sysdecode_kevent_fflags(fp, ke->filter, ke->fflags, 16); fprintf(fp, ",%#jx,%p", (uintmax_t)ke->data, ke->udata); } static void print_utrace(FILE *fp, void *utrace_addr, size_t len) { unsigned char *utrace_buffer; fprintf(fp, "{ "); if (sysdecode_utrace(fp, utrace_addr, len)) { fprintf(fp, " }"); return; } utrace_buffer = utrace_addr; fprintf(fp, "%zu:", len); while (len--) fprintf(fp, " %02x", *utrace_buffer++); fprintf(fp, " }"); } static void print_sockaddr(FILE *fp, struct trussinfo *trussinfo, void *arg, socklen_t len) { char addr[64]; struct sockaddr_in *lsin; struct sockaddr_in6 *lsin6; struct sockaddr_un *sun; struct sockaddr *sa; u_char *q; pid_t pid = trussinfo->curthread->proc->pid; if (arg == NULL) { fputs("NULL", fp); return; } /* If the length is too small, just bail. */ if (len < sizeof(*sa)) { fprintf(fp, "%p", arg); return; } sa = calloc(1, len); if (get_struct(pid, arg, sa, len) == -1) { free(sa); fprintf(fp, "%p", arg); return; } switch (sa->sa_family) { case AF_INET: if (len < sizeof(*lsin)) goto sockaddr_short; lsin = (struct sockaddr_in *)(void *)sa; inet_ntop(AF_INET, &lsin->sin_addr, addr, sizeof(addr)); fprintf(fp, "{ AF_INET %s:%d }", addr, htons(lsin->sin_port)); break; case AF_INET6: if (len < sizeof(*lsin6)) goto sockaddr_short; lsin6 = (struct sockaddr_in6 *)(void *)sa; inet_ntop(AF_INET6, &lsin6->sin6_addr, addr, sizeof(addr)); fprintf(fp, "{ AF_INET6 [%s]:%d }", addr, htons(lsin6->sin6_port)); break; case AF_UNIX: sun = (struct sockaddr_un *)sa; fprintf(fp, "{ AF_UNIX \"%.*s\" }", (int)(len - offsetof(struct sockaddr_un, sun_path)), sun->sun_path); break; default: sockaddr_short: fprintf(fp, "{ sa_len = %d, sa_family = %d, sa_data = {", (int)sa->sa_len, (int)sa->sa_family); for (q = (u_char *)sa->sa_data; q < (u_char *)sa + len; q++) fprintf(fp, "%s 0x%02x", q == (u_char *)sa->sa_data ? 
"" : ",", *q); fputs(" } }", fp); } free(sa); } #define IOV_LIMIT 16 static void print_iovec(FILE *fp, struct trussinfo *trussinfo, void *arg, int iovcnt) { struct iovec iov[IOV_LIMIT]; size_t max_string = trussinfo->strsize; char tmp2[max_string + 1], *tmp3; size_t len; pid_t pid = trussinfo->curthread->proc->pid; int i; bool buf_truncated, iov_truncated; if (iovcnt <= 0) { fprintf(fp, "%p", arg); return; } if (iovcnt > IOV_LIMIT) { iovcnt = IOV_LIMIT; iov_truncated = true; } else { iov_truncated = false; } if (get_struct(pid, arg, &iov, iovcnt * sizeof(struct iovec)) == -1) { fprintf(fp, "%p", arg); return; } fputs("[", fp); for (i = 0; i < iovcnt; i++) { len = iov[i].iov_len; if (len > max_string) { len = max_string; buf_truncated = true; } else { buf_truncated = false; } fprintf(fp, "%s{", (i > 0) ? "," : ""); if (len && get_struct(pid, iov[i].iov_base, &tmp2, len) != -1) { tmp3 = malloc(len * 4 + 1); while (len) { if (strvisx(tmp3, tmp2, len, VIS_CSTYLE|VIS_TAB|VIS_NL) <= (int)max_string) break; len--; buf_truncated = true; } fprintf(fp, "\"%s\"%s", tmp3, buf_truncated ? "..." : ""); free(tmp3); } else { fprintf(fp, "%p", iov[i].iov_base); } fprintf(fp, ",%zu}", iov[i].iov_len); } fprintf(fp, "%s%s", iov_truncated ? ",..." : "", "]"); } static void print_gen_cmsg(FILE *fp, struct cmsghdr *cmsghdr) { u_char *q; fputs("{", fp); for (q = CMSG_DATA(cmsghdr); q < (u_char *)cmsghdr + cmsghdr->cmsg_len; q++) { fprintf(fp, "%s0x%02x", q == CMSG_DATA(cmsghdr) ? "" : ",", *q); } fputs("}", fp); } static void print_sctp_initmsg(FILE *fp, struct sctp_initmsg *init) { fprintf(fp, "{out=%u,", init->sinit_num_ostreams); fprintf(fp, "in=%u,", init->sinit_max_instreams); fprintf(fp, "max_rtx=%u,", init->sinit_max_attempts); fprintf(fp, "max_rto=%u}", init->sinit_max_init_timeo); } static void print_sctp_sndrcvinfo(FILE *fp, bool receive, struct sctp_sndrcvinfo *info) { fprintf(fp, "{sid=%u,", info->sinfo_stream); if (receive) { fprintf(fp, "ssn=%u,", info->sinfo_ssn); } fputs("flgs=", fp); sysdecode_sctp_sinfo_flags(fp, info->sinfo_flags); fprintf(fp, ",ppid=%u,", ntohl(info->sinfo_ppid)); if (!receive) { fprintf(fp, "ctx=%u,", info->sinfo_context); fprintf(fp, "ttl=%u,", info->sinfo_timetolive); } if (receive) { fprintf(fp, "tsn=%u,", info->sinfo_tsn); fprintf(fp, "cumtsn=%u,", info->sinfo_cumtsn); } fprintf(fp, "id=%u}", info->sinfo_assoc_id); } static void print_sctp_sndinfo(FILE *fp, struct sctp_sndinfo *info) { fprintf(fp, "{sid=%u,", info->snd_sid); fputs("flgs=", fp); print_mask_arg(sysdecode_sctp_snd_flags, fp, info->snd_flags); fprintf(fp, ",ppid=%u,", ntohl(info->snd_ppid)); fprintf(fp, "ctx=%u,", info->snd_context); fprintf(fp, "id=%u}", info->snd_assoc_id); } static void print_sctp_rcvinfo(FILE *fp, struct sctp_rcvinfo *info) { fprintf(fp, "{sid=%u,", info->rcv_sid); fprintf(fp, "ssn=%u,", info->rcv_ssn); fputs("flgs=", fp); print_mask_arg(sysdecode_sctp_rcv_flags, fp, info->rcv_flags); fprintf(fp, ",ppid=%u,", ntohl(info->rcv_ppid)); fprintf(fp, "tsn=%u,", info->rcv_tsn); fprintf(fp, "cumtsn=%u,", info->rcv_cumtsn); fprintf(fp, "ctx=%u,", info->rcv_context); fprintf(fp, "id=%u}", info->rcv_assoc_id); } static void print_sctp_nxtinfo(FILE *fp, struct sctp_nxtinfo *info) { fprintf(fp, "{sid=%u,", info->nxt_sid); fputs("flgs=", fp); print_mask_arg(sysdecode_sctp_nxt_flags, fp, info->nxt_flags); fprintf(fp, ",ppid=%u,", ntohl(info->nxt_ppid)); fprintf(fp, "len=%u,", info->nxt_length); fprintf(fp, "id=%u}", info->nxt_assoc_id); } static void print_sctp_prinfo(FILE *fp, struct sctp_prinfo 
*info) { fputs("{pol=", fp); print_integer_arg(sysdecode_sctp_pr_policy, fp, info->pr_policy); fprintf(fp, ",val=%u}", info->pr_value); } static void print_sctp_authinfo(FILE *fp, struct sctp_authinfo *info) { fprintf(fp, "{num=%u}", info->auth_keynumber); } static void print_sctp_ipv4_addr(FILE *fp, struct in_addr *addr) { char buf[INET_ADDRSTRLEN]; const char *s; s = inet_ntop(AF_INET, addr, buf, INET_ADDRSTRLEN); if (s != NULL) fprintf(fp, "{addr=%s}", s); else fputs("{addr=???}", fp); } static void print_sctp_ipv6_addr(FILE *fp, struct in6_addr *addr) { char buf[INET6_ADDRSTRLEN]; const char *s; s = inet_ntop(AF_INET6, addr, buf, INET6_ADDRSTRLEN); if (s != NULL) fprintf(fp, "{addr=%s}", s); else fputs("{addr=???}", fp); } static void print_sctp_cmsg(FILE *fp, bool receive, struct cmsghdr *cmsghdr) { void *data; socklen_t len; len = cmsghdr->cmsg_len; data = CMSG_DATA(cmsghdr); switch (cmsghdr->cmsg_type) { case SCTP_INIT: if (len == CMSG_LEN(sizeof(struct sctp_initmsg))) print_sctp_initmsg(fp, (struct sctp_initmsg *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_SNDRCV: if (len == CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) print_sctp_sndrcvinfo(fp, receive, (struct sctp_sndrcvinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; #if 0 case SCTP_EXTRCV: if (len == CMSG_LEN(sizeof(struct sctp_extrcvinfo))) print_sctp_extrcvinfo(fp, (struct sctp_extrcvinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; #endif case SCTP_SNDINFO: if (len == CMSG_LEN(sizeof(struct sctp_sndinfo))) print_sctp_sndinfo(fp, (struct sctp_sndinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_RCVINFO: if (len == CMSG_LEN(sizeof(struct sctp_rcvinfo))) print_sctp_rcvinfo(fp, (struct sctp_rcvinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_NXTINFO: if (len == CMSG_LEN(sizeof(struct sctp_nxtinfo))) print_sctp_nxtinfo(fp, (struct sctp_nxtinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_PRINFO: if (len == CMSG_LEN(sizeof(struct sctp_prinfo))) print_sctp_prinfo(fp, (struct sctp_prinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_AUTHINFO: if (len == CMSG_LEN(sizeof(struct sctp_authinfo))) print_sctp_authinfo(fp, (struct sctp_authinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_DSTADDRV4: if (len == CMSG_LEN(sizeof(struct in_addr))) print_sctp_ipv4_addr(fp, (struct in_addr *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_DSTADDRV6: if (len == CMSG_LEN(sizeof(struct in6_addr))) print_sctp_ipv6_addr(fp, (struct in6_addr *)data); else print_gen_cmsg(fp, cmsghdr); break; default: print_gen_cmsg(fp, cmsghdr); } } static void print_cmsgs(FILE *fp, pid_t pid, bool receive, struct msghdr *msghdr) { struct cmsghdr *cmsghdr; char *cmsgbuf; const char *temp; socklen_t len; int level, type; bool first; len = msghdr->msg_controllen; if (len == 0) { fputs("{}", fp); return; } cmsgbuf = calloc(1, len); if (get_struct(pid, msghdr->msg_control, cmsgbuf, len) == -1) { fprintf(fp, "%p", msghdr->msg_control); free(cmsgbuf); return; } msghdr->msg_control = cmsgbuf; first = true; fputs("{", fp); for (cmsghdr = CMSG_FIRSTHDR(msghdr); cmsghdr != NULL; cmsghdr = CMSG_NXTHDR(msghdr, cmsghdr)) { level = cmsghdr->cmsg_level; type = cmsghdr->cmsg_type; len = cmsghdr->cmsg_len; fprintf(fp, "%s{level=", first ? 
"" : ","); print_integer_arg(sysdecode_sockopt_level, fp, level); fputs(",type=", fp); temp = sysdecode_cmsg_type(level, type); if (temp) { fputs(temp, fp); } else { fprintf(fp, "%d", type); } fputs(",data=", fp); switch (level) { case IPPROTO_SCTP: print_sctp_cmsg(fp, receive, cmsghdr); break; default: print_gen_cmsg(fp, cmsghdr); break; } fputs("}", fp); first = false; } fputs("}", fp); free(cmsgbuf); } +static void +print_sysctl_oid(FILE *fp, int *oid, int len) +{ + int i; + + for (i = 0; i < len; i++) + fprintf(fp, ".%d", oid[i]); +} + /* * Converts a syscall argument into a string. Said string is * allocated via malloc(), so needs to be free()'d. sc is * a pointer to the syscall description (see above); args is * an array of all of the system call arguments. */ char * print_arg(struct syscall_args *sc, unsigned long *args, register_t *retval, struct trussinfo *trussinfo) { FILE *fp; char *tmp; size_t tmplen; pid_t pid; fp = open_memstream(&tmp, &tmplen); pid = trussinfo->curthread->proc->pid; switch (sc->type & ARG_MASK) { case Hex: fprintf(fp, "0x%x", (int)args[sc->offset]); break; case Octal: fprintf(fp, "0%o", (int)args[sc->offset]); break; case Int: fprintf(fp, "%d", (int)args[sc->offset]); break; case UInt: fprintf(fp, "%u", (unsigned int)args[sc->offset]); break; case PUInt: { unsigned int val; if (get_struct(pid, (void *)args[sc->offset], &val, sizeof(val)) == 0) fprintf(fp, "{ %u }", val); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case LongHex: fprintf(fp, "0x%lx", args[sc->offset]); break; case Long: fprintf(fp, "%ld", args[sc->offset]); break; case Sizet: fprintf(fp, "%zu", (size_t)args[sc->offset]); break; case ShmName: /* Handle special SHM_ANON value. */ if ((char *)args[sc->offset] == SHM_ANON) { fprintf(fp, "SHM_ANON"); break; } /* FALLTHROUGH */ case Name: { /* NULL-terminated string. */ char *tmp2; tmp2 = get_string(pid, (void*)args[sc->offset], 0); fprintf(fp, "\"%s\"", tmp2); free(tmp2); break; } case BinString: { /* * Binary block of data that might have printable characters. * XXX If type|OUT, assume that the length is the syscall's * return value. Otherwise, assume that the length of the block * is in the next syscall argument. */ int max_string = trussinfo->strsize; char tmp2[max_string + 1], *tmp3; int len; int truncated = 0; if (sc->type & OUT) len = retval[0]; else len = args[sc->offset + 1]; /* * Don't print more than max_string characters, to avoid word * wrap. If we have to truncate put some ... after the string. */ if (len > max_string) { len = max_string; truncated = 1; } if (len && get_struct(pid, (void*)args[sc->offset], &tmp2, len) != -1) { tmp3 = malloc(len * 4 + 1); while (len) { if (strvisx(tmp3, tmp2, len, VIS_CSTYLE|VIS_TAB|VIS_NL) <= max_string) break; len--; truncated = 1; } fprintf(fp, "\"%s\"%s", tmp3, truncated ? "..." : ""); free(tmp3); } else { fprintf(fp, "0x%lx", args[sc->offset]); } break; } case ExecArgs: case ExecEnv: case StringArray: { uintptr_t addr; union { char *strarray[0]; char buf[PAGE_SIZE]; } u; char *string; size_t len; u_int first, i; /* * Only parse argv[] and environment arrays from exec calls * if requested. */ if (((sc->type & ARG_MASK) == ExecArgs && (trussinfo->flags & EXECVEARGS) == 0) || ((sc->type & ARG_MASK) == ExecEnv && (trussinfo->flags & EXECVEENVS) == 0)) { fprintf(fp, "0x%lx", args[sc->offset]); break; } /* * Read a page of pointers at a time. Punt if the top-level * pointer is not aligned. Note that the first read is of * a partial page. 
*/ addr = args[sc->offset]; if (addr % sizeof(char *) != 0) { fprintf(fp, "0x%lx", args[sc->offset]); break; } len = PAGE_SIZE - (addr & PAGE_MASK); if (get_struct(pid, (void *)addr, u.buf, len) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } fputc('[', fp); first = 1; i = 0; while (u.strarray[i] != NULL) { string = get_string(pid, u.strarray[i], 0); fprintf(fp, "%s \"%s\"", first ? "" : ",", string); free(string); first = 0; i++; if (i == len / sizeof(char *)) { addr += len; len = PAGE_SIZE; if (get_struct(pid, (void *)addr, u.buf, len) == -1) { fprintf(fp, ", "); break; } i = 0; } } fputs(" ]", fp); break; } #ifdef __LP64__ case Quad: fprintf(fp, "%ld", args[sc->offset]); break; case QuadHex: fprintf(fp, "0x%lx", args[sc->offset]); break; #else case Quad: case QuadHex: { unsigned long long ll; #if _BYTE_ORDER == _LITTLE_ENDIAN ll = (unsigned long long)args[sc->offset + 1] << 32 | args[sc->offset]; #else ll = (unsigned long long)args[sc->offset] << 32 | args[sc->offset + 1]; #endif if ((sc->type & ARG_MASK) == Quad) fprintf(fp, "%lld", ll); else fprintf(fp, "0x%llx", ll); break; } #endif case PQuadHex: { uint64_t val; if (get_struct(pid, (void *)args[sc->offset], &val, sizeof(val)) == 0) fprintf(fp, "{ 0x%jx }", (uintmax_t)val); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Ptr: fprintf(fp, "0x%lx", args[sc->offset]); break; case Readlinkres: { char *tmp2; if (retval[0] == -1) break; tmp2 = get_string(pid, (void*)args[sc->offset], retval[0]); fprintf(fp, "\"%s\"", tmp2); free(tmp2); break; } case Ioctl: { const char *temp; unsigned long cmd; cmd = args[sc->offset]; temp = sysdecode_ioctlname(cmd); if (temp) fputs(temp, fp); else { fprintf(fp, "0x%lx { IO%s%s 0x%lx('%c'), %lu, %lu }", cmd, cmd & IOC_OUT ? "R" : "", cmd & IOC_IN ? "W" : "", IOCGROUP(cmd), isprint(IOCGROUP(cmd)) ? 
(char)IOCGROUP(cmd) : '?', cmd & 0xFF, IOCPARM_LEN(cmd)); } break; } case Timespec: { struct timespec ts; if (get_struct(pid, (void *)args[sc->offset], &ts, sizeof(ts)) != -1) fprintf(fp, "{ %jd.%09ld }", (intmax_t)ts.tv_sec, ts.tv_nsec); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Timespec2: { struct timespec ts[2]; const char *sep; unsigned int i; if (get_struct(pid, (void *)args[sc->offset], &ts, sizeof(ts)) != -1) { fputs("{ ", fp); sep = ""; for (i = 0; i < nitems(ts); i++) { fputs(sep, fp); sep = ", "; switch (ts[i].tv_nsec) { case UTIME_NOW: fprintf(fp, "UTIME_NOW"); break; case UTIME_OMIT: fprintf(fp, "UTIME_OMIT"); break; default: fprintf(fp, "%jd.%09ld", (intmax_t)ts[i].tv_sec, ts[i].tv_nsec); break; } } fputs(" }", fp); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Timeval: { struct timeval tv; if (get_struct(pid, (void *)args[sc->offset], &tv, sizeof(tv)) != -1) fprintf(fp, "{ %jd.%06ld }", (intmax_t)tv.tv_sec, tv.tv_usec); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Timeval2: { struct timeval tv[2]; if (get_struct(pid, (void *)args[sc->offset], &tv, sizeof(tv)) != -1) fprintf(fp, "{ %jd.%06ld, %jd.%06ld }", (intmax_t)tv[0].tv_sec, tv[0].tv_usec, (intmax_t)tv[1].tv_sec, tv[1].tv_usec); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Itimerval: { struct itimerval itv; if (get_struct(pid, (void *)args[sc->offset], &itv, sizeof(itv)) != -1) fprintf(fp, "{ %jd.%06ld, %jd.%06ld }", (intmax_t)itv.it_interval.tv_sec, itv.it_interval.tv_usec, (intmax_t)itv.it_value.tv_sec, itv.it_value.tv_usec); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case LinuxSockArgs: { struct linux_socketcall_args largs; if (get_struct(pid, (void *)args[sc->offset], (void *)&largs, sizeof(largs)) != -1) fprintf(fp, "{ %s, 0x%lx }", lookup(linux_socketcall_ops, largs.what, 10), (long unsigned int)largs.args); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Pollfd: { /* * XXX: A Pollfd argument expects the /next/ syscall argument * to be the number of fds in the array. This matches the poll * syscall. */ struct pollfd *pfd; int numfds = args[sc->offset + 1]; size_t bytes = sizeof(struct pollfd) * numfds; int i; if ((pfd = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for pollfd array", bytes); if (get_struct(pid, (void *)args[sc->offset], pfd, bytes) != -1) { fputs("{", fp); for (i = 0; i < numfds; i++) { fprintf(fp, " %d/%s", pfd[i].fd, xlookup_bits(poll_flags, pfd[i].events)); } fputs(" }", fp); } else { fprintf(fp, "0x%lx", args[sc->offset]); } free(pfd); break; } case Fd_set: { /* * XXX: A Fd_set argument expects the /first/ syscall argument * to be the number of fds in the array. This matches the * select syscall. */ fd_set *fds; int numfds = args[0]; size_t bytes = _howmany(numfds, _NFDBITS) * _NFDBITS; int i; if ((fds = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for fd_set array", bytes); if (get_struct(pid, (void *)args[sc->offset], fds, bytes) != -1) { fputs("{", fp); for (i = 0; i < numfds; i++) { if (FD_ISSET(i, fds)) fprintf(fp, " %d", i); } fputs(" }", fp); } else fprintf(fp, "0x%lx", args[sc->offset]); free(fds); break; } case Signal: fputs(strsig2(args[sc->offset]), fp); break; case Sigset: { long sig; sigset_t ss; int i, first; sig = args[sc->offset]; if (get_struct(pid, (void *)args[sc->offset], (void *)&ss, sizeof(ss)) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } fputs("{ ", fp); first = 1; for (i = 1; i < sys_nsig; i++) { if (sigismember(&ss, i)) { fprintf(fp, "%s%s", !first ? 
"|" : "", strsig2(i)); first = 0; } } if (!first) fputc(' ', fp); fputc('}', fp); break; } case Sigprocmask: print_integer_arg(sysdecode_sigprocmask_how, fp, args[sc->offset]); break; case Fcntlflag: /* XXX: Output depends on the value of the previous argument. */ if (sysdecode_fcntl_arg_p(args[sc->offset - 1])) sysdecode_fcntl_arg(fp, args[sc->offset - 1], args[sc->offset], 16); break; case Open: print_mask_arg(sysdecode_open_flags, fp, args[sc->offset]); break; case Fcntl: print_integer_arg(sysdecode_fcntl_cmd, fp, args[sc->offset]); break; case Mprot: print_mask_arg(sysdecode_mmap_prot, fp, args[sc->offset]); break; case Mmapflags: print_mask_arg(sysdecode_mmap_flags, fp, args[sc->offset]); break; case Whence: print_integer_arg(sysdecode_whence, fp, args[sc->offset]); break; case Sockdomain: print_integer_arg(sysdecode_socketdomain, fp, args[sc->offset]); break; case Socktype: print_mask_arg(sysdecode_socket_type, fp, args[sc->offset]); break; case Shutdown: print_integer_arg(sysdecode_shutdown_how, fp, args[sc->offset]); break; case Resource: print_integer_arg(sysdecode_rlimit, fp, args[sc->offset]); break; case RusageWho: print_integer_arg(sysdecode_getrusage_who, fp, args[sc->offset]); break; case Pathconf: print_integer_arg(sysdecode_pathconf_name, fp, args[sc->offset]); break; case Rforkflags: print_mask_arg(sysdecode_rfork_flags, fp, args[sc->offset]); break; case Sockaddr: { socklen_t len; if (args[sc->offset] == 0) { fputs("NULL", fp); break; } /* * Extract the address length from the next argument. If * this is an output sockaddr (OUT is set), then the * next argument is a pointer to a socklen_t. Otherwise * the next argument contains a socklen_t by value. */ if (sc->type & OUT) { if (get_struct(pid, (void *)args[sc->offset + 1], &len, sizeof(len)) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } } else len = args[sc->offset + 1]; print_sockaddr(fp, trussinfo, (void *)args[sc->offset], len); break; } case Sigaction: { struct sigaction sa; if (get_struct(pid, (void *)args[sc->offset], &sa, sizeof(sa)) != -1) { fputs("{ ", fp); if (sa.sa_handler == SIG_DFL) fputs("SIG_DFL", fp); else if (sa.sa_handler == SIG_IGN) fputs("SIG_IGN", fp); else fprintf(fp, "%p", sa.sa_handler); fprintf(fp, " %s ss_t }", xlookup_bits(sigaction_flags, sa.sa_flags)); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Kevent: { /* * XXX XXX: The size of the array is determined by either the * next syscall argument, or by the syscall return value, * depending on which argument number we are. This matches the * kevent syscall, but luckily that's the only syscall that uses * them. 
*/ struct kevent *ke; int numevents = -1; size_t bytes; int i; if (sc->offset == 1) numevents = args[sc->offset+1]; else if (sc->offset == 3 && retval[0] != -1) numevents = retval[0]; if (numevents >= 0) { bytes = sizeof(struct kevent) * numevents; if ((ke = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for kevent array", bytes); } else ke = NULL; if (numevents >= 0 && get_struct(pid, (void *)args[sc->offset], ke, bytes) != -1) { fputc('{', fp); for (i = 0; i < numevents; i++) { fputc(' ', fp); print_kevent(fp, &ke[i]); } fputs(" }", fp); } else { fprintf(fp, "0x%lx", args[sc->offset]); } free(ke); break; } case Kevent11: { struct kevent_freebsd11 *ke11; struct kevent ke; int numevents = -1; size_t bytes; int i; if (sc->offset == 1) numevents = args[sc->offset+1]; else if (sc->offset == 3 && retval[0] != -1) numevents = retval[0]; if (numevents >= 0) { bytes = sizeof(struct kevent_freebsd11) * numevents; if ((ke11 = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for kevent array", bytes); } else ke11 = NULL; memset(&ke, 0, sizeof(ke)); if (numevents >= 0 && get_struct(pid, (void *)args[sc->offset], ke11, bytes) != -1) { fputc('{', fp); for (i = 0; i < numevents; i++) { fputc(' ', fp); ke.ident = ke11[i].ident; ke.filter = ke11[i].filter; ke.flags = ke11[i].flags; ke.fflags = ke11[i].fflags; ke.data = ke11[i].data; ke.udata = ke11[i].udata; print_kevent(fp, &ke); } fputs(" }", fp); } else { fprintf(fp, "0x%lx", args[sc->offset]); } free(ke11); break; } case Stat: { struct stat st; if (get_struct(pid, (void *)args[sc->offset], &st, sizeof(st)) != -1) { char mode[12]; strmode(st.st_mode, mode); fprintf(fp, "{ mode=%s,inode=%ju,size=%jd,blksize=%ld }", mode, (uintmax_t)st.st_ino, (intmax_t)st.st_size, (long)st.st_blksize); } else { fprintf(fp, "0x%lx", args[sc->offset]); } break; } case Stat11: { struct freebsd11_stat st; if (get_struct(pid, (void *)args[sc->offset], &st, sizeof(st)) != -1) { char mode[12]; strmode(st.st_mode, mode); fprintf(fp, "{ mode=%s,inode=%ju,size=%jd,blksize=%ld }", mode, (uintmax_t)st.st_ino, (intmax_t)st.st_size, (long)st.st_blksize); } else { fprintf(fp, "0x%lx", args[sc->offset]); } break; } case StatFs: { unsigned int i; struct statfs buf; if (get_struct(pid, (void *)args[sc->offset], &buf, sizeof(buf)) != -1) { char fsid[17]; bzero(fsid, sizeof(fsid)); if (buf.f_fsid.val[0] != 0 || buf.f_fsid.val[1] != 0) { for (i = 0; i < sizeof(buf.f_fsid); i++) snprintf(&fsid[i*2], sizeof(fsid) - (i*2), "%02x", ((u_char *)&buf.f_fsid)[i]); } fprintf(fp, "{ fstypename=%s,mntonname=%s,mntfromname=%s," "fsid=%s }", buf.f_fstypename, buf.f_mntonname, buf.f_mntfromname, fsid); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Rusage: { struct rusage ru; if (get_struct(pid, (void *)args[sc->offset], &ru, sizeof(ru)) != -1) { fprintf(fp, "{ u=%jd.%06ld,s=%jd.%06ld,in=%ld,out=%ld }", (intmax_t)ru.ru_utime.tv_sec, ru.ru_utime.tv_usec, (intmax_t)ru.ru_stime.tv_sec, ru.ru_stime.tv_usec, ru.ru_inblock, ru.ru_oublock); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Rlimit: { struct rlimit rl; if (get_struct(pid, (void *)args[sc->offset], &rl, sizeof(rl)) != -1) { fprintf(fp, "{ cur=%ju,max=%ju }", rl.rlim_cur, rl.rlim_max); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case ExitStatus: { int status; if (get_struct(pid, (void *)args[sc->offset], &status, sizeof(status)) != -1) { fputs("{ ", fp); if (WIFCONTINUED(status)) fputs("CONTINUED", fp); else if (WIFEXITED(status)) fprintf(fp, "EXITED,val=%d", WEXITSTATUS(status)); else if 
(WIFSIGNALED(status)) fprintf(fp, "SIGNALED,sig=%s%s", strsig2(WTERMSIG(status)), WCOREDUMP(status) ? ",cored" : ""); else fprintf(fp, "STOPPED,sig=%s", strsig2(WTERMSIG(status))); fputs(" }", fp); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Waitoptions: print_mask_arg(sysdecode_wait6_options, fp, args[sc->offset]); break; case Idtype: print_integer_arg(sysdecode_idtype, fp, args[sc->offset]); break; case Procctl: print_integer_arg(sysdecode_procctl_cmd, fp, args[sc->offset]); break; case Umtxop: print_integer_arg(sysdecode_umtx_op, fp, args[sc->offset]); break; case Atfd: print_integer_arg(sysdecode_atfd, fp, args[sc->offset]); break; case Atflags: print_mask_arg(sysdecode_atflags, fp, args[sc->offset]); break; case Accessmode: print_mask_arg(sysdecode_access_mode, fp, args[sc->offset]); break; case Sysarch: print_integer_arg(sysdecode_sysarch_number, fp, args[sc->offset]); break; + case Sysctl: { + char name[BUFSIZ]; + int oid[CTL_MAXNAME + 2], qoid[CTL_MAXNAME + 2]; + size_t i; + int len; + + memset(name, 0, sizeof(name)); + len = args[sc->offset + 1]; + if (get_struct(pid, (void *)args[sc->offset], oid, + len * sizeof(oid[0])) != -1) { + fprintf(fp, "\""); + if (oid[0] == CTL_SYSCTL) { + fprintf(fp, "sysctl."); + switch (oid[1]) { + case CTL_SYSCTL_DEBUG: + fprintf(fp, "debug"); + break; + case CTL_SYSCTL_NAME: + fprintf(fp, "name"); + print_sysctl_oid(fp, oid + 2, len - 2); + break; + case CTL_SYSCTL_NEXT: + fprintf(fp, "next"); + break; + case CTL_SYSCTL_NAME2OID: + fprintf(fp, "name2oid"); + break; + case CTL_SYSCTL_OIDFMT: + fprintf(fp, "oidfmt"); + print_sysctl_oid(fp, oid + 2, len - 2); + break; + case CTL_SYSCTL_OIDDESCR: + fprintf(fp, "oiddescr"); + print_sysctl_oid(fp, oid + 2, len - 2); + break; + case CTL_SYSCTL_OIDLABEL: + fprintf(fp, "oidlabel"); + print_sysctl_oid(fp, oid + 2, len - 2); + break; + default: + print_sysctl_oid(fp, oid + 1, len - 1); + } + } else { + qoid[0] = CTL_SYSCTL; + qoid[1] = CTL_SYSCTL_NAME; + memcpy(qoid + 2, oid, len * sizeof(int)); + i = sizeof(name); + if (sysctl(qoid, len + 2, name, &i, 0, 0) == -1) + print_sysctl_oid(fp, qoid + 2, len); + else + fprintf(fp, "%s", name); + } + fprintf(fp, "\""); + } + break; + } case PipeFds: /* * The pipe() system call in the kernel returns its * two file descriptors via return values. However, * the interface exposed by libc is that pipe() * accepts a pointer to an array of descriptors. * Format the output to match the libc API by printing * the returned file descriptors as a fake argument. * * Overwrite the first retval to signal a successful * return as well. */ fprintf(fp, "{ %d, %d }", (int)retval[0], (int)retval[1]); retval[0] = 0; break; case Utrace: { size_t len; void *utrace_addr; len = args[sc->offset + 1]; utrace_addr = calloc(1, len); if (get_struct(pid, (void *)args[sc->offset], (void *)utrace_addr, len) != -1) print_utrace(fp, utrace_addr, len); else fprintf(fp, "0x%lx", args[sc->offset]); free(utrace_addr); break; } case IntArray: { int descriptors[16]; unsigned long i, ndescriptors; bool truncated; ndescriptors = args[sc->offset + 1]; truncated = false; if (ndescriptors > nitems(descriptors)) { ndescriptors = nitems(descriptors); truncated = true; } if (get_struct(pid, (void *)args[sc->offset], descriptors, ndescriptors * sizeof(descriptors[0])) != -1) { fprintf(fp, "{"); for (i = 0; i < ndescriptors; i++) fprintf(fp, i == 0 ? " %d" : ", %d", descriptors[i]); fprintf(fp, truncated ? ", ... 
}" : " }"); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Pipe2: print_mask_arg(sysdecode_pipe2_flags, fp, args[sc->offset]); break; case CapFcntlRights: { uint32_t rights; if (sc->type & OUT) { if (get_struct(pid, (void *)args[sc->offset], &rights, sizeof(rights)) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } } else rights = args[sc->offset]; print_mask_arg32(sysdecode_cap_fcntlrights, fp, rights); break; } case Fadvice: print_integer_arg(sysdecode_fadvice, fp, args[sc->offset]); break; case FileFlags: { fflags_t rem; if (!sysdecode_fileflags(fp, args[sc->offset], &rem)) fprintf(fp, "0x%x", rem); else if (rem != 0) fprintf(fp, "|0x%x", rem); break; } case Flockop: print_mask_arg(sysdecode_flock_operation, fp, args[sc->offset]); break; case Getfsstatmode: print_integer_arg(sysdecode_getfsstat_mode, fp, args[sc->offset]); break; case Kldsymcmd: print_integer_arg(sysdecode_kldsym_cmd, fp, args[sc->offset]); break; case Kldunloadflags: print_integer_arg(sysdecode_kldunload_flags, fp, args[sc->offset]); break; case Madvice: print_integer_arg(sysdecode_madvice, fp, args[sc->offset]); break; case Socklent: fprintf(fp, "%u", (socklen_t)args[sc->offset]); break; case Sockprotocol: { const char *temp; int domain, protocol; domain = args[sc->offset - 2]; protocol = args[sc->offset]; if (protocol == 0) { fputs("0", fp); } else { temp = sysdecode_socket_protocol(domain, protocol); if (temp) { fputs(temp, fp); } else { fprintf(fp, "%d", protocol); } } break; } case Sockoptlevel: print_integer_arg(sysdecode_sockopt_level, fp, args[sc->offset]); break; case Sockoptname: { const char *temp; int level, name; level = args[sc->offset - 1]; name = args[sc->offset]; temp = sysdecode_sockopt_name(level, name); if (temp) { fputs(temp, fp); } else { fprintf(fp, "%d", name); } break; } case Msgflags: print_mask_arg(sysdecode_msg_flags, fp, args[sc->offset]); break; case CapRights: { cap_rights_t rights; if (get_struct(pid, (void *)args[sc->offset], &rights, sizeof(rights)) != -1) { fputs("{ ", fp); sysdecode_cap_rights(fp, &rights); fputs(" }", fp); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Acltype: print_integer_arg(sysdecode_acltype, fp, args[sc->offset]); break; case Extattrnamespace: print_integer_arg(sysdecode_extattrnamespace, fp, args[sc->offset]); break; case Minherit: print_integer_arg(sysdecode_minherit_inherit, fp, args[sc->offset]); break; case Mlockall: print_mask_arg(sysdecode_mlockall_flags, fp, args[sc->offset]); break; case Mountflags: print_mask_arg(sysdecode_mount_flags, fp, args[sc->offset]); break; case Msync: print_mask_arg(sysdecode_msync_flags, fp, args[sc->offset]); break; case Priowhich: print_integer_arg(sysdecode_prio_which, fp, args[sc->offset]); break; case Ptraceop: print_integer_arg(sysdecode_ptrace_request, fp, args[sc->offset]); break; case Quotactlcmd: if (!sysdecode_quotactl_cmd(fp, args[sc->offset])) fprintf(fp, "%#x", (int)args[sc->offset]); break; case Reboothowto: print_mask_arg(sysdecode_reboot_howto, fp, args[sc->offset]); break; case Rtpriofunc: print_integer_arg(sysdecode_rtprio_function, fp, args[sc->offset]); break; case Schedpolicy: print_integer_arg(sysdecode_scheduler_policy, fp, args[sc->offset]); break; case Schedparam: { struct sched_param sp; if (get_struct(pid, (void *)args[sc->offset], &sp, sizeof(sp)) != -1) fprintf(fp, "{ %d }", sp.sched_priority); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case PSig: { int sig; if (get_struct(pid, (void *)args[sc->offset], &sig, sizeof(sig)) == 0) fprintf(fp, "{ %s 
}", strsig2(sig)); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Siginfo: { siginfo_t si; if (get_struct(pid, (void *)args[sc->offset], &si, sizeof(si)) != -1) { fprintf(fp, "{ signo=%s", strsig2(si.si_signo)); decode_siginfo(fp, &si); fprintf(fp, " }"); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Iovec: /* * Print argument as an array of struct iovec, where the next * syscall argument is the number of elements of the array. */ print_iovec(fp, trussinfo, (void *)args[sc->offset], (int)args[sc->offset + 1]); break; case Sctpsndrcvinfo: { struct sctp_sndrcvinfo info; if (get_struct(pid, (void *)args[sc->offset], &info, sizeof(struct sctp_sndrcvinfo)) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } print_sctp_sndrcvinfo(fp, sc->type & OUT, &info); break; } case Msghdr: { struct msghdr msghdr; if (get_struct(pid, (void *)args[sc->offset], &msghdr, sizeof(struct msghdr)) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } fputs("{", fp); print_sockaddr(fp, trussinfo, msghdr.msg_name, msghdr.msg_namelen); fprintf(fp, ",%d,", msghdr.msg_namelen); print_iovec(fp, trussinfo, msghdr.msg_iov, msghdr.msg_iovlen); fprintf(fp, ",%d,", msghdr.msg_iovlen); print_cmsgs(fp, pid, sc->type & OUT, &msghdr); fprintf(fp, ",%u,", msghdr.msg_controllen); print_mask_arg(sysdecode_msg_flags, fp, msghdr.msg_flags); fputs("}", fp); break; } case CloudABIAdvice: fputs(xlookup(cloudabi_advice, args[sc->offset]), fp); break; case CloudABIClockID: fputs(xlookup(cloudabi_clockid, args[sc->offset]), fp); break; case CloudABIFDSFlags: fputs(xlookup_bits(cloudabi_fdsflags, args[sc->offset]), fp); break; case CloudABIFDStat: { cloudabi_fdstat_t fds; if (get_struct(pid, (void *)args[sc->offset], &fds, sizeof(fds)) != -1) { fprintf(fp, "{ %s, ", xlookup(cloudabi_filetype, fds.fs_filetype)); fprintf(fp, "%s, ... 
}", xlookup_bits(cloudabi_fdflags, fds.fs_flags)); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case CloudABIFileStat: { cloudabi_filestat_t fsb; if (get_struct(pid, (void *)args[sc->offset], &fsb, sizeof(fsb)) != -1) fprintf(fp, "{ %s, %ju }", xlookup(cloudabi_filetype, fsb.st_filetype), (uintmax_t)fsb.st_size); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case CloudABIFileType: fputs(xlookup(cloudabi_filetype, args[sc->offset]), fp); break; case CloudABIFSFlags: fputs(xlookup_bits(cloudabi_fsflags, args[sc->offset]), fp); break; case CloudABILookup: if ((args[sc->offset] & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) != 0) fprintf(fp, "%d|LOOKUP_SYMLINK_FOLLOW", (int)args[sc->offset]); else fprintf(fp, "%d", (int)args[sc->offset]); break; case CloudABIMFlags: fputs(xlookup_bits(cloudabi_mflags, args[sc->offset]), fp); break; case CloudABIMProt: fputs(xlookup_bits(cloudabi_mprot, args[sc->offset]), fp); break; case CloudABIMSFlags: fputs(xlookup_bits(cloudabi_msflags, args[sc->offset]), fp); break; case CloudABIOFlags: fputs(xlookup_bits(cloudabi_oflags, args[sc->offset]), fp); break; case CloudABISDFlags: fputs(xlookup_bits(cloudabi_sdflags, args[sc->offset]), fp); break; case CloudABISignal: fputs(xlookup(cloudabi_signal, args[sc->offset]), fp); break; case CloudABITimestamp: fprintf(fp, "%lu.%09lus", args[sc->offset] / 1000000000, args[sc->offset] % 1000000000); break; case CloudABIULFlags: fputs(xlookup_bits(cloudabi_ulflags, args[sc->offset]), fp); break; case CloudABIWhence: fputs(xlookup(cloudabi_whence, args[sc->offset]), fp); break; default: errx(1, "Invalid argument type %d\n", sc->type & ARG_MASK); } fclose(fp); return (tmp); } /* * Print (to outfile) the system call and its arguments. */ void print_syscall(struct trussinfo *trussinfo) { struct threadinfo *t; const char *name; char **s_args; int i, len, nargs; t = trussinfo->curthread; name = t->cs.sc->name; nargs = t->cs.nargs; s_args = t->cs.s_args; len = print_line_prefix(trussinfo); len += fprintf(trussinfo->outfile, "%s(", name); for (i = 0; i < nargs; i++) { if (s_args[i] != NULL) len += fprintf(trussinfo->outfile, "%s", s_args[i]); else len += fprintf(trussinfo->outfile, ""); len += fprintf(trussinfo->outfile, "%s", i < (nargs - 1) ? "," : ""); } len += fprintf(trussinfo->outfile, ")"); for (i = 0; i < 6 - (len / 8); i++) fprintf(trussinfo->outfile, "\t"); } void print_syscall_ret(struct trussinfo *trussinfo, int error, register_t *retval) { struct timespec timediff; struct threadinfo *t; struct syscall *sc; t = trussinfo->curthread; sc = t->cs.sc; if (trussinfo->flags & COUNTONLY) { timespecsub(&t->after, &t->before, &timediff); timespecadd(&sc->time, &timediff, &sc->time); sc->ncalls++; if (error != 0) sc->nerror++; return; } print_syscall(trussinfo); fflush(trussinfo->outfile); if (retval == NULL) { /* * This system call resulted in the current thread's exit, * so there is no return value or error to display. 
*/ fprintf(trussinfo->outfile, "\n"); return; } if (error == ERESTART) fprintf(trussinfo->outfile, " ERESTART\n"); else if (error == EJUSTRETURN) fprintf(trussinfo->outfile, " EJUSTRETURN\n"); else if (error != 0) { fprintf(trussinfo->outfile, " ERR#%d '%s'\n", sysdecode_freebsd_to_abi_errno(t->proc->abi->abi, error), strerror(error)); } #ifndef __LP64__ else if (sc->ret_type == 2) { off_t off; #if _BYTE_ORDER == _LITTLE_ENDIAN off = (off_t)retval[1] << 32 | retval[0]; #else off = (off_t)retval[0] << 32 | retval[1]; #endif fprintf(trussinfo->outfile, " = %jd (0x%jx)\n", (intmax_t)off, (intmax_t)off); } #endif else fprintf(trussinfo->outfile, " = %jd (0x%jx)\n", (intmax_t)retval[0], (intmax_t)retval[0]); } void print_summary(struct trussinfo *trussinfo) { struct timespec total = {0, 0}; struct syscall *sc; int ncall, nerror; fprintf(trussinfo->outfile, "%-20s%15s%8s%8s\n", "syscall", "seconds", "calls", "errors"); ncall = nerror = 0; STAILQ_FOREACH(sc, &syscalls, entries) if (sc->ncalls) { fprintf(trussinfo->outfile, "%-20s%5jd.%09ld%8d%8d\n", sc->name, (intmax_t)sc->time.tv_sec, sc->time.tv_nsec, sc->ncalls, sc->nerror); timespecadd(&total, &sc->time, &total); ncall += sc->ncalls; nerror += sc->nerror; } fprintf(trussinfo->outfile, "%20s%15s%8s%8s\n", "", "-------------", "-------", "-------"); fprintf(trussinfo->outfile, "%-20s%5jd.%09ld%8d%8d\n", "", (intmax_t)total.tv_sec, total.tv_nsec, ncall, nerror); } Index: projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.8 =================================================================== --- projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.8 (revision 352536) +++ projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.8 (revision 352537) @@ -1,200 +1,200 @@ .\"- .\" Copyright 2006, 2007 Colin Percival .\" All rights reserved .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted providing that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR .\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED .\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY .\" DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, .\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING .\" IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd June 14, 2017 .Dt FREEBSD-UPDATE 8 .Os .Sh NAME .Nm freebsd-update .Nd fetch and install binary updates to FreeBSD .Sh SYNOPSIS .Nm .Op Fl b Ar basedir .Op Fl d Ar workdir .Op Fl f Ar conffile .Op Fl F .Op Fl k Ar KEY .Op Fl r Ar newrelease .Op Fl s Ar server .Op Fl t Ar address .Op Fl -not-running-from-cron .Cm command ... 
.Sh DESCRIPTION The .Nm tool is used to fetch, install, and rollback binary updates to the .Fx base system. Note that updates are only available if they are being built for the .Fx release and architecture being used; in particular, the .Fx Security Team only builds updates for releases shipped in binary form by the .Fx Release Engineering Team, e.g., .Fx 11.2-RELEASE and .Fx 12.0-RELEASE, but not .Fx 11.2-STABLE or .Fx 13.0-CURRENT. .Sh OPTIONS The following options are supported: .Bl -tag -width "-r newrelease" .It Fl b Ar basedir Operate on a system mounted at .Ar basedir . (default: .Pa / , or as given in the configuration file.) .It Fl d Ar workdir Store working files in .Ar workdir . (default: .Pa /var/db/freebsd-update/ , or as given in the configuration file.) .It Fl f Ar conffile Read configuration options from .Ar conffile . (default: .Pa /etc/freebsd-update.conf ) .It Fl F Force .Nm Cm fetch to proceed in the case of an unfinished upgrade. .It Fl k Ar KEY Trust an RSA key with SHA256 of .Ar KEY . (default: read value from configuration file.) .It Fl r Ar newrelease -Specify the new release (e.g. 11.2-RELEASE) to which +Specify the new release (e.g., 11.2-RELEASE) to which .Nm should upgrade (upgrade command only). .It Fl s Ar server Fetch files from the specified server or server pool. (default: read value from configuration file.) .It Fl t Ar address Mail output of .Cm cron command, if any, to .Ar address . (default: root, or as given in the configuration file.) .It Fl -not-running-from-cron Force .Nm Cm fetch to proceed when there is no controlling tty. This is for use by automated scripts and orchestration tools. Please do not run .Nm Cm fetch from crontab or similar using this flag, see: .Nm Cm cron .It Fl -currently-running Ar release Do not detect the currently-running release; instead, assume that the system is running the specified .Ar release . This is most likely to be useful when upgrading jails. .El .Sh COMMANDS The .Cm command can be any one of the following: .Bl -tag -width "rollback" .It Cm fetch Based on the currently installed world and the configuration options set, fetch all available binary updates. .It Cm cron Sleep a random amount of time between 1 and 3600 seconds, then download updates as if the .Cm fetch command was used. If updates are downloaded, an email will be sent (to root or a different address if specified via the .Fl t option or in the configuration file). As the name suggests, this command is designed for running from .Xr cron 8 ; the random delay serves to minimize the probability that a large number of machines will simultaneously attempt to fetch updates. .It Cm upgrade Fetch files necessary for upgrading to a new release. Before using this command, make sure that you read the announcement and release notes for the new release in case there are any special steps needed for upgrading. Note that this command may require up to 500 MB of space in .Ar workdir depending on which components of the .Fx base system are installed. .It Cm install Install the most recently fetched updates or upgrade. .It Cm rollback Uninstall the most recently installed updates. .It Cm IDS Compare the system against a "known good" index of the installed release. .El .Sh TIPS .Bl -bullet .It If your clock is set to local time, adding the line .Pp .Dl 0 3 * * * root /usr/sbin/freebsd-update cron .Pp to /etc/crontab will check for updates every night. 
If your clock is set to UTC, please pick a random time other than 3AM, to avoid overly imposing an uneven load on the server(s) hosting the updates. .It In spite of its name, .Nm IDS should not be relied upon as an "Intrusion Detection System", since if the system has been tampered with it cannot be trusted to operate correctly. If you intend to use this command for intrusion-detection purposes, make sure you boot from a secure disk (e.g., a CD). .El .Sh FILES .Bl -tag -width "/etc/freebsd-update.conf" .It Pa /etc/freebsd-update.conf Default location of the .Nm configuration file. .It Pa /var/db/freebsd-update/ Default location where .Nm stores temporary files and downloaded updates. .El .Sh SEE ALSO .Xr freebsd-update.conf 5 .Sh AUTHORS .An Colin Percival Aq Mt cperciva@FreeBSD.org Index: projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.sh =================================================================== --- projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.sh (revision 352536) +++ projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.sh (revision 352537) @@ -1,3361 +1,3361 @@ #!/bin/sh #- # SPDX-License-Identifier: BSD-2-Clause-FreeBSD # # Copyright 2004-2007 Colin Percival # All rights reserved # # Redistribution and use in source and binary forms, with or without # modification, are permitted providing that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # $FreeBSD$ #### Usage function -- called from command-line handling code. # Usage instructions. Options not listed: # --debug -- don't filter output from utilities # --no-stats -- don't show progress statistics while fetching files usage () { cat < ${LINE}" exit 1 fi done < ${CONFFILE} # Merge the settings read from the configuration file with those # provided at the command line. 
mergeconfig } # Provide some default parameters default_params () { # Save any parameters already configured, and clear the slate saveconfig nullconfig # Default configurations config_WorkDir /var/db/freebsd-update config_MailTo root config_AllowAdd yes config_AllowDelete yes config_KeepModifiedMetadata yes config_BaseDir / config_VerboseLevel stats config_StrictComponents no config_BackupKernel yes config_BackupKernelDir /boot/kernel.old config_BackupKernelSymbolFiles no # Merge these defaults into the earlier-configured settings mergeconfig } # Set utility output filtering options, based on ${VERBOSELEVEL} fetch_setup_verboselevel () { case ${VERBOSELEVEL} in debug) QUIETREDIR="/dev/stderr" QUIETFLAG=" " STATSREDIR="/dev/stderr" DDSTATS=".." XARGST="-t" NDEBUG=" " ;; nostats) QUIETREDIR="" QUIETFLAG="" STATSREDIR="/dev/null" DDSTATS=".." XARGST="" NDEBUG="" ;; stats) QUIETREDIR="/dev/null" QUIETFLAG="-q" STATSREDIR="/dev/stdout" DDSTATS="" XARGST="" NDEBUG="-n" ;; esac } # Perform sanity checks and set some final parameters # in preparation for fetching files. Figure out which # set of updates should be downloaded: If the user is # running *-p[0-9]+, strip off the last part; if the # user is running -SECURITY, call it -RELEASE. Chdir # into the working directory. fetchupgrade_check_params () { export HTTP_USER_AGENT="freebsd-update (${COMMAND}, `uname -r`)" _SERVERNAME_z=\ "SERVERNAME must be given via command line or configuration file." _KEYPRINT_z="Key must be given via -k option or configuration file." _KEYPRINT_bad="Invalid key fingerprint: " _WORKDIR_bad="Directory does not exist or is not writable: " _WORKDIR_bad2="Directory is not on a persistent filesystem: " if [ -z "${SERVERNAME}" ]; then echo -n "`basename $0`: " echo "${_SERVERNAME_z}" exit 1 fi if [ -z "${KEYPRINT}" ]; then echo -n "`basename $0`: " echo "${_KEYPRINT_z}" exit 1 fi if ! echo "${KEYPRINT}" | grep -qE "^[0-9a-f]{64}$"; then echo -n "`basename $0`: " echo -n "${_KEYPRINT_bad}" echo ${KEYPRINT} exit 1 fi if ! [ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi case `df -T ${WORKDIR}` in */dev/md[0-9]* | *tmpfs*) echo -n "`basename $0`: " echo -n "${_WORKDIR_bad2}" echo ${WORKDIR} exit 1 ;; esac chmod 700 ${WORKDIR} cd ${WORKDIR} || exit 1 # Generate release number. The s/SECURITY/RELEASE/ bit exists # to provide an upgrade path for FreeBSD Update 1.x users, since # the kernels provided by FreeBSD Update 1.x are always labelled # as X.Y-SECURITY. RELNUM=`uname -r | sed -E 's,-p[0-9]+,,' | sed -E 's,-SECURITY,-RELEASE,'` ARCH=`uname -m` FETCHDIR=${RELNUM}/${ARCH} PATCHDIR=${RELNUM}/${ARCH}/bp # Disallow upgrade from a version that is not a release case ${RELNUM} in *-RELEASE | *-ALPHA* | *-BETA* | *-RC*) ;; *) echo -n "`basename $0`: " cat <<- EOF Cannot upgrade from a version that is not a release (including alpha, beta and release candidates) using `basename $0`. Instead, FreeBSD can be directly upgraded by source or upgraded to a RELEASE/RELENG version prior to running `basename $0`. Currently running: ${RELNUM} EOF exit 1 ;; esac # Figure out what directory contains the running kernel BOOTFILE=`sysctl -n kern.bootfile` KERNELDIR=${BOOTFILE%/kernel} if ! [ -d ${KERNELDIR} ]; then echo "Cannot identify running kernel" exit 1 fi # Figure out what kernel configuration is running. We start with # the output of `uname -i`, and then make the following adjustments: # 1. Replace "SMP-GENERIC" with "SMP". 
Why the SMP kernel config # file says "ident SMP-GENERIC", I don't know... # 2. If the kernel claims to be GENERIC _and_ ${ARCH} is "amd64" # _and_ `sysctl kern.version` contains a line which ends "/SMP", then # we're running an SMP kernel. This mis-identification is a bug # which was fixed in 6.2-STABLE. KERNCONF=`uname -i` if [ ${KERNCONF} = "SMP-GENERIC" ]; then KERNCONF=SMP fi if [ ${KERNCONF} = "GENERIC" ] && [ ${ARCH} = "amd64" ]; then if sysctl kern.version | grep -qE '/SMP$'; then KERNCONF=SMP fi fi # Define some paths BSPATCH=/usr/bin/bspatch SHA256=/sbin/sha256 PHTTPGET=/usr/libexec/phttpget # Set up variables relating to VERBOSELEVEL fetch_setup_verboselevel # Construct a unique name from ${BASEDIR} BDHASH=`echo ${BASEDIR} | sha256 -q` } # Perform sanity checks etc. before fetching updates. fetch_check_params () { fetchupgrade_check_params if ! [ -z "${TARGETRELEASE}" ]; then echo -n "`basename $0`: " echo -n "-r option is meaningless with 'fetch' command. " echo "(Did you mean 'upgrade' instead?)" exit 1 fi # Check that we have updates ready to install if [ -f ${BDHASH}-install/kerneldone -a $FORCEFETCH -eq 0 ]; then echo "You have a partially completed upgrade pending" echo "Run '$0 install' first." echo "Run '$0 fetch -F' to proceed anyway." exit 1 fi } # Perform sanity checks etc. before fetching upgrades. upgrade_check_params () { fetchupgrade_check_params # Unless set otherwise, we're upgrading to the same kernel config. NKERNCONF=${KERNCONF} # We need TARGETRELEASE set _TARGETRELEASE_z="Release target must be specified via -r option." if [ -z "${TARGETRELEASE}" ]; then echo -n "`basename $0`: " echo "${_TARGETRELEASE_z}" exit 1 fi # The target release should be != the current release. if [ "${TARGETRELEASE}" = "${RELNUM}" ]; then echo -n "`basename $0`: " echo "Cannot upgrade from ${RELNUM} to itself" exit 1 fi # Turning off AllowAdd or AllowDelete is a bad idea for upgrades. if [ "${ALLOWADD}" = "no" ]; then echo -n "`basename $0`: " echo -n "WARNING: \"AllowAdd no\" is a bad idea " echo "when upgrading between releases." echo fi if [ "${ALLOWDELETE}" = "no" ]; then echo -n "`basename $0`: " echo -n "WARNING: \"AllowDelete no\" is a bad idea " echo "when upgrading between releases." echo fi # Set EDITOR to /usr/bin/vi if it isn't already set : ${EDITOR:='/usr/bin/vi'} } # Perform sanity checks and set some final parameters in # preparation for installing updates. install_check_params () { # Check that we are root. All sorts of things won't work otherwise. if [ `id -u` != 0 ]; then echo "You must be root to run this." exit 1 fi # Check that securelevel <= 0. Otherwise we can't update schg files. if [ `sysctl -n kern.securelevel` -gt 0 ]; then echo "Updates cannot be installed when the system securelevel" echo "is greater than zero." exit 1 fi # Check that we have a working directory _WORKDIR_bad="Directory does not exist or is not writable: " if ! [ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi cd ${WORKDIR} || exit 1 # Construct a unique name from ${BASEDIR} BDHASH=`echo ${BASEDIR} | sha256 -q` # Check that we have updates ready to install if ! [ -L ${BDHASH}-install ]; then echo "No updates are available to install." if [ $ISFETCHED -eq 0 ]; then echo "Run '$0 fetch' first." exit 1 fi exit 0 fi if ! [ -f ${BDHASH}-install/INDEX-OLD ] || ! [ -f ${BDHASH}-install/INDEX-NEW ]; then echo "Update manifest is corrupt -- this should never happen." echo "Re-run '$0 fetch'." 
exit 1 fi # Figure out what directory contains the running kernel BOOTFILE=`sysctl -n kern.bootfile` KERNELDIR=${BOOTFILE%/kernel} if ! [ -d ${KERNELDIR} ]; then echo "Cannot identify running kernel" exit 1 fi } # Perform sanity checks and set some final parameters in # preparation for UNinstalling updates. rollback_check_params () { # Check that we are root. All sorts of things won't work otherwise. if [ `id -u` != 0 ]; then echo "You must be root to run this." exit 1 fi # Check that we have a working directory _WORKDIR_bad="Directory does not exist or is not writable: " if ! [ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi cd ${WORKDIR} || exit 1 # Construct a unique name from ${BASEDIR} BDHASH=`echo ${BASEDIR} | sha256 -q` # Check that we have updates ready to rollback if ! [ -L ${BDHASH}-rollback ]; then echo "No rollback directory found." exit 1 fi if ! [ -f ${BDHASH}-rollback/INDEX-OLD ] || ! [ -f ${BDHASH}-rollback/INDEX-NEW ]; then echo "Update manifest is corrupt -- this should never happen." exit 1 fi } # Perform sanity checks and set some final parameters # in preparation for comparing the system against the # published index. Figure out which index we should # compare against: If the user is running *-p[0-9]+, # strip off the last part; if the user is running # -SECURITY, call it -RELEASE. Chdir into the working # directory. IDS_check_params () { export HTTP_USER_AGENT="freebsd-update (${COMMAND}, `uname -r`)" _SERVERNAME_z=\ "SERVERNAME must be given via command line or configuration file." _KEYPRINT_z="Key must be given via -k option or configuration file." _KEYPRINT_bad="Invalid key fingerprint: " _WORKDIR_bad="Directory does not exist or is not writable: " if [ -z "${SERVERNAME}" ]; then echo -n "`basename $0`: " echo "${_SERVERNAME_z}" exit 1 fi if [ -z "${KEYPRINT}" ]; then echo -n "`basename $0`: " echo "${_KEYPRINT_z}" exit 1 fi if ! echo "${KEYPRINT}" | grep -qE "^[0-9a-f]{64}$"; then echo -n "`basename $0`: " echo -n "${_KEYPRINT_bad}" echo ${KEYPRINT} exit 1 fi if ! [ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi cd ${WORKDIR} || exit 1 # Generate release number. The s/SECURITY/RELEASE/ bit exists # to provide an upgrade path for FreeBSD Update 1.x users, since # the kernels provided by FreeBSD Update 1.x are always labelled # as X.Y-SECURITY. RELNUM=`uname -r | sed -E 's,-p[0-9]+,,' | sed -E 's,-SECURITY,-RELEASE,'` ARCH=`uname -m` FETCHDIR=${RELNUM}/${ARCH} PATCHDIR=${RELNUM}/${ARCH}/bp # Figure out what directory contains the running kernel BOOTFILE=`sysctl -n kern.bootfile` KERNELDIR=${BOOTFILE%/kernel} if ! [ -d ${KERNELDIR} ]; then echo "Cannot identify running kernel" exit 1 fi # Figure out what kernel configuration is running. We start with # the output of `uname -i`, and then make the following adjustments: # 1. Replace "SMP-GENERIC" with "SMP". Why the SMP kernel config # file says "ident SMP-GENERIC", I don't know... # 2. If the kernel claims to be GENERIC _and_ ${ARCH} is "amd64" # _and_ `sysctl kern.version` contains a line which ends "/SMP", then # we're running an SMP kernel. This mis-identification is a bug # which was fixed in 6.2-STABLE. 
KERNCONF=`uname -i` if [ ${KERNCONF} = "SMP-GENERIC" ]; then KERNCONF=SMP fi if [ ${KERNCONF} = "GENERIC" ] && [ ${ARCH} = "amd64" ]; then if sysctl kern.version | grep -qE '/SMP$'; then KERNCONF=SMP fi fi # Define some paths SHA256=/sbin/sha256 PHTTPGET=/usr/libexec/phttpget # Set up variables relating to VERBOSELEVEL fetch_setup_verboselevel } #### Core functionality -- the actual work gets done here # Use an SRV query to pick a server. If the SRV query doesn't provide # a useful answer, use the server name specified by the user. # Put another way... look up _http._tcp.${SERVERNAME} and pick a server # from that; or if no servers are returned, use ${SERVERNAME}. # This allows a user to specify "portsnap.freebsd.org" (in which case # portsnap will select one of the mirrors) or "portsnap5.tld.freebsd.org" # (in which case portsnap will use that particular server, since there # won't be an SRV entry for that name). # # We ignore the Port field, since we are always going to use port 80. # Fetch the mirror list, but do not pick a mirror yet. Returns 1 if # no mirrors are available for any reason. fetch_pick_server_init () { : > serverlist_tried # Check that host(1) exists (i.e., that the system wasn't built with the # WITHOUT_BIND set) and don't try to find a mirror if it doesn't exist. if ! which -s host; then : > serverlist_full return 1 fi echo -n "Looking up ${SERVERNAME} mirrors... " # Issue the SRV query and pull out the Priority, Weight, and Target fields. # BIND 9 prints "$name has SRV record ..." while BIND 8 prints # "$name server selection ..."; we allow either format. MLIST="_http._tcp.${SERVERNAME}" host -t srv "${MLIST}" | sed -nE "s/${MLIST} (has SRV record|server selection) //Ip" | cut -f 1,2,4 -d ' ' | sed -e 's/\.$//' | sort > serverlist_full # If no records, give up -- we'll just use the server name we were given. if [ `wc -l < serverlist_full` -eq 0 ]; then echo "none found." return 1 fi # Report how many mirrors we found. echo `wc -l < serverlist_full` "mirrors found." # Generate a random seed for use in picking mirrors. If HTTP_PROXY # is set, this will be used to generate the seed; otherwise, the seed # will be random. if [ -n "${HTTP_PROXY}${http_proxy}" ]; then RANDVALUE=`sha256 -qs "${HTTP_PROXY}${http_proxy}" | tr -d 'a-f' | cut -c 1-9` else RANDVALUE=`jot -r 1 0 999999999` fi } # Pick a mirror. Returns 1 if we have run out of mirrors to try. fetch_pick_server () { # Generate a list of not-yet-tried mirrors sort serverlist_tried | comm -23 serverlist_full - > serverlist # Have we run out of mirrors? if [ `wc -l < serverlist` -eq 0 ]; then cat <<- EOF No mirrors remaining, giving up. This may be because upgrading from this platform (${ARCH}) or release (${RELNUM}) is unsupported by `basename $0`. Only platforms with Tier 1 support can be upgraded by `basename $0`. See https://www.freebsd.org/platforms/index.html for more info. If unsupported, FreeBSD must be upgraded by source. EOF return 1 fi # Find the highest priority level (lowest numeric value). SRV_PRIORITY=`cut -f 1 -d ' ' serverlist | sort -n | head -1` # Add up the weights of the response lines at that priority level. SRV_WSUM=0; while read X; do case "$X" in ${SRV_PRIORITY}\ *) SRV_W=`echo $X | cut -f 2 -d ' '` SRV_WSUM=$(($SRV_WSUM + $SRV_W)) ;; esac done < serverlist # If all the weights are 0, pretend that they are all 1 instead. 
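	# (Worked example with hypothetical SRV data: three mirrors at the
	# chosen priority carrying weights 40, 40 and 20 give SRV_WSUM=100,
	# so SRV_RND below falls in 0..99; the selection loop then picks
	# the first mirror for 0..39, the second for 40..79 and the third
	# for 80..99, i.e. mirrors are chosen in proportion to their weights.)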
if [ ${SRV_WSUM} -eq 0 ]; then SRV_WSUM=`grep -E "^${SRV_PRIORITY} " serverlist | wc -l` SRV_W_ADD=1 else SRV_W_ADD=0 fi # Pick a value between 0 and the sum of the weights - 1 SRV_RND=`expr ${RANDVALUE} % ${SRV_WSUM}` # Read through the list of mirrors and set SERVERNAME. Write the line # corresponding to the mirror we selected into serverlist_tried so that # we won't try it again. while read X; do case "$X" in ${SRV_PRIORITY}\ *) SRV_W=`echo $X | cut -f 2 -d ' '` SRV_W=$(($SRV_W + $SRV_W_ADD)) if [ $SRV_RND -lt $SRV_W ]; then SERVERNAME=`echo $X | cut -f 3 -d ' '` echo "$X" >> serverlist_tried break else SRV_RND=$(($SRV_RND - $SRV_W)) fi ;; esac done < serverlist } # Take a list of ${oldhash}|${newhash} and output a list of needed patches, # i.e., those for which we have ${oldhash} and don't have ${newhash}. fetch_make_patchlist () { grep -vE "^([0-9a-f]{64})\|\1$" | tr '|' ' ' | while read X Y; do if [ -f "files/${Y}.gz" ] || [ ! -f "files/${X}.gz" ]; then continue fi echo "${X}|${Y}" done | sort -u } # Print user-friendly progress statistics fetch_progress () { LNC=0 while read x; do LNC=$(($LNC + 1)) if [ $(($LNC % 10)) = 0 ]; then echo -n $LNC elif [ $(($LNC % 2)) = 0 ]; then echo -n . fi done echo -n " " } # Function for asking the user if everything is ok continuep () { while read -p "Does this look reasonable (y/n)? " CONTINUE; do case "${CONTINUE}" in y*) return 0 ;; n*) return 1 ;; esac done } # Initialize the working directory workdir_init () { mkdir -p files touch tINDEX.present } # Check that we have a public key with an appropriate hash, or # fetch the key if it doesn't exist. Returns 1 if the key has # not yet been fetched. fetch_key () { if [ -r pub.ssl ] && [ `${SHA256} -q pub.ssl` = ${KEYPRINT} ]; then return 0 fi echo -n "Fetching public key from ${SERVERNAME}... " rm -f pub.ssl fetch ${QUIETFLAG} http://${SERVERNAME}/${FETCHDIR}/pub.ssl \ 2>${QUIETREDIR} || true if ! [ -r pub.ssl ]; then echo "failed." return 1 fi if ! [ `${SHA256} -q pub.ssl` = ${KEYPRINT} ]; then echo "key has incorrect hash." rm -f pub.ssl return 1 fi echo "done." } # Fetch metadata signature, aka "tag". fetch_tag () { echo -n "Fetching metadata signature " echo ${NDEBUG} "for ${RELNUM} from ${SERVERNAME}... " rm -f latest.ssl fetch ${QUIETFLAG} http://${SERVERNAME}/${FETCHDIR}/latest.ssl \ 2>${QUIETREDIR} || true if ! [ -r latest.ssl ]; then echo "failed." return 1 fi openssl rsautl -pubin -inkey pub.ssl -verify \ < latest.ssl > tag.new 2>${QUIETREDIR} || true rm latest.ssl if ! [ `wc -l < tag.new` = 1 ] || ! grep -qE \ "^freebsd-update\|${ARCH}\|${RELNUM}\|[0-9]+\|[0-9a-f]{64}\|[0-9]{10}" \ tag.new; then echo "invalid signature." return 1 fi echo "done." RELPATCHNUM=`cut -f 4 -d '|' < tag.new` TINDEXHASH=`cut -f 5 -d '|' < tag.new` EOLTIME=`cut -f 6 -d '|' < tag.new` } # Sanity-check the patch number in a tag, to make sure that we're not # going to "update" backwards and to prevent replay attacks. fetch_tagsanity () { # Check that we're not going to move from -pX to -pY with Y < X. RELPX=`uname -r | sed -E 's,.*-,,'` if echo ${RELPX} | grep -qE '^p[0-9]+$'; then RELPX=`echo ${RELPX} | cut -c 2-` else RELPX=0 fi if [ "${RELPATCHNUM}" -lt "${RELPX}" ]; then echo echo -n "Files on mirror (${RELNUM}-p${RELPATCHNUM})" echo " appear older than what" echo "we are currently running (`uname -r`)!" echo "Cowardly refusing to proceed any further." 
return 1 fi # If "tag" exists and corresponds to ${RELNUM}, make sure that # it contains a patch number <= RELPATCHNUM, in order to protect # against rollback (replay) attacks. if [ -f tag ] && grep -qE \ "^freebsd-update\|${ARCH}\|${RELNUM}\|[0-9]+\|[0-9a-f]{64}\|[0-9]{10}" \ tag; then LASTRELPATCHNUM=`cut -f 4 -d '|' < tag` if [ "${RELPATCHNUM}" -lt "${LASTRELPATCHNUM}" ]; then echo echo -n "Files on mirror (${RELNUM}-p${RELPATCHNUM})" echo " are older than the" echo -n "most recently seen updates" echo " (${RELNUM}-p${LASTRELPATCHNUM})." echo "Cowardly refusing to proceed any further." return 1 fi fi } # Fetch metadata index file fetch_metadata_index () { echo ${NDEBUG} "Fetching metadata index... " rm -f ${TINDEXHASH} fetch ${QUIETFLAG} http://${SERVERNAME}/${FETCHDIR}/t/${TINDEXHASH} 2>${QUIETREDIR} if ! [ -f ${TINDEXHASH} ]; then echo "failed." return 1 fi if [ `${SHA256} -q ${TINDEXHASH}` != ${TINDEXHASH} ]; then echo "update metadata index corrupt." return 1 fi echo "done." } # Print an error message about signed metadata being bogus. fetch_metadata_bogus () { echo echo "The update metadata$1 is correctly signed, but" echo "failed an integrity check." echo "Cowardly refusing to proceed any further." return 1 } # Construct tINDEX.new by merging the lines named in $1 from ${TINDEXHASH} # with the lines not named in $@ from tINDEX.present (if that file exists). fetch_metadata_index_merge () { for METAFILE in $@; do if [ `grep -E "^${METAFILE}\|" ${TINDEXHASH} | wc -l` \ -ne 1 ]; then fetch_metadata_bogus " index" return 1 fi grep -E "${METAFILE}\|" ${TINDEXHASH} done | sort > tINDEX.wanted if [ -f tINDEX.present ]; then join -t '|' -v 2 tINDEX.wanted tINDEX.present | sort -m - tINDEX.wanted > tINDEX.new rm tINDEX.wanted else mv tINDEX.wanted tINDEX.new fi } # Sanity check all the lines of tINDEX.new. Even if more metadata lines # are added by future versions of the server, this won't cause problems, # since the only lines which appear in tINDEX.new are the ones which we # specifically grepped out of ${TINDEXHASH}. fetch_metadata_index_sanity () { if grep -qvE '^[0-9A-Z.-]+\|[0-9a-f]{64}$' tINDEX.new; then fetch_metadata_bogus " index" return 1 fi } # Sanity check the metadata file $1. fetch_metadata_sanity () { # Some aliases to save space later: ${P} is a character which can # appear in a path; ${M} is the four numeric metadata fields; and # ${H} is a sha256 hash. P="[-+./:=,%@_[~[:alnum:]]" M="[0-9]+\|[0-9]+\|[0-9]+\|[0-9]+" H="[0-9a-f]{64}" # Check that the first four fields make sense. if gunzip -c < files/$1.gz | grep -qvE "^[a-z]+\|[0-9a-z-]+\|${P}+\|[fdL-]\|"; then fetch_metadata_bogus "" return 1 fi # Remove the first three fields. gunzip -c < files/$1.gz | cut -f 4- -d '|' > sanitycheck.tmp # Sanity check entries with type 'f' if grep -E '^f' sanitycheck.tmp | grep -qvE "^f\|${M}\|${H}\|${P}*\$"; then fetch_metadata_bogus "" return 1 fi # Sanity check entries with type 'd' if grep -E '^d' sanitycheck.tmp | grep -qvE "^d\|${M}\|\|\$"; then fetch_metadata_bogus "" return 1 fi # Sanity check entries with type 'L' if grep -E '^L' sanitycheck.tmp | grep -qvE "^L\|${M}\|${P}*\|\$"; then fetch_metadata_bogus "" return 1 fi # Sanity check entries with type '-' if grep -E '^-' sanitycheck.tmp | grep -qvE "^-\|\|\|\|\|\|"; then fetch_metadata_bogus "" return 1 fi # Clean up rm sanitycheck.tmp } # Fetch the metadata index and metadata files listed in $@, # taking advantage of metadata patches where possible. 
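# As implemented below, metadata patches are named ${oldhash}-${newhash}.gz
# and fetched from ${FETCHDIR}/tp/ on the server; any metadata file which
# cannot be produced by patching a file we already have is fetched whole
# from ${FETCHDIR}/m/ instead.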
fetch_metadata () { fetch_metadata_index || return 1 fetch_metadata_index_merge $@ || return 1 fetch_metadata_index_sanity || return 1 # Generate a list of wanted metadata patches join -t '|' -o 1.2,2.2 tINDEX.present tINDEX.new | fetch_make_patchlist > patchlist if [ -s patchlist ]; then # Attempt to fetch metadata patches echo -n "Fetching `wc -l < patchlist | tr -d ' '` " echo ${NDEBUG} "metadata patches.${DDSTATS}" tr '|' '-' < patchlist | lam -s "${FETCHDIR}/tp/" - -s ".gz" | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress echo "done." # Attempt to apply metadata patches echo -n "Applying metadata patches... " tr '|' ' ' < patchlist | while read X Y; do if [ ! -f "${X}-${Y}.gz" ]; then continue; fi gunzip -c < ${X}-${Y}.gz > diff gunzip -c < files/${X}.gz > diff-OLD # Figure out which lines are being added and removed grep -E '^-' diff | cut -c 2- | while read PREFIX; do look "${PREFIX}" diff-OLD done | sort > diff-rm grep -E '^\+' diff | cut -c 2- > diff-add # Generate the new file comm -23 diff-OLD diff-rm | sort - diff-add > diff-NEW if [ `${SHA256} -q diff-NEW` = ${Y} ]; then mv diff-NEW files/${Y} gzip -n files/${Y} else mv diff-NEW ${Y}.bad fi rm -f ${X}-${Y}.gz diff rm -f diff-OLD diff-NEW diff-add diff-rm done 2>${QUIETREDIR} echo "done." fi # Update metadata without patches cut -f 2 -d '|' < tINDEX.new | while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done | sort -u > filelist if [ -s filelist ]; then echo -n "Fetching `wc -l < filelist | tr -d ' '` " echo ${NDEBUG} "metadata files... " lam -s "${FETCHDIR}/m/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${QUIETREDIR} while read Y; do if ! [ -f ${Y}.gz ]; then echo "failed." return 1 fi if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "metadata is corrupt." return 1 fi done < filelist echo "done." fi # Sanity-check the metadata files. cut -f 2 -d '|' tINDEX.new > filelist while read X; do fetch_metadata_sanity ${X} || return 1 done < filelist # Remove files which are no longer needed cut -f 2 -d '|' tINDEX.present | sort > oldfiles cut -f 2 -d '|' tINDEX.new | sort | comm -13 - oldfiles | lam -s "files/" - -s ".gz" | xargs rm -f rm patchlist filelist oldfiles rm ${TINDEXHASH} # We're done! mv tINDEX.new tINDEX.present mv tag.new tag return 0 } # Extract a subset of a downloaded metadata file containing only the parts # which are listed in COMPONENTS. fetch_filter_metadata_components () { METAHASH=`look "$1|" tINDEX.present | cut -f 2 -d '|'` gunzip -c < files/${METAHASH}.gz > $1.all # Fish out the lines belonging to components we care about. for C in ${COMPONENTS}; do look "`echo ${C} | tr '/' '|'`|" $1.all done > $1 # Remove temporary file. rm $1.all } # Generate a filtered version of the metadata file $1 from the downloaded # file, by fishing out the lines corresponding to components we're trying # to keep updated, and then removing lines corresponding to paths we want # to ignore. fetch_filter_metadata () { # Fish out the lines belonging to components we care about. fetch_filter_metadata_components $1 # Canonicalize directory names by removing any trailing / in # order to avoid listing directories multiple times if they # belong to multiple components. Turning "/" into "" doesn't # matter, since we add a leading "/" when we use paths later. cut -f 3- -d '|' $1 | sed -e 's,/|d|,|d|,' | sed -e 's,/|-|,|-|,' | sort -u > $1.tmp # Figure out which lines to ignore and remove them. 
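	# (For example, a hypothetical ${IGNOREPATHS} entry of "/usr/src"
	# would drop every line whose path begins with /usr/src; each entry
	# is applied as a regular expression anchored at the start of the
	# path.)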
for X in ${IGNOREPATHS}; do grep -E "^${X}" $1.tmp done | sort -u | comm -13 - $1.tmp > $1 # Remove temporary files. rm $1.tmp } # Filter the metadata file $1 by adding lines with "/boot/$2" # replaced by ${KERNELDIR} (which is `sysctl -n kern.bootfile` minus the # trailing "/kernel"); and if "/boot/$2" does not exist, remove # the original lines which start with that. # Put another way: Deal with the fact that the FOO kernel is sometimes # installed in /boot/FOO/ and is sometimes installed elsewhere. fetch_filter_kernel_names () { grep ^/boot/$2 $1 | sed -e "s,/boot/$2,${KERNELDIR},g" | sort - $1 > $1.tmp mv $1.tmp $1 if ! [ -d /boot/$2 ]; then grep -v ^/boot/$2 $1 > $1.tmp mv $1.tmp $1 fi } # For all paths appearing in $1 or $3, inspect the system # and generate $2 describing what is currently installed. fetch_inspect_system () { # No errors yet... rm -f .err # Tell the user why his disk is suddenly making lots of noise echo -n "Inspecting system... " # Generate list of files to inspect cat $1 $3 | cut -f 1 -d '|' | sort -u > filelist # Examine each file and output lines of the form # /path/to/file|type|device-inum|user|group|perm|flags|value # sorted by device and inode number. while read F; do # If the symlink/file/directory does not exist, record this. if ! [ -e ${BASEDIR}/${F} ]; then echo "${F}|-||||||" continue fi if ! [ -r ${BASEDIR}/${F} ]; then echo "Cannot read file: ${BASEDIR}/${F}" \ >/dev/stderr touch .err return 1 fi # Otherwise, output an index line. if [ -L ${BASEDIR}/${F} ]; then echo -n "${F}|L|" stat -n -f '%d-%i|%u|%g|%Mp%Lp|%Of|' ${BASEDIR}/${F}; readlink ${BASEDIR}/${F}; elif [ -f ${BASEDIR}/${F} ]; then echo -n "${F}|f|" stat -n -f '%d-%i|%u|%g|%Mp%Lp|%Of|' ${BASEDIR}/${F}; sha256 -q ${BASEDIR}/${F}; elif [ -d ${BASEDIR}/${F} ]; then echo -n "${F}|d|" stat -f '%d-%i|%u|%g|%Mp%Lp|%Of|' ${BASEDIR}/${F}; else echo "Unknown file type: ${BASEDIR}/${F}" \ >/dev/stderr touch .err return 1 fi done < filelist | sort -k 3,3 -t '|' > $2.tmp rm filelist # Check if an error occurred during system inspection if [ -f .err ]; then return 1 fi # Convert to the form # /path/to/file|type|user|group|perm|flags|value|hlink # by resolving identical device and inode numbers into hard links. cut -f 1,3 -d '|' $2.tmp | sort -k 1,1 -t '|' | sort -s -u -k 2,2 -t '|' | join -1 2 -2 3 -t '|' - $2.tmp | awk -F \| -v OFS=\| \ '{ if (($2 == $3) || ($4 == "-")) print $3,$4,$5,$6,$7,$8,$9,"" else print $3,$4,$5,$6,$7,$8,$9,$2 }' | sort > $2 rm $2.tmp # We're finished looking around echo "done." } # For any paths matching ${MERGECHANGES}, compare $1 and $2 and find any # files which differ; generate $3 containing these paths and the old hashes. fetch_filter_mergechanges () { # Pull out the paths and hashes of the files matching ${MERGECHANGES}. for F in $1 $2; do for X in ${MERGECHANGES}; do grep -E "^${X}" ${F} done | cut -f 1,2,7 -d '|' | sort > ${F}-values done # Any line in $2-values which doesn't appear in $1-values and is a # file means that we should list the path in $3. comm -13 $1-values $2-values | fgrep '|f|' | cut -f 1 -d '|' > $2-paths # For each path, pull out one (and only one!) entry from $1-values. # Note that we cannot distinguish which "old" version the user made # changes to; but hopefully any changes which occur due to security # updates will exist in both the "new" version and the version which # the user has installed, so the merging will still work. 
	while read X; do
		look "${X}|" $1-values | head -1
	done < $2-paths > $3

	# Clean up
	rm $1-values $2-values $2-paths
}

# For any paths matching ${UPDATEIFUNMODIFIED}, remove lines from $[123]
# which correspond to lines in $2 with hashes not matching $1 or $3, unless
# the paths are listed in $4.  For entries in $2 marked "not present"
# (aka. type -), remove lines from $[123] unless there is a corresponding
# entry in $1.
fetch_filter_unmodified_notpresent () {
	# Figure out which lines of $1 and $3 correspond to bits which
	# should only be updated if they haven't changed, and fish out
	# the (path, type, value) tuples.
	# NOTE: We don't consider a file to be "modified" if it matches
	# the hash from $3.
	for X in ${UPDATEIFUNMODIFIED}; do
		grep -E "^${X}" $1
		grep -E "^${X}" $3
	done |
	    cut -f 1,2,7 -d '|' |
	    sort > $1-values

	# Do the same for $2.
	for X in ${UPDATEIFUNMODIFIED}; do
		grep -E "^${X}" $2
	done |
	    cut -f 1,2,7 -d '|' |
	    sort > $2-values

	# Any entry in $2-values which is not in $1-values corresponds to
	# a path which we need to remove from $1, $2, and $3, unless that
	# path appears in $4.
	comm -13 $1-values $2-values |
	    sort -t '|' -k 1,1 > mlines.tmp
	cut -f 1 -d '|' $4 |
	    sort |
	    join -v 2 -t '|' - mlines.tmp |
	    sort > mlines
	rm $1-values $2-values mlines.tmp

	# Any lines in $2 which are not in $1 AND are "not present" lines
	# also belong in mlines.
	comm -13 $1 $2 |
	    cut -f 1,2,7 -d '|' |
	    fgrep '|-|' >> mlines

	# Remove lines from $1, $2, and $3
	for X in $1 $2 $3; do
		sort -t '|' -k 1,1 ${X} > ${X}.tmp
		cut -f 1 -d '|' < mlines |
		    sort |
		    join -v 2 -t '|' - ${X}.tmp |
		    sort > ${X}
		rm ${X}.tmp
	done

	# Store a list of the modified files, for future reference
	fgrep -v '|-|' mlines |
	    cut -f 1 -d '|' > modifiedfiles
	rm mlines
}

# For each entry in $1 of type -, remove any corresponding
# entry from $2 if ${ALLOWADD} != "yes".  Remove all entries
# of type - from $1.
fetch_filter_allowadd () {
	cut -f 1,2 -d '|' < $1 |
	    fgrep '|-' |
	    cut -f 1 -d '|' > filesnotpresent

	if [ ${ALLOWADD} != "yes" ]; then
		sort < $2 |
		    join -v 1 -t '|' - filesnotpresent |
		    sort > $2.tmp
		mv $2.tmp $2
	fi

	sort < $1 |
	    join -v 1 -t '|' - filesnotpresent |
	    sort > $1.tmp
	mv $1.tmp $1

	rm filesnotpresent
}

# If ${ALLOWDELETE} != "yes", then remove any entries from $1
# which don't correspond to entries in $2.
fetch_filter_allowdelete () {
	# Produce ${PATH}|${TYPE} lists
	for X in $1 $2; do
		cut -f 1-2 -d '|' < ${X} |
		    sort -u > ${X}.nodes
	done

	# Figure out which lines need to be removed from $1.
	if [ ${ALLOWDELETE} != "yes" ]; then
		comm -23 $1.nodes $2.nodes > $1.badnodes
	else
		: > $1.badnodes
	fi

	# Remove the relevant lines from $1
	while read X; do
		look "${X}|" $1
	done < $1.badnodes |
	    comm -13 - $1 > $1.tmp
	mv $1.tmp $1

	rm $1.badnodes $1.nodes $2.nodes
}

# If ${KEEPMODIFIEDMETADATA} == "yes", then for each entry in $2
# with metadata not matching any entry in $1, replace the corresponding
# line of $3 with one having the same metadata as the entry in $2.
fetch_filter_modified_metadata () {
	# Fish out the metadata from $1 and $2
	for X in $1 $2; do
		cut -f 1-6 -d '|' < ${X} > ${X}.metadata
	done

	# Find the metadata we need to keep
	if [ ${KEEPMODIFIEDMETADATA} = "yes" ]; then
		comm -13 $1.metadata $2.metadata > keepmeta
	else
		: > keepmeta
	fi

	# Extract the lines which we need to remove from $3, and
	# construct the lines which we need to add to $3.
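	# Each replacement line keeps fields 1-6 (path, type, owner, group,
	# permissions, flags) from the locally-modified entry in keepmeta
	# and appends fields 7 and up (hash or link target) from the
	# matching line of $3, so the new contents are installed with the
	# metadata the system already has.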
: > $3.remove : > $3.add while read LINE; do NODE=`echo "${LINE}" | cut -f 1-2 -d '|'` look "${NODE}|" $3 >> $3.remove look "${NODE}|" $3 | cut -f 7- -d '|' | lam -s "${LINE}|" - >> $3.add done < keepmeta # Remove the specified lines and add the new lines. sort $3.remove | comm -13 - $3 | sort -u - $3.add > $3.tmp mv $3.tmp $3 rm keepmeta $1.metadata $2.metadata $3.add $3.remove } # Remove lines from $1 and $2 which are identical; # no need to update a file if it isn't changing. fetch_filter_uptodate () { comm -23 $1 $2 > $1.tmp comm -13 $1 $2 > $2.tmp mv $1.tmp $1 mv $2.tmp $2 } # Fetch any "clean" old versions of files we need for merging changes. fetch_files_premerge () { # We only need to do anything if $1 is non-empty. if [ -s $1 ]; then # Tell the user what we're doing echo -n "Fetching files from ${OLDRELNUM} for merging... " # List of files wanted fgrep '|f|' < $1 | cut -f 3 -d '|' | sort -u > files.wanted # Only fetch the files we don't already have while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done < files.wanted > filelist # Actually fetch them lam -s "${OLDFETCHDIR}/f/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${QUIETREDIR} # Make sure we got them all, and move them into /files/ while read Y; do if ! [ -f ${Y}.gz ]; then echo "failed." return 1 fi if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "${Y} has incorrect hash." return 1 fi done < filelist echo "done." # Clean up rm filelist files.wanted fi } # Prepare to fetch files: Generate a list of the files we need, # copy the unmodified files we have into /files/, and generate # a list of patches to download. fetch_files_prepare () { # Tell the user why his disk is suddenly making lots of noise echo -n "Preparing to download files... " # Reduce indices to ${PATH}|${HASH} pairs for X in $1 $2 $3; do cut -f 1,2,7 -d '|' < ${X} | fgrep '|f|' | cut -f 1,3 -d '|' | sort > ${X}.hashes done # List of files wanted cut -f 2 -d '|' < $3.hashes | sort -u | while read HASH; do if ! [ -f files/${HASH}.gz ]; then echo ${HASH} fi done > files.wanted # Generate a list of unmodified files comm -12 $1.hashes $2.hashes | sort -k 1,1 -t '|' > unmodified.files # Copy all files into /files/. We only need the unmodified files # for use in patching; but we'll want all of them if the user asks # to rollback the updates later. while read LINE; do F=`echo "${LINE}" | cut -f 1 -d '|'` HASH=`echo "${LINE}" | cut -f 2 -d '|'` # Skip files we already have. if [ -f files/${HASH}.gz ]; then continue fi # Make sure the file hasn't changed. cp "${BASEDIR}/${F}" tmpfile if [ `sha256 -q tmpfile` != ${HASH} ]; then echo echo "File changed while FreeBSD Update running: ${F}" return 1 fi # Place the file into storage. gzip -c < tmpfile > files/${HASH}.gz rm tmpfile done < $2.hashes # Produce a list of patches to download sort -k 1,1 -t '|' $3.hashes | join -t '|' -o 2.2,1.2 - unmodified.files | fetch_make_patchlist > patchlist # Garbage collect rm unmodified.files $1.hashes $2.hashes $3.hashes # We don't need the list of possible old files any more. rm $1 # We're finished making noise echo "done." } # Fetch files. fetch_files () { # Attempt to fetch patches if [ -s patchlist ]; then echo -n "Fetching `wc -l < patchlist | tr -d ' '` " echo ${NDEBUG} "patches.${DDSTATS}" tr '|' '-' < patchlist | lam -s "${PATCHDIR}/" - | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress echo "done." # Attempt to apply patches echo -n "Applying patches... 
" tr '|' ' ' < patchlist | while read X Y; do if [ ! -f "${X}-${Y}" ]; then continue; fi gunzip -c < files/${X}.gz > OLD bspatch OLD NEW ${X}-${Y} if [ `${SHA256} -q NEW` = ${Y} ]; then mv NEW files/${Y} gzip -n files/${Y} fi rm -f diff OLD NEW ${X}-${Y} done 2>${QUIETREDIR} echo "done." fi # Download files which couldn't be generate via patching while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done < files.wanted > filelist if [ -s filelist ]; then echo -n "Fetching `wc -l < filelist | tr -d ' '` " echo ${NDEBUG} "files... " lam -s "${FETCHDIR}/f/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress while read Y; do if ! [ -f ${Y}.gz ]; then echo "failed." return 1 fi if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "${Y} has incorrect hash." return 1 fi done < filelist echo "done." fi # Clean up rm files.wanted filelist patchlist } # Create and populate install manifest directory; and report what updates # are available. fetch_create_manifest () { # If we have an existing install manifest, nuke it. if [ -L "${BDHASH}-install" ]; then rm -r ${BDHASH}-install/ rm ${BDHASH}-install fi # Report to the user if any updates were avoided due to local changes if [ -s modifiedfiles ]; then cat - modifiedfiles <<- EOF | ${PAGER} The following files are affected by updates. No changes have been downloaded, however, because the files have been modified locally: EOF fi rm modifiedfiles # If no files will be updated, tell the user and exit if ! [ -s INDEX-PRESENT ] && ! [ -s INDEX-NEW ]; then rm INDEX-PRESENT INDEX-NEW echo echo -n "No updates needed to update system to " echo "${RELNUM}-p${RELPATCHNUM}." return fi # Divide files into (a) removed files, (b) added files, and # (c) updated files. cut -f 1 -d '|' < INDEX-PRESENT | sort > INDEX-PRESENT.flist cut -f 1 -d '|' < INDEX-NEW | sort > INDEX-NEW.flist comm -23 INDEX-PRESENT.flist INDEX-NEW.flist > files.removed comm -13 INDEX-PRESENT.flist INDEX-NEW.flist > files.added comm -12 INDEX-PRESENT.flist INDEX-NEW.flist > files.updated rm INDEX-PRESENT.flist INDEX-NEW.flist # Report removed files, if any if [ -s files.removed ]; then cat - files.removed <<- EOF | ${PAGER} The following files will be removed as part of updating to ${RELNUM}-p${RELPATCHNUM}: EOF fi rm files.removed # Report added files, if any if [ -s files.added ]; then cat - files.added <<- EOF | ${PAGER} The following files will be added as part of updating to ${RELNUM}-p${RELPATCHNUM}: EOF fi rm files.added # Report updated files, if any if [ -s files.updated ]; then cat - files.updated <<- EOF | ${PAGER} The following files will be updated as part of updating to ${RELNUM}-p${RELPATCHNUM}: EOF fi rm files.updated # Create a directory for the install manifest. MDIR=`mktemp -d install.XXXXXX` || return 1 # Populate it mv INDEX-PRESENT ${MDIR}/INDEX-OLD mv INDEX-NEW ${MDIR}/INDEX-NEW # Link it into place ln -s ${MDIR} ${BDHASH}-install } # Warn about any upcoming EoL fetch_warn_eol () { # What's the current time? NOWTIME=`date "+%s"` # When did we last warn about the EoL date? if [ -f lasteolwarn ]; then LASTWARN=`cat lasteolwarn` else LASTWARN=`expr ${NOWTIME} - 63072000` fi # If the EoL time is past, warn. if [ ${EOLTIME} -lt ${NOWTIME} ]; then echo cat <<-EOF WARNING: `uname -sr` HAS PASSED ITS END-OF-LIFE DATE. Any security issues discovered after `date -r ${EOLTIME}` will not have been corrected. 
EOF return 1 fi # Figure out how long it has been since we last warned about the # upcoming EoL, and how much longer we have left. SINCEWARN=`expr ${NOWTIME} - ${LASTWARN}` TIMELEFT=`expr ${EOLTIME} - ${NOWTIME}` # Don't warn if the EoL is more than 3 months away if [ ${TIMELEFT} -gt 7884000 ]; then return 0 fi # Don't warn if the time remaining is more than 3 times the time # since the last warning. if [ ${TIMELEFT} -gt `expr ${SINCEWARN} \* 3` ]; then return 0 fi # Figure out what time units to use. if [ ${TIMELEFT} -lt 604800 ]; then UNIT="day" SIZE=86400 elif [ ${TIMELEFT} -lt 2678400 ]; then UNIT="week" SIZE=604800 else UNIT="month" SIZE=2678400 fi # Compute the right number of units NUM=`expr ${TIMELEFT} / ${SIZE}` if [ ${NUM} != 1 ]; then UNIT="${UNIT}s" fi # Print the warning echo cat <<-EOF WARNING: `uname -sr` is approaching its End-of-Life date. It is strongly recommended that you upgrade to a newer release within the next ${NUM} ${UNIT}. EOF # Update the stored time of last warning echo ${NOWTIME} > lasteolwarn } # Do the actual work involved in "fetch" / "cron". fetch_run () { workdir_init || return 1 # Prepare the mirror list. fetch_pick_server_init && fetch_pick_server # Try to fetch the public key until we run out of servers. while ! fetch_key; do fetch_pick_server || return 1 done # Try to fetch the metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done fetch_tagsanity || return 1 # Fetch the latest INDEX-NEW and INDEX-OLD files. fetch_metadata INDEX-NEW INDEX-OLD || return 1 # Generate filtered INDEX-NEW and INDEX-OLD files containing only # the lines which (a) belong to components we care about, and (b) # don't correspond to paths we're explicitly ignoring. fetch_filter_metadata INDEX-NEW || return 1 fetch_filter_metadata INDEX-OLD || return 1 # Translate /boot/${KERNCONF} into ${KERNELDIR} fetch_filter_kernel_names INDEX-NEW ${KERNCONF} fetch_filter_kernel_names INDEX-OLD ${KERNCONF} # For all paths appearing in INDEX-OLD or INDEX-NEW, inspect the # system and generate an INDEX-PRESENT file. fetch_inspect_system INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Based on ${UPDATEIFUNMODIFIED}, remove lines from INDEX-* which # correspond to lines in INDEX-PRESENT with hashes not appearing # in INDEX-OLD or INDEX-NEW. Also remove lines where the entry in # INDEX-PRESENT has type - and there isn't a corresponding entry in # INDEX-OLD with type -. fetch_filter_unmodified_notpresent \ INDEX-OLD INDEX-PRESENT INDEX-NEW /dev/null # For each entry in INDEX-PRESENT of type -, remove any corresponding # entry from INDEX-NEW if ${ALLOWADD} != "yes". Remove all entries # of type - from INDEX-PRESENT. fetch_filter_allowadd INDEX-PRESENT INDEX-NEW # If ${ALLOWDELETE} != "yes", then remove any entries from # INDEX-PRESENT which don't correspond to entries in INDEX-NEW. fetch_filter_allowdelete INDEX-PRESENT INDEX-NEW # If ${KEEPMODIFIEDMETADATA} == "yes", then for each entry in # INDEX-PRESENT with metadata not matching any entry in INDEX-OLD, # replace the corresponding line of INDEX-NEW with one having the # same metadata as the entry in INDEX-PRESENT. fetch_filter_modified_metadata INDEX-OLD INDEX-PRESENT INDEX-NEW # Remove lines from INDEX-PRESENT and INDEX-NEW which are identical; # no need to update a file if it isn't changing. 
fetch_filter_uptodate INDEX-PRESENT INDEX-NEW # Prepare to fetch files: Generate a list of the files we need, # copy the unmodified files we have into /files/, and generate # a list of patches to download. fetch_files_prepare INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Fetch files. fetch_files || return 1 # Create and populate install manifest directory; and report what # updates are available. fetch_create_manifest || return 1 # Warn about any upcoming EoL fetch_warn_eol || return 1 } # If StrictComponents is not "yes", generate a new components list # with only the components which appear to be installed. upgrade_guess_components () { if [ "${STRICTCOMPONENTS}" = "no" ]; then # Generate filtered INDEX-ALL with only the components listed # in COMPONENTS. fetch_filter_metadata_components $1 || return 1 # Tell the user why his disk is suddenly making lots of noise echo -n "Inspecting system... " # Look at the files on disk, and assume that a component is # supposed to be present if it is more than half-present. cut -f 1-3 -d '|' < INDEX-ALL | tr '|' ' ' | while read C S F; do if [ -e ${BASEDIR}/${F} ]; then echo "+ ${C}|${S}" fi echo "= ${C}|${S}" done | sort | uniq -c | sed -E 's,^ +,,' > compfreq grep ' = ' compfreq | cut -f 1,3 -d ' ' | sort -k 2,2 -t ' ' > compfreq.total grep ' + ' compfreq | cut -f 1,3 -d ' ' | sort -k 2,2 -t ' ' > compfreq.present join -t ' ' -1 2 -2 2 compfreq.present compfreq.total | while read S P T; do if [ ${T} -ne 0 -a ${P} -gt `expr ${T} / 2` ]; then echo ${S} fi done > comp.present cut -f 2 -d ' ' < compfreq.total > comp.total rm INDEX-ALL compfreq compfreq.total compfreq.present # We're done making noise. echo "done." # Sometimes the kernel isn't installed where INDEX-ALL # thinks that it should be: In particular, it is often in # /boot/kernel instead of /boot/GENERIC or /boot/SMP. To # deal with this, if "kernel|X" is listed in comp.total # (i.e., is a component which would be upgraded if it is # found to be present) we will add it to comp.present. # If "kernel|" is in comp.total but "kernel|X" is # not, we print a warning -- the user is running a kernel # which isn't part of the release. KCOMP=`echo ${KERNCONF} | tr 'A-Z' 'a-z'` grep -E "^kernel\|${KCOMP}\$" comp.total >> comp.present if grep -qE "^kernel\|" comp.total && ! grep -qE "^kernel\|${KCOMP}\$" comp.total; then cat <<-EOF WARNING: This system is running a "${KCOMP}" kernel, which is not a kernel configuration distributed as part of FreeBSD ${RELNUM}. This kernel will not be updated: you MUST update the kernel manually before running "$0 install". EOF fi # Re-sort the list of installed components and generate # the list of non-installed components. sort -u < comp.present > comp.present.tmp mv comp.present.tmp comp.present comm -13 comp.present comp.total > comp.absent # Ask the user to confirm that what we have is correct. To # reduce user confusion, translate "X|Y" back to "X/Y" (as # subcomponents must be listed in the configuration file). echo echo -n "The following components of FreeBSD " echo "seem to be installed:" tr '|' '/' < comp.present | fmt -72 echo echo -n "The following components of FreeBSD " echo "do not seem to be installed:" tr '|' '/' < comp.absent | fmt -72 echo continuep || return 1 echo # Suck the generated list of components into ${COMPONENTS}. # Note that comp.present.tmp is used due to issues with # pipelines and setting variables. 
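		# (In sh, each stage of a pipeline may run in a subshell,
		# so a "while read" loop fed directly by a pipe could not
		# modify COMPONENTS in this shell; reading from a temporary
		# file keeps the loop in the current shell.)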
COMPONENTS="" tr '|' '/' < comp.present > comp.present.tmp while read C; do COMPONENTS="${COMPONENTS} ${C}" done < comp.present.tmp # Delete temporary files rm comp.present comp.present.tmp comp.absent comp.total fi } # If StrictComponents is not "yes", COMPONENTS contains an entry # corresponding to the currently running kernel, and said kernel # does not exist in the new release, add "kernel/generic" to the # list of components. upgrade_guess_new_kernel () { if [ "${STRICTCOMPONENTS}" = "no" ]; then # Grab the unfiltered metadata file. METAHASH=`look "$1|" tINDEX.present | cut -f 2 -d '|'` gunzip -c < files/${METAHASH}.gz > $1.all # If "kernel/${KCOMP}" is in ${COMPONENTS} and that component # isn't in $1.all, we need to add kernel/generic. for C in ${COMPONENTS}; do if [ ${C} = "kernel/${KCOMP}" ] && ! grep -qE "^kernel\|${KCOMP}\|" $1.all; then COMPONENTS="${COMPONENTS} kernel/generic" NKERNCONF="GENERIC" cat <<-EOF WARNING: This system is running a "${KCOMP}" kernel, which is not a kernel configuration distributed as part of FreeBSD ${RELNUM}. As part of upgrading to FreeBSD ${RELNUM}, this kernel will be replaced with a "generic" kernel. EOF continuep || return 1 fi done # Don't need this any more... rm $1.all fi } # Convert INDEX-OLD (last release) and INDEX-ALL (new release) into # INDEX-OLD and INDEX-NEW files (in the sense of normal upgrades). upgrade_oldall_to_oldnew () { # For each ${F}|... which appears in INDEX-ALL but does not appear # in INDEX-OLD, add ${F}|-|||||| to INDEX-OLD. cut -f 1 -d '|' < $1 | sort -u > $1.paths cut -f 1 -d '|' < $2 | sort -u | comm -13 $1.paths - | lam - -s "|-||||||" | sort - $1 > $1.tmp mv $1.tmp $1 # Remove lines from INDEX-OLD which also appear in INDEX-ALL comm -23 $1 $2 > $1.tmp mv $1.tmp $1 # Remove lines from INDEX-ALL which have a file name not appearing # anywhere in INDEX-OLD (since these must be files which haven't # changed -- if they were new, there would be an entry of type "-"). cut -f 1 -d '|' < $1 | sort -u > $1.paths sort -k 1,1 -t '|' < $2 | join -t '|' - $1.paths | sort > $2.tmp rm $1.paths mv $2.tmp $2 # Rename INDEX-ALL to INDEX-NEW. mv $2 $3 } # Helper for upgrade_merge: Return zero true iff the two files differ only # in the contents of their RCS tags. samef () { X=`sed -E 's/\\$FreeBSD.*\\$/\$FreeBSD\$/' < $1 | ${SHA256}` Y=`sed -E 's/\\$FreeBSD.*\\$/\$FreeBSD\$/' < $2 | ${SHA256}` if [ $X = $Y ]; then return 0; else return 1; fi } # From the list of "old" files in $1, merge changes in $2 with those in $3, # and update $3 to reflect the hashes of merged files. upgrade_merge () { # We only need to do anything if $1 is non-empty. if [ -s $1 ]; then cut -f 1 -d '|' $1 | sort > $1-paths # Create staging area for merging files rm -rf merge/ while read F; do D=`dirname ${F}` mkdir -p merge/old/${D} mkdir -p merge/${OLDRELNUM}/${D} mkdir -p merge/${RELNUM}/${D} mkdir -p merge/new/${D} done < $1-paths # Copy in files while read F; do # Currently installed file V=`look "${F}|" $2 | cut -f 7 -d '|'` gunzip < files/${V}.gz > merge/old/${F} # Old release if look "${F}|" $1 | fgrep -q "|f|"; then V=`look "${F}|" $1 | cut -f 3 -d '|'` gunzip < files/${V}.gz \ > merge/${OLDRELNUM}/${F} fi # New release if look "${F}|" $3 | cut -f 1,2,7 -d '|' | fgrep -q "|f|"; then V=`look "${F}|" $3 | cut -f 7 -d '|'` gunzip < files/${V}.gz \ > merge/${RELNUM}/${F} fi done < $1-paths # Attempt to automatically merge changes echo -n "Attempting to automatically merge " echo -n "changes in files..." 
: > failed.merges while read F; do # If the file doesn't exist in the new release, # the result of "merging changes" is having the file # not exist. if ! [ -f merge/${RELNUM}/${F} ]; then continue fi # If the file didn't exist in the old release, we're # going to throw away the existing file and hope that # the version from the new release is what we want. if ! [ -f merge/${OLDRELNUM}/${F} ]; then cp merge/${RELNUM}/${F} merge/new/${F} continue fi # Some files need special treatment. case ${F} in /etc/spwd.db | /etc/pwd.db | /etc/login.conf.db) # Don't merge these -- we're rebuild them # after updates are installed. cp merge/old/${F} merge/new/${F} ;; *) if ! diff3 -E -m -L "current version" \ -L "${OLDRELNUM}" -L "${RELNUM}" \ merge/old/${F} \ merge/${OLDRELNUM}/${F} \ merge/${RELNUM}/${F} \ > merge/new/${F} 2>/dev/null; then echo ${F} >> failed.merges fi ;; esac done < $1-paths echo " done." # Ask the user to handle any files which didn't merge. while read F; do # If the installed file differs from the version in # the old release only due to RCS tag expansion # then just use the version in the new release. if samef merge/old/${F} merge/${OLDRELNUM}/${F}; then cp merge/${RELNUM}/${F} merge/new/${F} continue fi cat <<-EOF The following file could not be merged automatically: ${F} Press Enter to edit this file in ${EDITOR} and resolve the conflicts manually... EOF read dummy files/${V}.gz echo "${F}|${V}" fi done < $1-paths > newhashes # Pull lines out from $3 which need to be updated to # reflect merged files. while read F; do look "${F}|" $3 done < $1-paths > $3-oldlines # Update lines to reflect merged files join -t '|' -o 1.1,1.2,1.3,1.4,1.5,1.6,2.2,1.8 \ $3-oldlines newhashes > $3-newlines # Remove old lines from $3 and add new lines. sort $3-oldlines | comm -13 - $3 | sort - $3-newlines > $3.tmp mv $3.tmp $3 # Clean up rm $1-paths newhashes $3-oldlines $3-newlines rm -rf merge/ fi # We're done with merging files. rm $1 } # Do the work involved in fetching upgrades to a new release upgrade_run () { workdir_init || return 1 # Prepare the mirror list. fetch_pick_server_init && fetch_pick_server # Try to fetch the public key until we run out of servers. while ! fetch_key; do fetch_pick_server || return 1 done # Try to fetch the metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done fetch_tagsanity || return 1 # Fetch the INDEX-OLD and INDEX-ALL. fetch_metadata INDEX-OLD INDEX-ALL || return 1 # If StrictComponents is not "yes", generate a new components list # with only the components which appear to be installed. upgrade_guess_components INDEX-ALL || return 1 # Generate filtered INDEX-OLD and INDEX-ALL files containing only # the components we want and without anything marked as "Ignore". fetch_filter_metadata INDEX-OLD || return 1 fetch_filter_metadata INDEX-ALL || return 1 # Merge the INDEX-OLD and INDEX-ALL files into INDEX-OLD. sort INDEX-OLD INDEX-ALL > INDEX-OLD.tmp mv INDEX-OLD.tmp INDEX-OLD rm INDEX-ALL # Adjust variables for fetching files from the new release. OLDRELNUM=${RELNUM} RELNUM=${TARGETRELEASE} OLDFETCHDIR=${FETCHDIR} FETCHDIR=${RELNUM}/${ARCH} # Try to fetch the NEW metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done # Fetch the new INDEX-ALL. 
fetch_metadata INDEX-ALL || return 1 # If StrictComponents is not "yes", COMPONENTS contains an entry # corresponding to the currently running kernel, and said kernel # does not exist in the new release, add "kernel/generic" to the # list of components. upgrade_guess_new_kernel INDEX-ALL || return 1 # Filter INDEX-ALL to contain only the components we want and without # anything marked as "Ignore". fetch_filter_metadata INDEX-ALL || return 1 # Convert INDEX-OLD (last release) and INDEX-ALL (new release) into # INDEX-OLD and INDEX-NEW files (in the sense of normal upgrades). upgrade_oldall_to_oldnew INDEX-OLD INDEX-ALL INDEX-NEW # Translate /boot/${KERNCONF} or /boot/${NKERNCONF} into ${KERNELDIR} fetch_filter_kernel_names INDEX-NEW ${NKERNCONF} fetch_filter_kernel_names INDEX-OLD ${KERNCONF} # For all paths appearing in INDEX-OLD or INDEX-NEW, inspect the # system and generate an INDEX-PRESENT file. fetch_inspect_system INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Based on ${MERGECHANGES}, generate a file tomerge-old with the # paths and hashes of old versions of files to merge. fetch_filter_mergechanges INDEX-OLD INDEX-PRESENT tomerge-old # Based on ${UPDATEIFUNMODIFIED}, remove lines from INDEX-* which # correspond to lines in INDEX-PRESENT with hashes not appearing # in INDEX-OLD or INDEX-NEW. Also remove lines where the entry in # INDEX-PRESENT has type - and there isn't a corresponding entry in # INDEX-OLD with type -. fetch_filter_unmodified_notpresent \ INDEX-OLD INDEX-PRESENT INDEX-NEW tomerge-old # For each entry in INDEX-PRESENT of type -, remove any corresponding # entry from INDEX-NEW if ${ALLOWADD} != "yes". Remove all entries # of type - from INDEX-PRESENT. fetch_filter_allowadd INDEX-PRESENT INDEX-NEW # If ${ALLOWDELETE} != "yes", then remove any entries from # INDEX-PRESENT which don't correspond to entries in INDEX-NEW. fetch_filter_allowdelete INDEX-PRESENT INDEX-NEW # If ${KEEPMODIFIEDMETADATA} == "yes", then for each entry in # INDEX-PRESENT with metadata not matching any entry in INDEX-OLD, # replace the corresponding line of INDEX-NEW with one having the # same metadata as the entry in INDEX-PRESENT. fetch_filter_modified_metadata INDEX-OLD INDEX-PRESENT INDEX-NEW # Remove lines from INDEX-PRESENT and INDEX-NEW which are identical; # no need to update a file if it isn't changing. fetch_filter_uptodate INDEX-PRESENT INDEX-NEW # Fetch "clean" files from the old release for merging changes. fetch_files_premerge tomerge-old # Prepare to fetch files: Generate a list of the files we need, # copy the unmodified files we have into /files/, and generate # a list of patches to download. fetch_files_prepare INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Fetch patches from to-${RELNUM}/${ARCH}/bp/ PATCHDIR=to-${RELNUM}/${ARCH}/bp fetch_files || return 1 # Merge configuration file changes. upgrade_merge tomerge-old INDEX-PRESENT INDEX-NEW || return 1 # Create and populate install manifest directory; and report what # updates are available. fetch_create_manifest || return 1 # Leave a note behind to tell the "install" command that the kernel # needs to be installed before the world. touch ${BDHASH}-install/kernelfirst # Remind the user that they need to run "freebsd-update install" # to install the downloaded bits, in case they didn't RTFM. echo "To install the downloaded upgrades, run \"$0 install\"." } # Make sure that all the file hashes mentioned in $@ have corresponding # gzipped files stored in /files/. 
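# Each file is stored in the working directory as files/<sha256>.gz, where
# <sha256> is the hash of the uncompressed contents; a missing file here
# normally means that the "fetch" or "upgrade" step did not run to
# completion.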
install_verify () {
	# Generate a list of hashes
	cat $@ |
	    cut -f 2,7 -d '|' |
	    grep -E '^f' |
	    cut -f 2 -d '|' |
	    sort -u > filelist

	# Make sure all the hashes exist
	while read HASH; do
		if ! [ -f files/${HASH}.gz ]; then
			echo -n "Update files missing -- "
			echo "this should never happen."
			echo "Re-run '$0 fetch'."
			return 1
		fi
	done < filelist

	# Clean up
	rm filelist
}

# Remove the system immutable flag from files
install_unschg () {
	# Generate file list
	cat $@ |
	    cut -f 1 -d '|' > filelist

	# Remove flags
	while read F; do
		if ! [ -e ${BASEDIR}/${F} ]; then
			continue
		else
			echo ${BASEDIR}/${F}
		fi
	done < filelist | xargs chflags noschg || return 1

	# Clean up
	rm filelist
}

# Decide which directory name to use for kernel backups.
backup_kernel_finddir () {
	CNT=0
	while true ; do
		# If the pathname does not exist, it is OK to use that
		# name for the backup directory.
		if [ ! -e $BASEDIR/$BACKUPKERNELDIR ]; then
			return 0
		fi

		# If the directory does exist, we only use it if it has
		# our marker file.
		if [ -d $BASEDIR/$BACKUPKERNELDIR -a \
			-e $BASEDIR/$BACKUPKERNELDIR/.freebsd-update ]; then
			return 0
		fi

		# We could not use the current directory name, so add a
		# counter to the end and try again.
		CNT=$((CNT + 1))
		if [ $CNT -gt 9 ]; then
			echo "Could not find valid backup dir ($BASEDIR/$BACKUPKERNELDIR)"
			exit 1
		fi
		BACKUPKERNELDIR="`echo $BACKUPKERNELDIR | sed -Ee 's/[0-9]\$//'`"
		BACKUPKERNELDIR="${BACKUPKERNELDIR}${CNT}"
	done
}

# Back up the current kernel using hardlinks, if not disabled by the user.
# Since we delete all files in the directory used for previous backups,
# we create a marker file called ".freebsd-update" in the directory so
# we can determine on the next run that the directory was created by
# freebsd-update and we then do not accidentally remove user files in
# the unlikely case that the user has created a directory with a
# conflicting name.
backup_kernel () {
	# Only make a kernel backup if so configured.
	if [ $BACKUPKERNEL != yes ]; then
		return 0
	fi

	# Decide which directory name to use for kernel backups.
	backup_kernel_finddir

	# Remove old kernel backup files.  If $BACKUPKERNELDIR was
	# "not ours", backup_kernel_finddir would have exited, so
	# deleting the directory content is as safe as we can make it.
	if [ -d $BASEDIR/$BACKUPKERNELDIR ]; then
		rm -fr $BASEDIR/$BACKUPKERNELDIR
	fi

	# Create directories for backup.
	mkdir -p $BASEDIR/$BACKUPKERNELDIR
	mtree -cdn -p "${BASEDIR}/${KERNELDIR}" | \
	    mtree -Ue -p "${BASEDIR}/${BACKUPKERNELDIR}" > /dev/null

	# Mark the directory as having been created by freebsd-update.
	touch $BASEDIR/$BACKUPKERNELDIR/.freebsd-update
	if [ $? -ne 0 ]; then
		echo "Could not create kernel backup directory"
		exit 1
	fi

	# Disable pathname expansion to be sure *.symbols is not
	# expanded.
	set -f

	# Use find to ignore symbol files, unless disabled by the user.
	if [ $BACKUPKERNELSYMBOLFILES = yes ]; then
		FINDFILTER=""
	else
		FINDFILTER="-a ! -name *.debug -a ! -name *.symbols"
	fi

	# Back up all the kernel files using hardlinks.
	(cd ${BASEDIR}/${KERNELDIR} && find . -type f $FINDFILTER -exec \
	    cp -pl '{}' ${BASEDIR}/${BACKUPKERNELDIR}/'{}' \;)

	# Re-enable pathname expansion.
	set +f
}

# Install new files
install_from_index () {
	# First pass: Do everything apart from setting file flags.  We
	# can't set flags yet, because schg inhibits hard linking.
	sort -k 1,1 -t '|' $1 |
	    tr '|' ' ' |
	    while read FPATH TYPE OWNER GROUP PERM FLAGS HASH LINK; do
		case ${TYPE} in
		d)
			# Create a directory
			install -d -o ${OWNER} -g ${GROUP} \
			    -m ${PERM} ${BASEDIR}/${FPATH}
			;;
		f)
			if [ -z "${LINK}" ]; then
				# Create a file, without setting flags.
gunzip < files/${HASH}.gz > ${HASH} install -S -o ${OWNER} -g ${GROUP} \ -m ${PERM} ${HASH} ${BASEDIR}/${FPATH} rm ${HASH} else # Create a hard link. ln -f ${BASEDIR}/${LINK} ${BASEDIR}/${FPATH} fi ;; L) # Create a symlink ln -sfh ${HASH} ${BASEDIR}/${FPATH} ;; esac done # Perform a second pass, adding file flags. tr '|' ' ' < $1 | while read FPATH TYPE OWNER GROUP PERM FLAGS HASH LINK; do if [ ${TYPE} = "f" ] && ! [ ${FLAGS} = "0" ]; then chflags ${FLAGS} ${BASEDIR}/${FPATH} fi done } # Remove files which we want to delete install_delete () { # Generate list of new files cut -f 1 -d '|' < $2 | sort > newfiles # Generate subindex of old files we want to nuke sort -k 1,1 -t '|' $1 | join -t '|' -v 1 - newfiles | sort -r -k 1,1 -t '|' | cut -f 1,2 -d '|' | tr '|' ' ' > killfiles # Remove the offending bits while read FPATH TYPE; do case ${TYPE} in d) rmdir ${BASEDIR}/${FPATH} ;; f) rm ${BASEDIR}/${FPATH} ;; L) rm ${BASEDIR}/${FPATH} ;; esac done < killfiles # Clean up rm newfiles killfiles } # Install new files, delete old files, and update linker.hints install_files () { # If we haven't already dealt with the kernel, deal with it. if ! [ -f $1/kerneldone ]; then grep -E '^/boot/' $1/INDEX-OLD > INDEX-OLD grep -E '^/boot/' $1/INDEX-NEW > INDEX-NEW # Backup current kernel before installing a new one backup_kernel || return 1 # Install new files install_from_index INDEX-NEW || return 1 # Remove files which need to be deleted install_delete INDEX-OLD INDEX-NEW || return 1 # Update linker.hints if necessary if [ -s INDEX-OLD -o -s INDEX-NEW ]; then kldxref -R ${BASEDIR}/boot/ 2>/dev/null fi # We've finished updating the kernel. touch $1/kerneldone # Do we need to ask for a reboot now? if [ -f $1/kernelfirst ] && [ -s INDEX-OLD -o -s INDEX-NEW ]; then cat <<-EOF Kernel updates have been installed. Please reboot and run "$0 install" again to finish installing updates. EOF exit 0 fi fi # If we haven't already dealt with the world, deal with it. if ! [ -f $1/worlddone ]; then # Create any necessary directories first grep -vE '^/boot/' $1/INDEX-NEW | grep -E '^[^|]+\|d\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 # Install new runtime linker grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -E '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 # Install new shared libraries next grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -vE '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 # Deal with everything else grep -vE '^/boot/' $1/INDEX-OLD | grep -vE '^[^|]+\|d\|' | grep -vE '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' | grep -vE '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-OLD grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -vE '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' | grep -vE '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 install_delete INDEX-OLD INDEX-NEW || return 1 # Rebuild generated pwd files. if [ ${BASEDIR}/etc/master.passwd -nt ${BASEDIR}/etc/spwd.db ] || [ ${BASEDIR}/etc/master.passwd -nt ${BASEDIR}/etc/pwd.db ] || [ ${BASEDIR}/etc/master.passwd -nt ${BASEDIR}/etc/passwd ]; then pwd_mkdb -d ${BASEDIR}/etc -p ${BASEDIR}/etc/master.passwd fi # Rebuild /etc/login.conf.db if necessary. if [ ${BASEDIR}/etc/login.conf -nt ${BASEDIR}/etc/login.conf.db ]; then cap_mkdb ${BASEDIR}/etc/login.conf fi # Rebuild man page databases, if necessary. 
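		# (makewhatis is only run for a directory when some file
		# under it is newer than the existing mandoc.db, so an
		# update which touches no manual pages leaves the databases
		# alone.)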
for D in /usr/share/man /usr/share/openssl/man; do if [ ! -d ${BASEDIR}/$D ]; then continue fi if [ -z "$(find ${BASEDIR}/$D -type f -newer ${BASEDIR}/$D/mandoc.db)" ]; then continue; fi makewhatis ${BASEDIR}/$D done # We've finished installing the world and deleting old files # which are not shared libraries. touch $1/worlddone # Do we need to ask the user to portupgrade now? grep -vE '^/boot/' $1/INDEX-NEW | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' | cut -f 1 -d '|' | sort > newfiles if grep -vE '^/boot/' $1/INDEX-OLD | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' | cut -f 1 -d '|' | sort | join -v 1 - newfiles | grep -q .; then cat <<-EOF Completing this upgrade requires removing old shared object files. Please rebuild all installed 3rd party software (e.g., programs installed from the ports tree) and then run "$0 install" again to finish installing updates. EOF rm newfiles exit 0 fi rm newfiles fi # Remove old shared libraries grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-NEW grep -vE '^/boot/' $1/INDEX-OLD | grep -vE '^[^|]+\|d\|' | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-OLD install_delete INDEX-OLD INDEX-NEW || return 1 # Remove old directories grep -vE '^/boot/' $1/INDEX-NEW | grep -E '^[^|]+\|d\|' > INDEX-NEW grep -vE '^/boot/' $1/INDEX-OLD | grep -E '^[^|]+\|d\|' > INDEX-OLD install_delete INDEX-OLD INDEX-NEW || return 1 # Remove temporary files rm INDEX-OLD INDEX-NEW } # Rearrange bits to allow the installed updates to be rolled back install_setup_rollback () { # Remove the "reboot after installing kernel", "kernel updated", and # "finished installing the world" flags if present -- they are # irrelevant when rolling back updates. if [ -f ${BDHASH}-install/kernelfirst ]; then rm ${BDHASH}-install/kernelfirst rm ${BDHASH}-install/kerneldone fi if [ -f ${BDHASH}-install/worlddone ]; then rm ${BDHASH}-install/worlddone fi if [ -L ${BDHASH}-rollback ]; then mv ${BDHASH}-rollback ${BDHASH}-install/rollback fi mv ${BDHASH}-install ${BDHASH}-rollback } # Actually install updates install_run () { echo -n "Installing updates..." # Make sure we have all the files we should have install_verify ${BDHASH}-install/INDEX-OLD \ ${BDHASH}-install/INDEX-NEW || return 1 # Remove system immutable flag from files install_unschg ${BDHASH}-install/INDEX-OLD \ ${BDHASH}-install/INDEX-NEW || return 1 # Install new files, delete old files, and update linker.hints install_files ${BDHASH}-install || return 1 # Rearrange bits to allow the installed updates to be rolled back install_setup_rollback echo " done." } # Rearrange bits to allow the previous set of updates to be rolled back next. rollback_setup_rollback () { if [ -L ${BDHASH}-rollback/rollback ]; then mv ${BDHASH}-rollback/rollback rollback-tmp rm -r ${BDHASH}-rollback/ rm ${BDHASH}-rollback mv rollback-tmp ${BDHASH}-rollback else rm -r ${BDHASH}-rollback/ rm ${BDHASH}-rollback fi } # Install old files, delete new files, and update linker.hints rollback_files () { # Install old shared library files which don't have the same path as # a new shared library file. 
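	# (Rollback proceeds in stages: old libraries whose paths are not
	# reused by the new release go back first, then files outside /boot
	# and the library directories, then the remaining old libraries,
	# and finally the kernel files.)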
    grep -vE '^/boot/' $1/INDEX-NEW |
        grep -E '/lib/.*\.so\.[0-9]+\|' |
        cut -f 1 -d '|' |
        sort > INDEX-NEW.libs.flist
    grep -vE '^/boot/' $1/INDEX-OLD |
        grep -E '/lib/.*\.so\.[0-9]+\|' |
        sort -k 1,1 -t '|' - |
        join -t '|' -v 1 - INDEX-NEW.libs.flist > INDEX-OLD
    install_from_index INDEX-OLD || return 1

    # Deal with files which are neither kernel nor shared library
    grep -vE '^/boot/' $1/INDEX-OLD |
        grep -vE '/lib/.*\.so\.[0-9]+\|' > INDEX-OLD
    grep -vE '^/boot/' $1/INDEX-NEW |
        grep -vE '/lib/.*\.so\.[0-9]+\|' > INDEX-NEW
    install_from_index INDEX-OLD || return 1
    install_delete INDEX-NEW INDEX-OLD || return 1

    # Install any old shared library files which we didn't install above.
    grep -vE '^/boot/' $1/INDEX-OLD |
        grep -E '/lib/.*\.so\.[0-9]+\|' |
        sort -k 1,1 -t '|' - |
        join -t '|' - INDEX-NEW.libs.flist > INDEX-OLD
    install_from_index INDEX-OLD || return 1

    # Delete unneeded shared library files
    grep -vE '^/boot/' $1/INDEX-OLD |
        grep -E '/lib/.*\.so\.[0-9]+\|' > INDEX-OLD
    grep -vE '^/boot/' $1/INDEX-NEW |
        grep -E '/lib/.*\.so\.[0-9]+\|' > INDEX-NEW
    install_delete INDEX-NEW INDEX-OLD || return 1

    # Deal with kernel files
    grep -E '^/boot/' $1/INDEX-OLD > INDEX-OLD
    grep -E '^/boot/' $1/INDEX-NEW > INDEX-NEW
    install_from_index INDEX-OLD || return 1
    install_delete INDEX-NEW INDEX-OLD || return 1
    if [ -s INDEX-OLD -o -s INDEX-NEW ]; then
        kldxref -R /boot/ 2>/dev/null
    fi

    # Remove temporary files
    rm INDEX-OLD INDEX-NEW INDEX-NEW.libs.flist
}

# Actually rollback updates
rollback_run () {
    echo -n "Uninstalling updates..."

    # If there are updates waiting to be installed, remove them; we
    # want the user to re-run 'fetch' after rolling back updates.
    if [ -L ${BDHASH}-install ]; then
        rm -r ${BDHASH}-install/
        rm ${BDHASH}-install
    fi

    # Make sure we have all the files we should have
    install_verify ${BDHASH}-rollback/INDEX-NEW \
        ${BDHASH}-rollback/INDEX-OLD || return 1

    # Remove system immutable flag from files
    install_unschg ${BDHASH}-rollback/INDEX-NEW \
        ${BDHASH}-rollback/INDEX-OLD || return 1

    # Install old files, delete new files, and update linker.hints
    rollback_files ${BDHASH}-rollback || return 1

    # Remove the rollback directory and the symlink pointing to it; and
    # rearrange bits to allow the previous set of updates to be rolled
    # back next.
    rollback_setup_rollback

    echo " done."
}

# Compare INDEX-ALL and INDEX-PRESENT and print warnings about differences.
IDS_compare () {
    # Get all the lines which mismatch in something other than file
    # flags.  We ignore file flags because sysinstall doesn't seem to
    # set them when it installs FreeBSD; warning about these adds a
    # very large amount of noise.
    cut -f 1-5,7-8 -d '|' $1 > $1.noflags
    sort -k 1,1 -t '|' $1.noflags > $1.sorted
    cut -f 1-5,7-8 -d '|' $2 |
        comm -13 $1.noflags - |
        fgrep -v '|-|||||' |
        sort -k 1,1 -t '|' |
        join -t '|' $1.sorted - > INDEX-NOTMATCHING

    # Ignore files which match IDSIGNOREPATHS.
    for X in ${IDSIGNOREPATHS}; do
        grep -E "^${X}" INDEX-NOTMATCHING
    done |
        sort -u |
        comm -13 - INDEX-NOTMATCHING > INDEX-NOTMATCHING.tmp
    mv INDEX-NOTMATCHING.tmp INDEX-NOTMATCHING

    # Go through the lines and print warnings.
    local IFS='|'
    while read FPATH TYPE OWNER GROUP PERM HASH LINK P_TYPE P_OWNER P_GROUP P_PERM P_HASH P_LINK; do
        # Warn about different object types.
        if ! [ "${TYPE}" = "${P_TYPE}" ]; then
            echo -n "${FPATH} is a "
            case "${P_TYPE}" in
            f)
                echo -n "regular file, "
                ;;
            d)
                echo -n "directory, "
                ;;
            L)
                echo -n "symlink, "
                ;;
            esac
            echo -n "but should be a "
            case "${TYPE}" in
            f)
                echo -n "regular file."
                ;;
            d)
                echo -n "directory."
                ;;
            L)
                echo -n "symlink."
                ;;
            esac
            echo

            # Skip other tests, since they don't make sense if
            # we're comparing different object types.
            continue
        fi

        # Warn about different owners.
        if ! [ "${OWNER}" = "${P_OWNER}" ]; then
            echo -n "${FPATH} is owned by user id ${P_OWNER}, "
            echo "but should be owned by user id ${OWNER}."
        fi

        # Warn about different groups.
        if ! [ "${GROUP}" = "${P_GROUP}" ]; then
            echo -n "${FPATH} is owned by group id ${P_GROUP}, "
            echo "but should be owned by group id ${GROUP}."
        fi

        # Warn about different permissions.  We do not warn about
        # different permissions on symlinks, since some archivers
        # don't extract symlink permissions correctly and they are
        # ignored anyway.
        if ! [ "${PERM}" = "${P_PERM}" ] && ! [ "${TYPE}" = "L" ]; then
            echo -n "${FPATH} has ${P_PERM} permissions, "
            echo "but should have ${PERM} permissions."
        fi

        # Warn about different file hashes / symlink destinations.
        if ! [ "${HASH}" = "${P_HASH}" ]; then
            if [ "${TYPE}" = "L" ]; then
                echo -n "${FPATH} is a symlink to ${P_HASH}, "
                echo "but should be a symlink to ${HASH}."
            fi
            if [ "${TYPE}" = "f" ]; then
                echo -n "${FPATH} has SHA256 hash ${P_HASH}, "
                echo "but should have SHA256 hash ${HASH}."
            fi
        fi

        # We don't warn about different hard links, since some
        # archivers break hard links, and as long as the
        # underlying data is correct they really don't matter.
    done < INDEX-NOTMATCHING

    # Clean up
    rm $1 $1.noflags $1.sorted $2 INDEX-NOTMATCHING
}

# Do the work involved in comparing the system to a "known good" index
IDS_run () {
    workdir_init || return 1

    # Prepare the mirror list.
    fetch_pick_server_init && fetch_pick_server

    # Try to fetch the public key until we run out of servers.
    while ! fetch_key; do
        fetch_pick_server || return 1
    done

    # Try to fetch the metadata index signature ("tag") until we run
    # out of available servers; and sanity check the downloaded tag.
    while ! fetch_tag; do
        fetch_pick_server || return 1
    done
    fetch_tagsanity || return 1

    # Fetch INDEX-OLD and INDEX-ALL.
    fetch_metadata INDEX-OLD INDEX-ALL || return 1

    # Generate filtered INDEX-OLD and INDEX-ALL files containing only
    # the components we want and without anything marked as "Ignore".
    fetch_filter_metadata INDEX-OLD || return 1
    fetch_filter_metadata INDEX-ALL || return 1

    # Merge the INDEX-OLD and INDEX-ALL files into INDEX-ALL.
    sort INDEX-OLD INDEX-ALL > INDEX-ALL.tmp
    mv INDEX-ALL.tmp INDEX-ALL
    rm INDEX-OLD

    # Translate /boot/${KERNCONF} to ${KERNELDIR}
    fetch_filter_kernel_names INDEX-ALL ${KERNCONF}

    # Inspect the system and generate an INDEX-PRESENT file.
    fetch_inspect_system INDEX-ALL INDEX-PRESENT /dev/null || return 1

    # Compare INDEX-ALL and INDEX-PRESENT and print warnings about any
    # differences.
    IDS_compare INDEX-ALL INDEX-PRESENT
}

#### Main functions -- call parameter-handling and core functions

# Using the command line, configuration file, and defaults,
# set all the parameters which are needed later.
get_params () {
    init_params
    parse_cmdline $@
    parse_conffile
    default_params
}

# Fetch command.  Make sure that we're being called
# interactively, then run fetch_check_params and fetch_run
cmd_fetch () {
    if [ ! -t 0 -a $NOTTYOK -eq 0 ]; then
        echo -n "`basename $0` fetch should not "
        echo "be run non-interactively."
        echo "Run `basename $0` cron instead."
        exit 1
    fi
    fetch_check_params
    fetch_run || exit 1
    ISFETCHED=1
}

# Cron command.  Make sure the parameters are sensible; wait
# rand(3600) seconds; then fetch updates.  While fetching updates,
# send output to a temporary file; only print that file if the
# fetching failed.
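# A common way to drive the cron command is an /etc/crontab entry; the
# schedule below is only an example, not something this script installs:
#
#   0 3 * * *   root    /usr/sbin/freebsd-update cron
#
# The random sleep inside cmd_cron (`jot -r 1 0 3600` prints one random
# number between 0 and 3600) then spreads the actual fetch over the
# following hour so that every machine does not hit the mirrors at the
# same minute, and the output is mailed to ${MAILTO} only when fetching
# failed, updates were downloaded, or verbose debugging is enabled.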
cmd_cron () {
    fetch_check_params
    sleep `jot -r 1 0 3600`

    TMPFILE=`mktemp /tmp/freebsd-update.XXXXXX` || exit 1

    if ! fetch_run >> ${TMPFILE} ||
        ! grep -q "No updates needed" ${TMPFILE} ||
        [ ${VERBOSELEVEL} = "debug" ]; then
        mail -s "`hostname` security updates" ${MAILTO} < ${TMPFILE}
    fi

    rm ${TMPFILE}
}

# Fetch files for upgrading to a new release.
cmd_upgrade () {
    upgrade_check_params
    upgrade_run || exit 1
}

# Install downloaded updates.
cmd_install () {
    install_check_params
    install_run || exit 1
}

# Rollback most recently installed updates.
cmd_rollback () {
    rollback_check_params
    rollback_run || exit 1
}

# Compare system against a "known good" index.
cmd_IDS () {
    IDS_check_params
    IDS_run || exit 1
}

#### Entry point

# Make sure we find utilities from the base system
export PATH=/sbin:/bin:/usr/sbin:/usr/bin:${PATH}

# Set a pager if the user hasn't set one already
if [ -z "$PAGER" ]; then
    PAGER=/usr/bin/less
fi

# Set LC_ALL in order to avoid problems with character ranges like [A-Z].
export LC_ALL=C

get_params $@
for COMMAND in ${COMMANDS}; do
    cmd_${COMMAND}
done

Index: projects/clang900-import/usr.sbin/newsyslog/newsyslog.conf
===================================================================
--- projects/clang900-import/usr.sbin/newsyslog/newsyslog.conf	(revision 352536)
+++ projects/clang900-import/usr.sbin/newsyslog/newsyslog.conf	(revision 352537)
@@ -1,35 +1,36 @@
# configuration file for newsyslog
# $FreeBSD$
#
# Entries which do not specify the '/pid_file' field will cause the
# syslogd process to be signalled when that log file is rotated.  This
# action is only appropriate for log files which are written to by the
# syslogd process (ie, files listed in /etc/syslog.conf).  If there
# is no process which needs to be signalled when a given log file is
# rotated, then the entry for that file should include the 'N' flag.
#
# Note: some sites will want to select more restrictive protections than the
# defaults.  In particular, it may be desirable to switch many of the 644
# entries to 640 or 600.  For example, some sites will consider the
# contents of maillog, messages, and lpd-errs to be confidential.  In the
# future, these defaults may change to more conservative ones.
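#
# As an illustration of the fields documented below, the daemon.log entry
# added in this revision,
#
#	/var/log/daemon.log	644	5	1000	@0101T	JC
#
# keeps 5 rotated copies of the file with mode 644, rotates it once it
# grows past 1000 kB or at the calendar time given in the "when" field,
# compresses rotated copies with bzip2 (flag J), and creates the log file
# if it does not already exist (flag C).  Since no '/pid_file' field is
# given, syslogd is signalled after each rotation.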
#
# logfilename		[owner:group]	mode	count	size	when	flags [/pid_file] [sig_num]
/var/log/all.log		600	7	*	@T00	J
/var/log/auth.log		600	7	1000	@0101T	JC
/var/log/console.log		600	5	1000	*	J
/var/log/cron			600	3	1000	*	JC
/var/log/daily.log		640	7	*	@T00	JN
/var/log/debug.log		600	7	1000	*	JC
/var/log/init.log		644	3	1000	*	J
/var/log/kerberos.log		600	7	1000	*	J
/var/log/maillog		640	7	*	@T00	JC
/var/log/messages		644	5	1000	@0101T	JC
/var/log/monthly.log		640	12	*	$M1D0	JN
/var/log/devd.log		644	3	1000	*	JC
/var/log/security		600	10	1000	*	JC
/var/log/utx.log		644	3	*	@01T05	B
/var/log/weekly.log		640	5	*	$W6D0	JN
+/var/log/daemon.log		644	5	1000	@0101T	JC
<include> /etc/newsyslog.conf.d/[!.]*.conf
<include> /usr/local/etc/newsyslog.conf.d/[!.]*.conf

Index: projects/clang900-import/usr.sbin/ntp/libntp/Makefile
===================================================================
--- projects/clang900-import/usr.sbin/ntp/libntp/Makefile	(revision 352536)
+++ projects/clang900-import/usr.sbin/ntp/libntp/Makefile	(revision 352537)
@@ -1,93 +1,93 @@
# $FreeBSD$

.PATH: ${SRCTOP}/contrib/ntp/libntp \
	${SRCTOP}/contrib/ntp/lib/isc \
	${SRCTOP}/contrib/ntp/lib/isc/nls \
	${SRCTOP}/contrib/ntp/lib/isc/pthreads \
	${SRCTOP}/contrib/ntp/lib/isc/unix \

LIB=	ntp
INTERNALLIB=

NTP_SRCS=	systime.c a_md5encrypt.c adjtime.c atoint.c \
		atolfp.c atouint.c audio.c authkeys.c \
		authreadkeys.c authusekey.c bsd_strerror.c buftvtots.c \
		caljulian.c caltontp.c calyearstart.c clocktime.c \
		clocktypes.c decodenetnum.c dofptoa.c dolfptoa.c \
		emalloc.c findconfig.c getopt.c hextoint.c \
		hextolfp.c humandate.c icom.c iosignal.c \
		is_ip_address.c \
		lib_strbuf.c \
		libssl_compat.c \
		machines.c mktime.c modetoa.c \
		mstolfp.c msyslog.c netof.c ntp_calendar.c \
		ntp_crypto_rnd.c ntp_intres.c ntp_libopts.c \
		ntp_lineedit.c ntp_random.c ntp_rfc2553.c ntp_worker.c \
		numtoa.c numtohost.c octtoint.c prettydate.c \
		recvbuff.c refidsmear.c \
		refnumtoa.c snprintf.c socket.c \
		socktoa.c socktohost.c ssl_init.c statestr.c \
		strdup.c strl_obsd.c syssignal.c timetoa.c \
		timevalops.c uglydate.c vint64ops.c work_fork.c \
		work_thread.c xsbprintf.c ymd2yd.c

ISC_PTHREADS_SRCS=	condition.c \
			thread.c \
			mutex.c

ISC_UNIX_SRCS=	dir.c \
		errno2result.c \
		file.c \
		interfaceiter.c \
		net.c \
		stdio.c \
		stdtime.c \
		strerror.c \
		time.c \
		tsmemcmp.c

ISC_NLS_SRCS=	msgcat.c

ISC_SRCS=	assertions.c \
		buffer.c \
		backtrace-emptytbl.c \
		backtrace.c \
		error.c \
		event.c \
		inet_ntop.c \
		inet_pton.c \
		lib.c \
		log.c \
		md5.c \
		netaddr.c \
		netscope.c \
		ondestroy.c \
		random.c \
		result.c \
		task.c \
		sha1.c \
		sockaddr.c \
		${ISC_NLS_SRCS} \
		${ISC_PTHREADS_SRCS} \
		${ISC_UNIX_SRCS}

SRCS=	${NTP_SRCS} ${ISC_SRCS} version.c

CFLAGS+= -I${SRCTOP}/contrib/ntp/include \
	-I${SRCTOP}/contrib/ntp/lib/isc/include \
	-I${SRCTOP}/contrib/ntp/lib/isc/unix/include \
	-I${SRCTOP}/contrib/ntp/lib/isc/pthreads/include \
	-I${SRCTOP}/contrib/ntp/sntp/libopts \
	-I${SRCTOP}/lib/libc/${MACHINE_ARCH} \
	-I${SYSROOT:U${DESTDIR}}/${INCLUDEDIR}/edit \
	-I${.CURDIR:H} \
	-I${.CURDIR}/

-CFLAGS+= -DHAVE_BSD_NICE -DHAVE_STDINT_H
+CFLAGS+= -DHAVE_BSD_NICE -DHAVE_STDINT_H -DHAVE_CLOSEFROM

CLEANFILES+= .version version.c

version.c:
	sh -e ${.CURDIR:H}/scripts/mkver ntpd

.include <bsd.lib.mk>

Index: projects/clang900-import/usr.sbin/pkg/Makefile
===================================================================
--- projects/clang900-import/usr.sbin/pkg/Makefile	(revision 352536)
+++ projects/clang900-import/usr.sbin/pkg/Makefile	(revision 352537)
@@ -1,16 +1,26 @@
# $FreeBSD$

+.if ${MACHINE} != "amd64" && ${MACHINE} != "i386"
+PKGCONFBRANCH?=	quarterly
+.else
+_BRANCH!=	${MAKE} -C ${SRCTOP}/release -V BRANCH
+BRANCH?=	${_BRANCH}
+. if ${BRANCH:MBETA*} || ${BRANCH:MRC*} || ${BRANCH:MRELEASE*}
+PKGCONFBRANCH?=	quarterly
+. else
PKGCONFBRANCH?=	latest
+. endif
+.endif
CONFS=	FreeBSD.conf.${PKGCONFBRANCH}
CONFSNAME=	FreeBSD.conf
CONFSDIR=	/etc/pkg
CONFSMODE=	644
PROG=	pkg
SRCS=	pkg.c dns_utils.c config.c
MAN=	pkg.7

CFLAGS+=-I${SRCTOP}/contrib/libucl/include
.PATH:	${SRCTOP}/contrib/libucl/include

LIBADD=	archive fetch ucl sbuf crypto ssl

.include <bsd.prog.mk>

Index: projects/clang900-import/usr.sbin/syslogd/syslog.conf
===================================================================
--- projects/clang900-import/usr.sbin/syslogd/syslog.conf	(revision 352536)
+++ projects/clang900-import/usr.sbin/syslogd/syslog.conf	(revision 352537)
@@ -1,34 +1,35 @@
# $FreeBSD$
#
# Spaces ARE valid field separators in this file. However,
# other *nix-like systems still insist on using tabs as field
# separators. If you are sharing this file between systems, you
# may want to use only tabs as field separators here.
# Consult the syslog.conf(5) manpage.
*.err;kern.warning;auth.notice;mail.crit		/dev/console
*.notice;authpriv.none;kern.debug;lpr.info;mail.crit;news.err	/var/log/messages
security.*					/var/log/security
auth.info;authpriv.info				/var/log/auth.log
mail.info					/var/log/maillog
cron.*						/var/log/cron
!-devd
*.=debug					/var/log/debug.log
*.emerg						*
+daemon.info					/var/log/daemon.log
# uncomment this to log all writes to /dev/console to /var/log/console.log
# touch /var/log/console.log and chmod it to mode 600 before it will work
#console.info					/var/log/console.log
# uncomment this to enable logging of all log messages to /var/log/all.log
# touch /var/log/all.log and chmod it to mode 600 before it will work
#*.*						/var/log/all.log
# uncomment this to enable logging to a remote loghost named loghost
#*.*						@loghost
# uncomment these if you're running inn
# news.crit					/var/log/news/news.crit
# news.err					/var/log/news/news.err
# news.notice					/var/log/news/news.notice
# Uncomment this if you wish to see messages produced by devd
# !devd
# *.>=notice					/var/log/devd.log
!*
include						/etc/syslog.d
include						/usr/local/etc/syslog.d

Index: projects/clang900-import
===================================================================
--- projects/clang900-import	(revision 352536)
+++ projects/clang900-import	(revision 352537)

Property changes on: projects/clang900-import
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r352436-352536
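The new conditional block in usr.sbin/pkg/Makefile decides which pkg(7) bootstrap
configuration is installed as /etc/pkg/FreeBSD.conf: machines other than amd64 and
i386, and release-like branches (BETA/RC/RELEASE), get the quarterly package set,
while everything else gets latest. A rough sh rendering of that make logic, offered
only as a sketch (the /usr/src path and the use of uname -m in place of ${MACHINE}
are assumptions, not something the Makefile itself does):

    #!/bin/sh
    # Approximate the PKGCONFBRANCH selection from usr.sbin/pkg/Makefile.
    MACHINE=$(uname -m)
    BRANCH=$(make -C /usr/src/release -V BRANCH)  # e.g. CURRENT, BETA1, RELEASE

    if [ "${MACHINE}" != "amd64" ] && [ "${MACHINE}" != "i386" ]; then
        PKGCONFBRANCH="quarterly"
    else
        case "${BRANCH}" in
        BETA*|RC*|RELEASE*)
            PKGCONFBRANCH="quarterly"
            ;;
        *)
            PKGCONFBRANCH="latest"
            ;;
        esac
    fi
    echo "FreeBSD.conf would come from the ${PKGCONFBRANCH} branch"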