Index: projects/clang900-import/Makefile.inc1 =================================================================== --- projects/clang900-import/Makefile.inc1 (revision 352586) +++ projects/clang900-import/Makefile.inc1 (revision 352587) @@ -1,3400 +1,3401 @@ # # $FreeBSD$ # # Make command line options: # -DNO_CLEANDIR run ${MAKE} clean, instead of ${MAKE} cleandir # -DNO_CLEAN do not clean at all # -DDB_FROM_SRC use the user/group databases in src/etc instead of # the system database when installing. # -DNO_SHARE do not go into share subdir # -DKERNFAST define NO_KERNEL{CONFIG,CLEAN,OBJ} # -DNO_KERNELCONFIG do not run config in ${MAKE} buildkernel # -DNO_KERNELCLEAN do not run ${MAKE} clean in ${MAKE} buildkernel # -DNO_KERNELOBJ do not run ${MAKE} obj in ${MAKE} buildkernel # -DNO_PORTSUPDATE do not update ports in ${MAKE} update # -DNO_ROOT install without using root privilege # -DNO_DOCUPDATE do not update doc in ${MAKE} update # -DWITHOUT_CTF do not run the DTrace CTF conversion tools on built objects # LOCAL_DIRS="list of dirs" to add additional dirs to the SUBDIR list # LOCAL_ITOOLS="list of tools" to add additional tools to the ITOOLS list # LOCAL_LIB_DIRS="list of dirs" to add additional dirs to libraries target # LOCAL_MTREE="list of mtree files" to process to allow local directories # to be created before files are installed # LOCAL_TOOL_DIRS="list of dirs" to add additional dirs to the build-tools # list # LOCAL_XTOOL_DIRS="list of dirs" to add additional dirs to the # cross-tools target # METALOG="path to metadata log" to write permission and ownership # when NO_ROOT is set. (default: ${DESTDIR}/METALOG) # TARGET="machine" to crossbuild world for a different machine type # TARGET_ARCH= may be required when a TARGET supports multiple endians # BUILDENV_SHELL= shell to launch for the buildenv target (def:${SHELL}) # WORLD_FLAGS= additional flags to pass to make(1) during buildworld # KERNEL_FLAGS= additional flags to pass to make(1) during buildkernel # SUBDIR_OVERRIDE="list of dirs" to build rather than everything. # All libraries and includes, and some build tools will still build. # # The intended user-driven targets are: # buildworld - rebuild *everything*, including glue to help do upgrades # installworld- install everything built by "buildworld" # checkworld - run test suite on installed world # doxygen - build API documentation of the kernel # update - convenient way to update your source tree (eg: svn/svnup) # # Standard targets (not defined here) are documented in the makefiles in # /usr/share/mk. These include: # obj depend all install clean cleandepend cleanobj .if !defined(TARGET) || !defined(TARGET_ARCH) .error "Both TARGET and TARGET_ARCH must be defined." .endif .if make(showconfig) || make(test-system-*) _MKSHOWCONFIG= t .endif SRCDIR?= ${.CURDIR} LOCALBASE?= /usr/local # Cross toolchain changes must be in effect before bsd.compiler.mk # so that gets the right CC, and pass CROSS_TOOLCHAIN to submakes. 
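#
# Illustrative sample invocations (the option and target names above are the
# authoritative ones; the architectures and the toolchain name below are only
# examples):
#
#	make buildworld buildkernel                        # native build
#	make TARGET=arm64 TARGET_ARCH=aarch64 buildworld   # cross-build world
#	make CROSS_TOOLCHAIN=foo buildworld                # include ${LOCALBASE}/share/toolchains/foo.mk
#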
.if defined(CROSS_TOOLCHAIN) .if exists(${LOCALBASE}/share/toolchains/${CROSS_TOOLCHAIN}.mk) .include "${LOCALBASE}/share/toolchains/${CROSS_TOOLCHAIN}.mk" .elif exists(${CROSS_TOOLCHAIN}) .include "${CROSS_TOOLCHAIN}" .else .error CROSS_TOOLCHAIN ${CROSS_TOOLCHAIN} not found .endif CROSSENV+=CROSS_TOOLCHAIN="${CROSS_TOOLCHAIN}" .endif .if defined(CROSS_TOOLCHAIN_PREFIX) CROSS_COMPILER_PREFIX?=${CROSS_TOOLCHAIN_PREFIX} .endif XCOMPILERS= CC CXX CPP .for COMPILER in ${XCOMPILERS} .if defined(CROSS_COMPILER_PREFIX) X${COMPILER}?= ${CROSS_COMPILER_PREFIX}${${COMPILER}} .else X${COMPILER}?= ${${COMPILER}} .endif .endfor # If a full path to an external cross compiler is given, don't build # a cross compiler. .if ${XCC:N${CCACHE_BIN}:M/*} MK_CLANG_BOOTSTRAP= no MK_GCC_BOOTSTRAP= no .endif # Pull in compiler metadata from buildworld/toolchain if possible to avoid # running CC from bsd.compiler.mk. .if make(installworld) || make(install) || make(distributeworld) || \ make(stageworld) .-include "${OBJTOP}/toolchain-metadata.mk" .if !defined(_LOADED_TOOLCHAIN_METADATA) .error A build is required first. You may have the wrong MAKEOBJDIRPREFIX set. .endif .endif # Pull in COMPILER_TYPE and COMPILER_FREEBSD_VERSION early. Pull it from the # tree to be friendlier to foreign OS builds. It's safe to do so unconditionally # here since we will always have the right make, unlike in src/Makefile # Don't include bsd.linker.mk yet until XBINUTILS is handled (after src.opts.mk) _NO_INCLUDE_LINKERMK= t # We also want the X_COMPILER* variables if we are using an external toolchain. _WANT_TOOLCHAIN_CROSS_VARS= t .include "share/mk/bsd.compiler.mk" .undef _NO_INCLUDE_LINKERMK .undef _WANT_TOOLCHAIN_CROSS_VARS # src.opts.mk depends on COMPILER_FEATURES .include "share/mk/src.opts.mk" .if ${TARGET} == ${MACHINE} TARGET_CPUTYPE?=${CPUTYPE} .else TARGET_CPUTYPE?= .endif .if !empty(TARGET_CPUTYPE) _TARGET_CPUTYPE=${TARGET_CPUTYPE} .else _TARGET_CPUTYPE=dummy .endif .if ${TARGET} == "arm" .if ${TARGET_ARCH:Marmv[67]*} != "" && ${TARGET_CPUTYPE:M*soft*} == "" TARGET_ABI= gnueabihf .else TARGET_ABI= gnueabi .endif .endif MACHINE_ABI?= unknown MACHINE_TRIPLE?=${MACHINE_ARCH:S/amd64/x86_64/:C/hf$//:S/mipsn32/mips64/}-${MACHINE_ABI}-freebsd13.0 TARGET_ABI?= unknown TARGET_TRIPLE?= ${TARGET_ARCH:S/amd64/x86_64/:C/hf$//:S/mipsn32/mips64/}-${TARGET_ABI}-freebsd13.0 KNOWN_ARCHES?= aarch64/arm64 \ amd64 \ arm \ armv6/arm \ armv7/arm \ i386 \ mips \ mipsel/mips \ mips64el/mips \ mipsn32el/mips \ mips64/mips \ mipsn32/mips \ mipshf/mips \ mipselhf/mips \ mips64elhf/mips \ mips64hf/mips \ powerpc \ powerpc64/powerpc \ powerpcspe/powerpc \ riscv64/riscv \ riscv64sf/riscv \ sparc64 .if ${TARGET} == ${TARGET_ARCH} _t= ${TARGET} .else _t= ${TARGET_ARCH}/${TARGET} .endif .for _t in ${_t} .if empty(KNOWN_ARCHES:M${_t}) .error Unknown target ${TARGET_ARCH}:${TARGET}. .endif .endfor # If all targets are disabled for system llvm then don't expect it to work # for cross-builds. .if !defined(TOOLS_PREFIX) && ${MK_LLVM_TARGET_ALL} == "no" && \ ${MACHINE} != ${TARGET} && ${MACHINE_ARCH} != ${TARGET_ARCH} && \ !make(showconfig) MK_SYSTEM_COMPILER= no MK_SYSTEM_LINKER= no .endif # Handle external binutils. .if defined(CROSS_TOOLCHAIN_PREFIX) CROSS_BINUTILS_PREFIX?=${CROSS_TOOLCHAIN_PREFIX} .endif # If we do not have a bootstrap binutils (because the in-tree one does not # support the target architecture), provide a default cross-binutils prefix. # This allows riscv64 builds, for example, to automatically use the # riscv64-binutils port or package. 
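# As a rough illustration (the prefix below is an assumed example; the real
# default is derived from TARGET_TRIPLE in the block that follows): a riscv64
# cross-build where the in-tree binutils cannot target the architecture and
# lld bootstrap is disabled would end up with something like
#	CROSS_BINUTILS_PREFIX=/usr/local/riscv64-unknown-freebsd13.0/bin/
# and XAS/XLD/XOBJCOPY etc. are then taken from that directory when the
# corresponding binaries exist there.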
.if !make(showconfig) && !defined(_NO_INCLUDE_COMPILERMK) .if !empty(BROKEN_OPTIONS:MBINUTILS_BOOTSTRAP) && \ ${MK_LLD_BOOTSTRAP} == "no" && \ !defined(CROSS_BINUTILS_PREFIX) CROSS_BINUTILS_PREFIX=/usr/local/${TARGET_TRIPLE}/bin/ .if !exists(${CROSS_BINUTILS_PREFIX}) .error In-tree binutils does not support the ${TARGET_ARCH} architecture. Install the ${TARGET_ARCH}-binutils port or package or set CROSS_BINUTILS_PREFIX. .endif .endif .endif XBINUTILS= AS AR LD NM OBJCOPY RANLIB SIZE STRINGS .for BINUTIL in ${XBINUTILS} .if defined(CROSS_BINUTILS_PREFIX) && \ exists(${CROSS_BINUTILS_PREFIX}/${${BINUTIL}}) X${BINUTIL}?= ${CROSS_BINUTILS_PREFIX:C,/*$,,}/${${BINUTIL}} .else X${BINUTIL}?= ${${BINUTIL}} .endif .endfor # If a full path to an external linker is given, don't build lld. .if ${XLD:M/*} MK_LLD_BOOTSTRAP= no .endif # We also want the X_LINKER* variables if we are using an external toolchain. _WANT_TOOLCHAIN_CROSS_VARS= t .include "share/mk/bsd.linker.mk" .undef _WANT_TOOLCHAIN_CROSS_VARS # Begin WITH_SYSTEM_COMPILER / WITH_SYSTEM_LD # WITH_SYSTEM_COMPILER - Pull in needed values and make a decision. # Check if there is a local compiler that can satisfy as an external compiler. # Which compiler is expected to be used? .if ${MK_CLANG_BOOTSTRAP} == "yes" WANT_COMPILER_TYPE= clang .elif ${MK_GCC_BOOTSTRAP} == "yes" WANT_COMPILER_TYPE= gcc .else WANT_COMPILER_TYPE= .endif .if !defined(WANT_COMPILER_FREEBSD_VERSION) && !make(showconfig) && \ !make(test-system-linker) .if ${WANT_COMPILER_TYPE} == "clang" WANT_COMPILER_FREEBSD_VERSION_FILE= lib/clang/freebsd_cc_version.h WANT_COMPILER_FREEBSD_VERSION!= \ awk '$$2 == "FREEBSD_CC_VERSION" {printf("%d\n", $$3)}' \ ${SRCDIR}/${WANT_COMPILER_FREEBSD_VERSION_FILE} || echo unknown WANT_COMPILER_VERSION_FILE= lib/clang/include/clang/Basic/Version.inc WANT_COMPILER_VERSION!= \ awk '$$2 == "CLANG_VERSION" {split($$3, a, "."); print a[1] * 10000 + a[2] * 100 + a[3]}' \ ${SRCDIR}/${WANT_COMPILER_VERSION_FILE} || echo unknown .elif ${WANT_COMPILER_TYPE} == "gcc" WANT_COMPILER_FREEBSD_VERSION_FILE= gnu/usr.bin/cc/cc_tools/freebsd-native.h WANT_COMPILER_FREEBSD_VERSION!= \ awk '$$2 == "FBSD_CC_VER" {printf("%d\n", $$3)}' \ ${SRCDIR}/${WANT_COMPILER_FREEBSD_VERSION_FILE} || echo unknown WANT_COMPILER_VERSION_FILE= contrib/gcc/BASE-VER WANT_COMPILER_VERSION!= \ awk -F. '{print $$1 * 10000 + $$2 * 100 + $$3}' \ ${SRCDIR}/${WANT_COMPILER_VERSION_FILE} || echo unknown .endif .export WANT_COMPILER_FREEBSD_VERSION WANT_COMPILER_VERSION .endif # !defined(WANT_COMPILER_FREEBSD_VERSION) # It needs to be the same revision as we would build for the bootstrap. # If the expected vs CC is different then we can't skip. # GCC cannot be used for cross-arch yet. For clang we pass -target later if # TARGET_ARCH!=MACHINE_ARCH. .if ${MK_SYSTEM_COMPILER} == "yes" && \ defined(WANT_COMPILER_FREEBSD_VERSION) && \ (${MK_CLANG_BOOTSTRAP} == "yes" || ${MK_GCC_BOOTSTRAP} == "yes") && \ !make(xdev*) && \ ${X_COMPILER_TYPE} == ${WANT_COMPILER_TYPE} && \ (${X_COMPILER_TYPE} == "clang" || ${TARGET_ARCH} == ${MACHINE_ARCH}) && \ ${X_COMPILER_VERSION} == ${WANT_COMPILER_VERSION} && \ ${X_COMPILER_FREEBSD_VERSION} == ${WANT_COMPILER_FREEBSD_VERSION} # Everything matches, disable the bootstrap compiler. MK_CLANG_BOOTSTRAP= no MK_GCC_BOOTSTRAP= no USING_SYSTEM_COMPILER= yes .endif # ${WANT_COMPILER_TYPE} == ${COMPILER_TYPE} # WITH_SYSTEM_LD - Pull in needed values and make a decision. # Check if there is a local linker that can satisfy as an external linker. 
# Which linker is expected to be used? .if ${MK_LLD_BOOTSTRAP} == "yes" WANT_LINKER_TYPE= lld .elif ${MK_BINUTILS_BOOTSTRAP} == "yes" # Note that there's no support for bfd in WITH_SYSTEM_LINKER. WANT_LINKER_TYPE= bfd .else WANT_LINKER_TYPE= .endif .if !defined(WANT_LINKER_FREEBSD_VERSION) && !make(showconfig) && \ !make(test-system-compiler) .if ${WANT_LINKER_TYPE} == "lld" WANT_LINKER_FREEBSD_VERSION_FILE= lib/clang/include/lld/Common/Version.inc WANT_LINKER_FREEBSD_VERSION!= \ awk '$$2 == "LLD_REVISION_STRING" {gsub(/"/, "", $$3); print $$3}' \ ${SRCDIR}/${WANT_LINKER_FREEBSD_VERSION_FILE} || echo unknown WANT_LINKER_VERSION_FILE= lib/clang/include/lld/Common/Version.inc WANT_LINKER_VERSION!= \ awk '$$2 == "LLD_VERSION" {split($$3, a, "."); print a[1] * 10000 + a[2] * 100 + a[3]}' \ ${SRCDIR}/${WANT_LINKER_VERSION_FILE} || echo unknown .else WANT_LINKER_FREEBSD_VERSION_FILE= WANT_LINKER_FREEBSD_VERSION= .endif .export WANT_LINKER_FREEBSD_VERSION WANT_LINKER_VERSION .endif # !defined(WANT_LINKER_FREEBSD_VERSION) .if ${MK_SYSTEM_LINKER} == "yes" && \ defined(WANT_LINKER_FREEBSD_VERSION) && \ (${MK_LLD_BOOTSTRAP} == "yes") && \ !make(xdev*) && \ ${X_LINKER_TYPE} == ${WANT_LINKER_TYPE} && \ ${X_LINKER_VERSION} == ${WANT_LINKER_VERSION} && \ ${X_LINKER_FREEBSD_VERSION} == ${WANT_LINKER_FREEBSD_VERSION} # Everything matches, disable the bootstrap linker. MK_LLD_BOOTSTRAP= no USING_SYSTEM_LINKER= yes .endif # ${WANT_LINKER_TYPE} == ${LINKER_TYPE} # WITH_SYSTEM_COMPILER / WITH_SYSTEM_LINKER - Handle defaults and debug. USING_SYSTEM_COMPILER?= no USING_SYSTEM_LINKER?= no TEST_SYSTEM_COMPILER_VARS= \ USING_SYSTEM_COMPILER MK_SYSTEM_COMPILER \ MK_CROSS_COMPILER MK_CLANG_BOOTSTRAP MK_GCC_BOOTSTRAP \ WANT_COMPILER_TYPE WANT_COMPILER_VERSION WANT_COMPILER_VERSION_FILE \ WANT_COMPILER_FREEBSD_VERSION WANT_COMPILER_FREEBSD_VERSION_FILE \ CC COMPILER_TYPE COMPILER_FEATURES COMPILER_VERSION \ COMPILER_FREEBSD_VERSION \ XCC X_COMPILER_TYPE X_COMPILER_FEATURES X_COMPILER_VERSION \ X_COMPILER_FREEBSD_VERSION TEST_SYSTEM_LINKER_VARS= \ USING_SYSTEM_LINKER MK_SYSTEM_LINKER \ MK_LLD_BOOTSTRAP MK_BINUTILS_BOOTSTRAP \ WANT_LINKER_TYPE WANT_LINKER_VERSION WANT_LINKER_VERSION_FILE \ WANT_LINKER_FREEBSD_VERSION WANT_LINKER_FREEBSD_VERSION_FILE \ LD LINKER_TYPE LINKER_FEATURES LINKER_VERSION \ LINKER_FREEBSD_VERSION \ XLD X_LINKER_TYPE X_LINKER_FEATURES X_LINKER_VERSION \ X_LINKER_FREEBSD_VERSION .for _t in compiler linker test-system-${_t}: .PHONY .for v in ${TEST_SYSTEM_${_t:tu}_VARS} ${_+_}@printf "%-35s= %s\n" "${v}" "${${v}}" .endfor .endfor .if (make(buildworld) || make(buildkernel) || make(kernel-toolchain) || \ make(toolchain) || make(_cross-tools)) .if ${USING_SYSTEM_COMPILER} == "yes" .info SYSTEM_COMPILER: Determined that CC=${CC} matches the source tree. Not bootstrapping a cross-compiler. .elif ${MK_CLANG_BOOTSTRAP} == "yes" .info SYSTEM_COMPILER: libclang will be built for bootstrapping a cross-compiler. .endif .if ${USING_SYSTEM_LINKER} == "yes" .info SYSTEM_LINKER: Determined that LD=${LD} matches the source tree. Not bootstrapping a cross-linker. .elif ${MK_LLD_BOOTSTRAP} == "yes" .info SYSTEM_LINKER: libclang will be built for bootstrapping a cross-linker. .endif .endif # End WITH_SYSTEM_COMPILER / WITH_SYSTEM_LD # Store some compiler metadata for use in installworld where we don't # want to invoke CC at all. 
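# Sketch of the generated toolchain-metadata.mk (values are invented examples;
# the real contents are written by the rule below):
#	_LOADED_TOOLCHAIN_METADATA=t
#	COMPILER_TYPE=clang
#	X_COMPILER_TYPE=clang
#	...
#	.export COMPILER_TYPE ...
# installworld and friends then include this file instead of probing CC.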
_TOOLCHAIN_METADATA_VARS= COMPILER_VERSION \ COMPILER_TYPE \ COMPILER_FEATURES \ COMPILER_FREEBSD_VERSION \ LINKER_VERSION \ LINKER_FEATURES \ LINKER_TYPE \ LINKER_FREEBSD_VERSION toolchain-metadata.mk: .PHONY .META @: > ${.TARGET} @echo ".info Using cached toolchain metadata from build at $$(hostname) on $$(date)" \ > ${.TARGET} @echo "_LOADED_TOOLCHAIN_METADATA=t" >> ${.TARGET} .for v in ${_TOOLCHAIN_METADATA_VARS} @echo "${v}=${${v}}" >> ${.TARGET} @echo "X_${v}=${X_${v}}" >> ${.TARGET} .endfor @echo ".export ${_TOOLCHAIN_METADATA_VARS}" >> ${.TARGET} @echo ".export ${_TOOLCHAIN_METADATA_VARS:C,^,X_,}" >> ${.TARGET} # We must do lib/ and libexec/ before bin/ in case of a mid-install error to # keep the user's system reasonably usable. For static->dynamic root upgrades, # we don't want to install a dynamic binary without rtld and the needed # libraries. More commonly, for dynamic root, we don't want to install a # binary that requires a newer library version that hasn't been installed yet. # This ordering is not a guarantee though. The only guarantee of a working # system here would require fine-grained ordering of all components based # on their dependencies. .if !empty(SUBDIR_OVERRIDE) SUBDIR= ${SUBDIR_OVERRIDE} .else SUBDIR= lib libexec # Add LOCAL_LIB_DIRS, but only if they will not be picked up as a SUBDIR # of a LOCAL_DIRS directory. This allows LOCAL_DIRS=foo and # LOCAL_LIB_DIRS=foo/lib to behave as expected. .for _DIR in ${LOCAL_DIRS:M*/} ${LOCAL_DIRS:N*/:S|$|/|} _REDUNDANT_LIB_DIRS+= ${LOCAL_LIB_DIRS:M${_DIR}*} .endfor .for _DIR in ${LOCAL_LIB_DIRS} .if ${_DIR} == ".WAIT" || (empty(_REDUNDANT_LIB_DIRS:M${_DIR}) && exists(${.CURDIR}/${_DIR}/Makefile)) SUBDIR+= ${_DIR} .endif .endfor .if !defined(NO_ROOT) && (make(installworld) || make(install)) # Ensure libraries are installed before progressing. SUBDIR+=.WAIT .endif SUBDIR+=bin .if ${MK_CDDL} != "no" SUBDIR+=cddl .endif SUBDIR+=gnu include .if ${MK_KERBEROS} != "no" SUBDIR+=kerberos5 .endif .if ${MK_RESCUE} != "no" SUBDIR+=rescue .endif SUBDIR+=sbin .if ${MK_CRYPT} != "no" SUBDIR+=secure .endif .if !defined(NO_SHARE) SUBDIR+=share .endif .if ${MK_BOOT} != "no" SUBDIR+=stand .endif SUBDIR+=sys usr.bin usr.sbin .if ${MK_TESTS} != "no" SUBDIR+= tests .endif # Local directories are built in parallel with the base system directories. # Users may insert a .WAIT directive at the beginning or elsewhere within # the LOCAL_DIRS and LOCAL_LIB_DIRS lists as needed. .for _DIR in ${LOCAL_DIRS} .if ${_DIR} == ".WAIT" || exists(${.CURDIR}/${_DIR}/Makefile) SUBDIR+= ${_DIR} .endif .endfor # We must do etc/ last as it hooks into building the man whatis file # by calling 'makedb' in share/man. This is only relevant for # install/distribute so they build the whatis file after every manpage is # installed. .if make(installworld) || make(install) SUBDIR+=.WAIT .endif SUBDIR+=etc .endif # !empty(SUBDIR_OVERRIDE) .if defined(NOCLEAN) .warning NOCLEAN option is deprecated. Use NO_CLEAN instead. NO_CLEAN= ${NOCLEAN} .endif .if defined(NO_CLEANDIR) CLEANDIR= clean cleandepend .else CLEANDIR= cleandir .endif .if defined(WORLDFAST) NO_CLEAN= t NO_OBJWALK= t .endif .if ${MK_META_MODE} == "yes" # If filemon is used then we can rely on the build being incremental-safe. # The .meta files will also track the build command and rebuild should # it change.
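# (Aside, describing typical usage rather than anything defined here: META_MODE
# is usually enabled with WITH_META_MODE=yes in src-env.conf, and the
# incremental-safe case below additionally needs the filemon(4) module loaded
# so that .MAKE.MODE does not end up containing "nofilemon".)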
.if empty(.MAKE.MODE:Mnofilemon) NO_CLEAN= t .endif .endif .if defined(NO_OBJWALK) || ${MK_AUTO_OBJ} == "yes" NO_OBJWALK= t NO_KERNELOBJ= t .endif .if !defined(NO_OBJWALK) _obj= obj .endif LOCAL_TOOL_DIRS?= PACKAGEDIR?= ${DESTDIR}/${DISTDIR} .if empty(SHELL:M*csh*) BUILDENV_SHELL?=${SHELL} .else BUILDENV_SHELL?=/bin/sh .endif .if !defined(_MKSHOWCONFIG) .if !defined(SVN_CMD) || empty(SVN_CMD) . for _P in /usr/bin /usr/local/bin . for _S in svn svnlite . if exists(${_P}/${_S}) SVN_CMD= ${_P}/${_S} . endif . endfor . endfor .export SVN_CMD .endif SVNFLAGS?= -r HEAD .if !defined(VCS_REVISION) || empty(VCS_REVISION) .if !defined(SVNVERSION_CMD) || empty(SVNVERSION_CMD) . for _D in ${PATH:S,:, ,g} . if exists(${_D}/svnversion) SVNVERSION_CMD?=${_D}/svnversion . endif . if exists(${_D}/svnliteversion) SVNVERSION_CMD?=${_D}/svnliteversion . endif . endfor .endif _VCS_REVISION?= $$(eval ${SVNVERSION_CMD} ${SRCDIR}) . if !empty(_VCS_REVISION) VCS_REVISION= $$(echo r${_VCS_REVISION}) . endif .export VCS_REVISION .endif .if !defined(OSRELDATE) .if exists(/usr/include/osreldate.h) OSRELDATE!= awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ /usr/include/osreldate.h .else OSRELDATE= 0 .endif .export OSRELDATE .endif # Set VERSION for CTFMERGE to use via the default CTFFLAGS=-L VERSION. .if !defined(_REVISION) _REVISION!= ${MAKE} -C ${SRCDIR}/release MK_AUTO_OBJ=no -V REVISION .export _REVISION .endif .if !defined(_BRANCH) _BRANCH!= ${MAKE} -C ${SRCDIR}/release MK_AUTO_OBJ=no -V BRANCH .export _BRANCH .endif .if !defined(SRCRELDATE) SRCRELDATE!= awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ ${SRCDIR}/sys/sys/param.h .export SRCRELDATE .endif .if !defined(VERSION) VERSION= FreeBSD ${_REVISION}-${_BRANCH:C/-p[0-9]+$//} ${TARGET_ARCH} ${SRCRELDATE} .export VERSION .endif .if !defined(PKG_VERSION) .if ${_BRANCH:MSTABLE*} || ${_BRANCH:MCURRENT*} TIMENOW= %Y%m%d%H%M%S EXTRA_REVISION= .s${TIMENOW:gmtime} .elif ${_BRANCH:MALPHA*} EXTRA_REVISION= _${_BRANCH:C/-ALPHA/.a/} .elif ${_BRANCH:MBETA*} EXTRA_REVISION= _${_BRANCH:C/-BETA/.b/} .elif ${_BRANCH:MRC*} EXTRA_REVISION= _${_BRANCH:C/-RC/.r/} .elif ${_BRANCH:MPRERELEASE*} EXTRA_REVISION= _${_BRANCH:C/-PRERELEASE/.p/} .elif ${_BRANCH:M*-p*} EXTRA_REVISION= _${_BRANCH:C/.*-p([0-9]+$)/\1/} .endif PKG_VERSION= ${_REVISION}${EXTRA_REVISION} .endif .endif # !defined(PKG_VERSION) .if !defined(_MKSHOWCONFIG) _CPUTYPE!= MAKEFLAGS= CPUTYPE=${_TARGET_CPUTYPE} ${MAKE} -f /dev/null \ -m ${.CURDIR}/share/mk MK_AUTO_OBJ=no -V CPUTYPE .if ${_CPUTYPE} != ${_TARGET_CPUTYPE} .error CPUTYPE global should be set with ?=. .endif .endif .if make(buildworld) BUILD_ARCH!= uname -p .if ${MACHINE_ARCH} != ${BUILD_ARCH} .error To cross-build, set TARGET_ARCH. .endif .endif WORLDTMP?= ${OBJTOP}/tmp BPATH= ${CCACHE_WRAPPER_PATH_PFX}${WORLDTMP}/legacy/usr/sbin:${WORLDTMP}/legacy/usr/bin:${WORLDTMP}/legacy/bin XPATH= ${WORLDTMP}/usr/sbin:${WORLDTMP}/usr/bin # When building we want to find the cross tools before the host tools in ${BPATH}. # We also need to add UNIVERSE_TOOLCHAIN_PATH so that we can find the shared # toolchain files (clang, lld, etc.) during make universe/tinderbox STRICTTMPPATH= ${XPATH}:${BPATH}:${UNIVERSE_TOOLCHAIN_PATH} # We should not be using tools from /usr/bin accidentally since this could cause # the build to break on other systems that don't have that tool. For now we # still allow using the old behaviour (inheriting $PATH) if # BUILD_WITH_STRICT_TMPPATH is set to 0 but this will eventually be removed. 
# Currently strict $PATH can cause build failures and does not work yet with # USING_SYSTEM_LINKER/USING_SYSTEM_COMPILER. Once these issues have been # resolved it will be turned on by default. BUILD_WITH_STRICT_TMPPATH?=0 .if ${BUILD_WITH_STRICT_TMPPATH} != 0 TMPPATH= ${STRICTTMPPATH} .else TMPPATH= ${STRICTTMPPATH}:${PATH} .endif # # Avoid running mktemp(1) unless actually needed. # It may not be functional, e.g., due to new ABI # when in the middle of installing over this system. # .if make(distributeworld) || make(installworld) || make(stageworld) .if ${BUILD_WITH_STRICT_TMPPATH} != 0 MKTEMP=${WORLDTMP}/legacy/usr/bin/mktemp .if !exists(${MKTEMP}) .error "mktemp binary doesn't exist in expected location: ${MKTEMP}" .endif .else MKTEMP=mktemp .endif INSTALLTMP!= ${MKTEMP} -d -u -t install .endif .if make(stagekernel) || make(distributekernel) TAGS+= kernel PACKAGE= kernel .endif # # Building a world goes through the following stages # # 1. legacy stage [BMAKE] # This stage is responsible for creating compatibility # shims that are needed by the bootstrap-tools, # build-tools and cross-tools stages. These are generally # APIs that tools from one of those three stages need to # build that aren't present on the host. # 1. bootstrap-tools stage [BMAKE] # This stage is responsible for creating programs that # are needed for backward compatibility reasons. They # are not built as cross-tools. # 2. build-tools stage [TMAKE] # This stage is responsible for creating the object # tree and building any tools that are needed during # the build process. Some programs are listed during # this phase because they build binaries to generate # files needed to build these programs. This stage also # builds the 'build-tools' target rather than 'all'. # 3. cross-tools stage [XMAKE] # This stage is responsible for creating any tools that # are needed for building the system. A cross-compiler is one # of them. This differs from build tools in two ways: # 1. the 'all' target is built rather than 'build-tools' # 2. these tools are installed into TMPPATH for stage 4. # 4. world stage [WMAKE] # This stage actually builds the world. # 5. install stage (optional) [IMAKE] # This stage installs a previously built world. # BOOTSTRAPPING?= 0 # Keep these in sync MINIMUM_SUPPORTED_OSREL?= 1002501 MINIMUM_SUPPORTED_REL?= 10.3 # Common environment for world related stages CROSSENV+= \ MACHINE_ARCH=${TARGET_ARCH} \ MACHINE=${TARGET} \ CPUTYPE=${TARGET_CPUTYPE} .if ${MK_META_MODE} != "no" # Don't rebuild build-tools targets during normal build. 
CROSSENV+= BUILD_TOOLS_META=.NOMETA .endif .if defined(TARGET_CFLAGS) CROSSENV+= ${TARGET_CFLAGS} .endif BOOTSTRAPPING_OSRELDATE?=${OSRELDATE} # bootstrap-tools stage BMAKEENV= INSTALL="sh ${.CURDIR}/tools/install.sh" \ TOOLS_PREFIX=${TOOLS_PREFIX_UNDEF:U${WORLDTMP}} \ PATH=${BPATH}:${PATH} \ WORLDTMP=${WORLDTMP} \ MAKEFLAGS="-m ${.CURDIR}/tools/build/mk ${.MAKEFLAGS}" # need to keep this in sync with targets/pseudo/bootstrap-tools/Makefile BSARGS= DESTDIR= \ OBJTOP='${WORLDTMP}/obj-tools' \ OBJROOT='$${OBJTOP}/' \ MAKEOBJDIRPREFIX= \ BOOTSTRAPPING=${BOOTSTRAPPING_OSRELDATE} \ BWPHASE=${.TARGET:C,^_,,} \ SSP_CFLAGS= \ MK_HTML=no NO_LINT=yes MK_MAN=no \ -DNO_PIC MK_PROFILE=no -DNO_SHARED \ -DNO_CPU_CFLAGS MK_WARNS=no MK_CTF=no \ MK_CLANG_EXTRAS=no MK_CLANG_FULL=no \ MK_LLDB=no MK_RETPOLINE=no MK_TESTS=no \ MK_INCLUDES=yes BMAKE= \ ${BMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ ${BSARGS} .if empty(.MAKEOVERRIDES:MMK_LLVM_TARGET_ALL) BMAKE+= MK_LLVM_TARGET_ALL=no .endif # build-tools stage TMAKE= \ ${BMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ DESTDIR= \ BOOTSTRAPPING=${BOOTSTRAPPING_OSRELDATE} \ BWPHASE=${.TARGET:C,^_,,} \ SSP_CFLAGS= \ -DNO_LINT \ -DNO_CPU_CFLAGS MK_WARNS=no MK_CTF=no \ MK_CLANG_EXTRAS=no MK_CLANG_FULL=no \ MK_LLDB=no MK_RETPOLINE=no MK_TESTS=no # cross-tools stage # TOOLS_PREFIX set in BMAKE XMAKE= ${BMAKE} \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ MK_GDB=no MK_TESTS=no # kernel-tools stage KTMAKEENV= INSTALL="sh ${.CURDIR}/tools/install.sh" \ PATH=${BPATH}:${PATH} \ WORLDTMP=${WORLDTMP} KTMAKE= \ TOOLS_PREFIX=${TOOLS_PREFIX_UNDEF:U${WORLDTMP}} \ ${KTMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ DESTDIR= \ OBJTOP='${WORLDTMP}/obj-kernel-tools' \ OBJROOT='$${OBJTOP}/' \ MAKEOBJDIRPREFIX= \ BOOTSTRAPPING=${BOOTSTRAPPING_OSRELDATE} \ SSP_CFLAGS= \ MK_HTML=no -DNO_LINT MK_MAN=no \ -DNO_PIC MK_PROFILE=no -DNO_SHARED \ -DNO_CPU_CFLAGS MK_RETPOLINE=no MK_WARNS=no MK_CTF=no # world stage WMAKEENV= ${CROSSENV} \ INSTALL="sh ${.CURDIR}/tools/install.sh" \ PATH=${TMPPATH} \ SYSROOT=${WORLDTMP} # make hierarchy HMAKE= PATH=${TMPPATH} ${MAKE} LOCAL_MTREE=${LOCAL_MTREE:Q} .if defined(NO_ROOT) HMAKE+= PATH=${TMPPATH} METALOG=${METALOG} -DNO_ROOT .endif CROSSENV+= CC="${XCC} ${XCFLAGS}" CXX="${XCXX} ${XCXXFLAGS} ${XCFLAGS}" \ CPP="${XCPP} ${XCFLAGS}" \ AS="${XAS}" AR="${XAR}" LD="${XLD}" LLVM_LINK="${XLLVM_LINK}" \ NM=${XNM} OBJCOPY="${XOBJCOPY}" \ RANLIB=${XRANLIB} STRINGS=${XSTRINGS} \ SIZE="${XSIZE}" .if defined(CROSS_BINUTILS_PREFIX) && exists(${CROSS_BINUTILS_PREFIX}) # In the case of xdev-build tools, CROSS_BINUTILS_PREFIX won't be a # directory, but the compiler will look in the right place for its # tools so we don't need to tell it where to look. BFLAGS+= -B${CROSS_BINUTILS_PREFIX} .endif # The internal bootstrap compiler has a default sysroot set by TOOLS_PREFIX # and target set by TARGET/TARGET_ARCH. However, there are several needs to # always pass an explicit --sysroot and -target. # - External compiler needs sysroot and target flags. # - External ld needs sysroot. # - To be clear about the use of a sysroot when using the internal compiler. # - Easier debugging. # - Allowing WITH_SYSTEM_COMPILER+WITH_META_MODE to work together due to # the flip-flopping build command when sometimes using external and # sometimes using internal. # - Allow using lld which has no support for default paths. 
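# Rough illustration only (the exact flags are assembled just below and via
# BFLAGS): with clang as the bootstrap/external compiler the world compiler
# ends up invoked along the lines of
#	${XCC} -target ${TARGET_TRIPLE} --sysroot=${WORLDTMP} -B${WORLDTMP}/usr/bin ...
# so it never silently falls back to host headers or host binutils.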
.if !defined(CROSS_BINUTILS_PREFIX) || !exists(${CROSS_BINUTILS_PREFIX}) BFLAGS+= -B${WORLDTMP}/usr/bin .endif .if ${WANT_COMPILER_TYPE} == gcc || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == gcc) .elif ${WANT_COMPILER_TYPE} == clang || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == clang) XCFLAGS+= -target ${TARGET_TRIPLE} .endif XCFLAGS+= --sysroot=${WORLDTMP} .if !empty(BFLAGS) XCFLAGS+= ${BFLAGS} .endif .if ${MK_LIB32} != "no" && (${TARGET_ARCH} == "amd64" || \ ${TARGET_ARCH} == "powerpc64" || ${TARGET_ARCH:Mmips64*} != "") LIBCOMPAT= 32 .include "Makefile.libcompat" .elif ${MK_LIBSOFT} != "no" && ${TARGET_ARCH:Marmv[67]*} != "" LIBCOMPAT= SOFT .include "Makefile.libcompat" .endif # META_MODE normally ignores host file changes since every build updates # timestamps (see NO_META_IGNORE_HOST in sys.mk). There are known times # when the ABI breaks though that we want to force rebuilding WORLDTMP # to get updated host tools. .if ${MK_META_MODE} == "yes" && defined(NO_CLEAN) && \ !defined(NO_META_IGNORE_HOST) && !defined(NO_META_IGNORE_HOST_HEADERS) && \ !defined(_MKSHOWCONFIG) # r318736 - ino64 major ABI breakage META_MODE_BAD_ABI_VERS+= 1200031 .if !defined(OBJDIR_HOST_OSRELDATE) .if exists(${OBJTOP}/host-osreldate.h) OBJDIR_HOST_OSRELDATE!= \ awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ ${OBJTOP}/host-osreldate.h .elif exists(${WORLDTMP}/usr/include/osreldate.h) OBJDIR_HOST_OSRELDATE= 0 .endif .export OBJDIR_HOST_OSRELDATE .endif # Note that this logic is the opposite of normal BOOTSTRAP handling. We want # to compare the WORLDTMP's OSRELDATE to the host's OSRELDATE. If the WORLDTMP # is older than the ABI-breakage OSRELDATE of the HOST then we rebuild. .if defined(OBJDIR_HOST_OSRELDATE) .for _ver in ${META_MODE_BAD_ABI_VERS} .if ${OSRELDATE} >= ${_ver} && ${OBJDIR_HOST_OSRELDATE} < ${_ver} _meta_mode_need_rebuild= ${_ver} .endif .endfor .if defined(_meta_mode_need_rebuild) .info META_MODE: Rebuilding host tools due to ABI breakage in __FreeBSD_version ${_meta_mode_need_rebuild}. NO_META_IGNORE_HOST_HEADERS= 1 .export NO_META_IGNORE_HOST_HEADERS .endif # defined(_meta_mode_need_rebuild) .endif # defined(OBJDIR_HOST_OSRELDATE) .endif # ${MK_META_MODE} == "yes" && defined(NO_CLEAN) ... # This is only used for META_MODE+filemon to track what the oldest # __FreeBSD_version is in WORLDTMP. This purposely does NOT have # a make dependency on /usr/include/osreldate.h as the file should # only be copied when it is missing or meta mode determines it has changed. # Since host files are normally ignored without NO_META_IGNORE_HOST # the file will never be updated unless that flag is specified. This # allows tracking the oldest osreldate to force rebuilds via # META_MODE_BADABI_REVS above. host-osreldate.h: # DO NOT ADD /usr/include/osreldate.h here @cp -f /usr/include/osreldate.h ${.TARGET} WMAKE= ${WMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ BWPHASE=${.TARGET:C,^_,,} \ DESTDIR=${WORLDTMP} IMAKEENV= ${CROSSENV} IMAKE= ${IMAKEENV} ${MAKE} -f Makefile.inc1 \ ${IMAKE_INSTALL} ${IMAKE_MTREE} .if empty(.MAKEFLAGS:M-n) IMAKEENV+= PATH=${STRICTTMPPATH}:${INSTALLTMP} \ LD_LIBRARY_PATH=${INSTALLTMP} \ PATH_LOCALE=${INSTALLTMP}/locale IMAKE+= __MAKE_SHELL=${INSTALLTMP}/sh .else IMAKEENV+= PATH=${TMPPATH}:${INSTALLTMP} .endif # When generating install media, do not allow user and group information from # the build host to affect the contents of the distribution. 
.if make(distributeworld) || make(distrib-dirs) || make(distribution) DB_FROM_SRC= yes .endif .if defined(DB_FROM_SRC) INSTALLFLAGS+= -N ${.CURDIR}/etc MTREEFLAGS+= -N ${.CURDIR}/etc .endif _INSTALL_DDIR= ${DESTDIR}/${DISTDIR} INSTALL_DDIR= ${_INSTALL_DDIR:S://:/:g:C:/$::} .if defined(NO_ROOT) METALOG?= ${DESTDIR}/${DISTDIR}/METALOG METALOG:= ${METALOG:C,//+,/,g} IMAKE+= -DNO_ROOT METALOG=${METALOG} INSTALLFLAGS+= -U -M ${METALOG} -D ${INSTALL_DDIR} MTREEFLAGS+= -W .endif .if defined(BUILD_PKGS) INSTALLFLAGS+= -h sha256 .endif .if defined(DB_FROM_SRC) || defined(NO_ROOT) IMAKE_INSTALL= INSTALL="install ${INSTALLFLAGS}" IMAKE_MTREE= MTREE_CMD="mtree ${MTREEFLAGS}" .endif DESTDIR_MTREEFLAGS= -deU # When creating worldtmp we don't need to set the directories as owned by root # so we also pass -W WORLDTMP_MTREEFLAGS= -deUW .if defined(NO_ROOT) # When building with -DNO_ROOT we shouldn't be changing the directories # that are created by mtree to be owned by root/wheel. DESTDIR_MTREEFLAGS+= -W .endif MTREE?= mtree .if ${BUILD_WITH_STRICT_TMPPATH} != 0 MTREE= ${WORLDTMP}/legacy/usr/sbin/mtree .endif WORLDTMP_MTREE= ${MTREE} ${WORLDTMP_MTREEFLAGS} DESTDIR_MTREE= ${MTREE} ${DESTDIR_MTREEFLAGS} # kernel stage KMAKEENV= ${WMAKEENV:NSYSROOT=*} KMAKE= ${KMAKEENV} ${MAKE} ${.MAKEFLAGS} ${KERNEL_FLAGS} KERNEL=${INSTKERNNAME} # # buildworld # # Attempt to rebuild the entire system, with reasonable chance of # success, regardless of how old your existing system is. # _sanity_check: .PHONY .MAKE .if ${.CURDIR:C/[^,]//g} != "" # The m4 build of sendmail files doesn't like it if ',' is used # anywhere in the path of its files. @echo @echo "*** Error: path to source tree contains a comma ','" @echo @false .elif ${.CURDIR:M*\:*} != "" # Using ':' leaks into PATH and breaks finding cross-tools. @echo @echo "*** Error: path to source tree contains a colon ':'" @echo @false .endif # Our current approach to dependency tracking cannot cope with certain source # tree changes, particularly with respect to removing source files and # replacing generated files. Handle these cases here in an ad-hoc fashion.
_cleanobj_fast_depend_hack: .PHONY # Syscall stubs rewritten in C and obsolete MD assembly implementations # Date SVN Rev Syscalls # 20180604 r334626 brk sbrk .for f in brk sbrk @if [ -e "${OBJTOP}/lib/libc/.depend.${f}.o" ] && \ egrep -qw '${f}\.[sS]' ${OBJTOP}/lib/libc/.depend.${f}.o; then \ echo "Removing stale dependencies for ${f} syscall wrappers"; \ rm -f ${OBJTOP}/lib/libc/.depend.${f}.* \ ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.${f}.*}; \ fi .endfor # 20181013 r339348 bcopy reimplemented as .c .for f in bcopy memcpy memmove @if [ -e "${OBJTOP}/lib/libc/.depend.${f}.o" ] && \ egrep -qw 'bcopy\.[sS]' ${OBJTOP}/lib/libc/.depend.${f}.o; then \ echo "Removing stale dependencies for bcopy"; \ rm -f ${OBJTOP}/lib/libc/.depend.${f}.* \ ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.${f}.*}; \ fi .endfor # 20181115 r340463 bzero reimplemented as .c @if [ -e "${OBJTOP}/lib/libc/.depend.bzero.o" ] && \ egrep -qw 'bzero\.[sS]' ${OBJTOP}/lib/libc/.depend.bzero.o; then \ echo "Removing stale dependencies for bzero"; \ rm -f ${OBJTOP}/lib/libc/.depend.bzero.* \ ${LIBCOMPAT:D${LIBCOMPAT_OBJTOP}/lib/libc/.depend.bzero.*}; \ fi # 20181009 track migration from ntp's embedded libevent to updated one @if [ -e "${OBJTOP}/usr.sbin/ntp/libntpevent/.depend.bufferevent_openssl.o" ] && \ egrep -q 'contrib/ntp/sntp/libevent/bufferevent_openssl.c' \ ${OBJTOP}/usr.sbin/ntp/libntpevent/.depend.bufferevent_openssl.o ; then \ echo "Removing stale libevent dependencies"; \ rm -f ${OBJTOP}/usr.sbin/ntp/libntpevent/.depend.*; \ fi # 20181209 r341759 track migration across wpa update @if [ -e "${OBJTOP}/usr.sbin/wpa/wpa_supplicant/.depend.rrm.o" ] && \ egrep -q 'src/ap/rrm.c' \ ${OBJTOP}/usr.sbin/wpa/wpa_supplicant/.depend.rrm.o; then \ echo "Removing stale wpa dependencies"; \ rm -f ${OBJTOP}/usr.sbin/wpa/*/.depend*; \ fi _worldtmp: .PHONY @echo @echo "--------------------------------------------------------------" @echo ">>> Rebuilding the temporary build tree" @echo "--------------------------------------------------------------" .if !defined(NO_CLEAN) rm -rf ${WORLDTMP} .else # Note: for delete-old we need to set $PATH to also include the host $PATH # since otherwise a partial build with missing symlinks in ${WORLDTMP}/legacy/ # will fail to run due to missing binaries. $WMAKE sets PATH to only ${TMPPATH} # so we remove that assignment from $WMAKE and prepend the new $PATH ${_+_}@if [ -e "${WORLDTMP}" ]; then \ echo ">>> Deleting stale files in build tree..."; \ cd ${.CURDIR}; env PATH=${TMPPATH}:${PATH} ${WMAKE:NPATH=*} \ _NO_INCLUDE_COMPILERMK=t -DBATCH_DELETE_OLD_FILES delete-old \ delete-old-libs >/dev/null; \ fi rm -rf ${WORLDTMP}/legacy/usr/include .if ${USING_SYSTEM_COMPILER} == "yes" .for cc in cc c++ if [ -x ${WORLDTMP}/usr/bin/${cc} ]; then \ inum=$$(stat -f %i ${WORLDTMP}/usr/bin/${cc}); \ find ${WORLDTMP}/usr/bin -inum $${inum} -delete; \ fi .endfor .endif # ${USING_SYSTEM_COMPILER} == "yes" .if ${USING_SYSTEM_LINKER} == "yes" @rm -f ${WORLDTMP}/usr/bin/ld ${WORLDTMP}/usr/bin/ld.lld .endif # ${USING_SYSTEM_LINKER} == "yes" .endif # !defined(NO_CLEAN) @mkdir -p ${WORLDTMP} @touch ${WORLDTMP}/${.TARGET} # We can't use mtree to create the worldtmp directories since it may not be # available on the target system (this happens e.g.
when building on non-FreeBSD) cd ${.CURDIR}/tools/build; \ ${MAKE} DIRPRFX=tools/build/ DESTDIR=${WORLDTMP}/legacy installdirs # In order to build without inheriting $PATH we need to add symlinks to the host # tools in $WORLDTMP for the tools that we don't build during bootstrap-tools cd ${.CURDIR}/tools/build; \ ${MAKE} DIRPRFX=tools/build/ DESTDIR=${WORLDTMP}/legacy host-symlinks _legacy: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1.1: legacy release compatibility shims" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${BMAKE} legacy _bootstrap-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1.2: bootstrap tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${BMAKE} bootstrap-tools mkdir -p ${WORLDTMP}/usr ${WORLDTMP}/lib/casper ${WORLDTMP}/lib/geom ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${WORLDTMP}/usr >/dev/null ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${WORLDTMP}/usr/include >/dev/null ln -sf ${.CURDIR}/sys ${WORLDTMP} .if ${MK_DEBUG_FILES} != "no" ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.debug.dist \ -p ${WORLDTMP}/usr/lib >/dev/null .endif .for _mtree in ${LOCAL_MTREE} ${WORLDTMP_MTREE} -f ${.CURDIR}/${_mtree} -p ${WORLDTMP} > /dev/null .endfor _cleanobj: .if !defined(NO_CLEAN) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.1: cleaning up the object tree" @echo "--------------------------------------------------------------" # Avoid including bsd.compiler.mk in clean and obj with _NO_INCLUDE_COMPILERMK # since the restricted $PATH might not contain a valid cc binary ${_+_}cd ${.CURDIR}; ${WMAKE} _NO_INCLUDE_COMPILERMK=t ${CLEANDIR} .if defined(LIBCOMPAT) ${_+_}cd ${.CURDIR}; ${LIBCOMPATWMAKE} _NO_INCLUDE_COMPILERMK=t -f Makefile.inc1 ${CLEANDIR} .endif .else ${_+_}cd ${.CURDIR}; ${WMAKE} _NO_INCLUDE_COMPILERMK=t _cleanobj_fast_depend_hack .endif # !defined(NO_CLEAN) _obj: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.2: rebuilding the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${WMAKE} _NO_INCLUDE_COMPILERMK=t obj _build-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.3: build tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${TMAKE} build-tools _cross-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3: cross tools" @echo "--------------------------------------------------------------" @rm -f ${OBJTOP}/toolchain-metadata.mk ${_+_}cd ${.CURDIR}; ${XMAKE} cross-tools ${_+_}cd ${.CURDIR}; ${XMAKE} kernel-tools _build-metadata: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3.1: recording build metadata" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${WMAKE} toolchain-metadata.mk ${_+_}cd ${.CURDIR}; ${WMAKE} host-osreldate.h _includes: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 4.1: building includes" @echo "--------------------------------------------------------------" # Special handling for SUBDIR_OVERRIDE in buildworld as they most likely need # headers from default SUBDIR. 
Do SUBDIR_OVERRIDE includes last. ${_+_}cd ${.CURDIR}; ${WMAKE} SUBDIR_OVERRIDE= SHARED=symlinks \ MK_INCLUDES=yes includes .if !empty(SUBDIR_OVERRIDE) && make(buildworld) ${_+_}cd ${.CURDIR}; ${WMAKE} MK_INCLUDES=yes SHARED=symlinks includes .endif _libraries: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 4.2: building libraries" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; \ ${WMAKE} -DNO_FSCHG MK_HTML=no -DNO_LINT MK_MAN=no \ MK_PROFILE=no MK_TESTS=no MK_TESTS_SUPPORT=${MK_TESTS} libraries everything: .PHONY @echo @echo "--------------------------------------------------------------" @echo ">>> stage 4.3: building everything" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; _PARALLEL_SUBDIR_OK=1 ${WMAKE} all WMAKE_TGTS= .if !defined(WORLDFAST) WMAKE_TGTS+= _sanity_check _worldtmp _legacy .if empty(SUBDIR_OVERRIDE) WMAKE_TGTS+= _bootstrap-tools .endif WMAKE_TGTS+= _cleanobj .if !defined(NO_OBJWALK) WMAKE_TGTS+= _obj .endif WMAKE_TGTS+= _build-tools _cross-tools WMAKE_TGTS+= _build-metadata WMAKE_TGTS+= _includes .endif .if !defined(NO_LIBS) WMAKE_TGTS+= _libraries .endif WMAKE_TGTS+= everything .if defined(LIBCOMPAT) && empty(SUBDIR_OVERRIDE) WMAKE_TGTS+= build${libcompat} .endif # record buildworld time in seconds .if make(buildworld) _BUILDWORLD_START!= date '+%s' .export _BUILDWORLD_START .endif buildworld: buildworld_prologue ${WMAKE_TGTS} buildworld_epilogue .PHONY .ORDER: buildworld_prologue ${WMAKE_TGTS} buildworld_epilogue buildworld_prologue: .PHONY @echo "--------------------------------------------------------------" @echo ">>> World build started on `LC_ALL=C date`" @echo "--------------------------------------------------------------" buildworld_epilogue: .PHONY @echo @echo "--------------------------------------------------------------" @echo ">>> World build completed on `LC_ALL=C date`" @seconds=$$(($$(date '+%s') - ${_BUILDWORLD_START})); \ echo -n ">>> World built in $$seconds seconds, "; \ echo "ncpu: $$(sysctl -n hw.ncpu)${.MAKE.JOBS:S/^/, make -j/}" @echo "--------------------------------------------------------------" # # We need to have this as a target because the indirection between Makefile # and Makefile.inc1 causes the correct PATH to be used, rather than a # modification of the current environment's PATH. In addition, we need # to quote multiword values. # buildenvvars: .PHONY @echo ${WMAKEENV:Q} ${.MAKE.EXPORTED:@v@$v=\"${$v}\"@} .if ${.TARGETS:Mbuildenv} .if ${.MAKEFLAGS:M-j} .error The buildenv target is incompatible with -j .endif .endif BUILDENV_DIR?= ${.CURDIR} # # Note: make will report any errors the shell reports. This can # be odd if the last command in an interactive shell generates an # error or is terminated by SIGINT. These reported errors look bad, # but are harmless. Allowing them also allows BUILDENV_SHELL to # be a complex command whose status will be returned to the caller. # Some scripts in tools rely on this behavior to report build errors.
# buildenv: .PHONY @echo Entering world for ${TARGET_ARCH}:${TARGET} .if ${BUILDENV_SHELL:M*zsh*} @echo For ZSH you must run: export CPUTYPE=${TARGET_CPUTYPE} .endif @cd ${BUILDENV_DIR} && env ${WMAKEENV} BUILDENV=1 ${BUILDENV_SHELL} TOOLCHAIN_TGTS= ${WMAKE_TGTS:Neverything:Nbuild${libcompat}} toolchain: ${TOOLCHAIN_TGTS} .PHONY KERNEL_TOOLCHAIN_TGTS= ${TOOLCHAIN_TGTS:N_obj:N_cleanobj:N_includes:N_libraries} .if make(kernel-toolchain) .ORDER: ${KERNEL_TOOLCHAIN_TGTS} .endif kernel-toolchain: ${KERNEL_TOOLCHAIN_TGTS} .PHONY # # installcheck # # Checks to be sure system is ready for installworld/installkernel. # installcheck: _installcheck_world _installcheck_kernel .PHONY _installcheck_world: .PHONY @echo "--------------------------------------------------------------" @echo ">>> Install check world" @echo "--------------------------------------------------------------" _installcheck_kernel: .PHONY @echo "--------------------------------------------------------------" @echo ">>> Install check kernel" @echo "--------------------------------------------------------------" # # Require DESTDIR to be set if installing for a different architecture or # using the user/group database in the source tree. # .if ${TARGET_ARCH} != ${MACHINE_ARCH} || ${TARGET} != ${MACHINE} || \ defined(DB_FROM_SRC) .if !make(distributeworld) _installcheck_world: __installcheck_DESTDIR _installcheck_kernel: __installcheck_DESTDIR __installcheck_DESTDIR: .PHONY .if !defined(DESTDIR) || empty(DESTDIR) @echo "ERROR: Please set DESTDIR!"; \ false .endif .endif .endif .if !defined(DB_FROM_SRC) # # Check for missing UIDs/GIDs. # CHECK_UIDS= auditdistd CHECK_GIDS= audit CHECK_UIDS+= ntpd CHECK_GIDS+= ntpd CHECK_UIDS+= proxy CHECK_GIDS+= proxy authpf CHECK_UIDS+= smmsp CHECK_GIDS+= smmsp CHECK_UIDS+= unbound CHECK_GIDS+= unbound _installcheck_world: __installcheck_UGID __installcheck_UGID: .PHONY .for uid in ${CHECK_UIDS} @if ! `id -u ${uid} >/dev/null 2>&1`; then \ echo "ERROR: Required ${uid} user is missing, see /usr/src/UPDATING."; \ false; \ fi .endfor .for gid in ${CHECK_GIDS} @if ! `find / -prune -group ${gid} >/dev/null 2>&1`; then \ echo "ERROR: Required ${gid} group is missing, see /usr/src/UPDATING."; \ false; \ fi .endfor .endif # # If installing over the running system (DESTDIR is / or unset) and the install # includes rescue, try running rescue from the objdir as a sanity check. If # rescue is not functional (e.g., because it depends on a system call not # supported by the currently running kernel), abort the installation. # .if !make(distributeworld) && ${MK_RESCUE} != "no" && \ (empty(DESTDIR) || ${DESTDIR} == "/") && empty(BYPASS_INSTALLCHECK_SH) _installcheck_world: __installcheck_sh_check __installcheck_sh_check: .PHONY @if [ "`${OBJTOP}/rescue/rescue/rescue sh -c 'echo OK'`" != \ OK ]; then \ echo "rescue/sh check failed, installation aborted" >&2; \ false; \ fi .endif # # Required install tools to be saved in a scratch dir for safety. # .if ${MK_ZONEINFO} != "no" _zoneinfo= zic tzsetup .endif ITOOLS= [ awk cap_mkdb cat chflags chmod chown cmp cp \ date echo egrep find grep id install ${_install-info} \ ln make mkdir mtree mv pwd_mkdb \ rm sed services_mkdb sh sort strip sysctl test true uname wc ${_zoneinfo} \ ${LOCAL_ITOOLS} # Needed for share/man .if ${MK_MAN_UTILS} != "no" ITOOLS+=makewhatis .endif # # distributeworld # # Distributes everything compiled by a `buildworld'. # # installworld # # Installs everything compiled by a 'buildworld'. 
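#
# A typical full upgrade sequence (illustrative only; build(7) and UPDATING
# are authoritative and include the etcupdate/mergemaster steps omitted here):
#	make buildworld
#	make buildkernel
#	make installkernel
#	(reboot)
#	make installworld
#	make delete-old
#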
# # Non-base distributions produced by the base system EXTRA_DISTRIBUTIONS= .if defined(LIBCOMPAT) EXTRA_DISTRIBUTIONS+= lib${libcompat} .endif .if ${MK_TESTS} != "no" EXTRA_DISTRIBUTIONS+= tests .endif DEBUG_DISTRIBUTIONS= .if ${MK_DEBUG_FILES} != "no" DEBUG_DISTRIBUTIONS+= base ${EXTRA_DISTRIBUTIONS:S,tests,,} .endif MTREE_MAGIC?= mtree 2.0 distributeworld installworld stageworld: _installcheck_world .PHONY mkdir -p ${INSTALLTMP} progs=$$(for prog in ${ITOOLS}; do \ if progpath=`which $$prog`; then \ echo $$progpath; \ else \ echo "Required tool $$prog not found in PATH." >&2; \ exit 1; \ fi; \ done); \ libs=$$(ldd -f "%o %p\n" -f "%o %p\n" $$progs 2>/dev/null | sort -u | \ while read line; do \ set -- $$line; \ if [ "$$2 $$3" != "not found" ]; then \ echo $$2; \ else \ echo "Required library $$1 not found." >&2; \ exit 1; \ fi; \ done); \ cp $$libs $$progs ${INSTALLTMP} cp -R $${PATH_LOCALE:-"/usr/share/locale"} ${INSTALLTMP}/locale .if defined(NO_ROOT) -mkdir -p ${METALOG:H} echo "#${MTREE_MAGIC}" > ${METALOG} .endif .if make(distributeworld) .for dist in ${EXTRA_DISTRIBUTIONS} -mkdir ${DESTDIR}/${DISTDIR}/${dist} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.root.dist \ -p ${DESTDIR}/${DISTDIR}/${dist} >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/include >/dev/null .if ${MK_DEBUG_FILES} != "no" ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.debug.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib >/dev/null .endif .if defined(LIBCOMPAT) ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr >/dev/null .if ${MK_DEBUG_FILES} != "no" ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib/debug/usr >/dev/null .endif .endif .if ${MK_TESTS} != "no" && ${dist} == "tests" -mkdir -p ${DESTDIR}/${DISTDIR}/${dist}${TESTSBASE} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}${TESTSBASE} >/dev/null .if ${MK_DEBUG_FILES} != "no" ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib/debug/${TESTSBASE} >/dev/null .endif .endif .if defined(NO_ROOT) ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.root.dist | \ sed -e 's#^\./#./${dist}/#' >> ${METALOG} ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.usr.dist | \ sed -e 's#^\./#./${dist}/usr/#' >> ${METALOG} ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.include.dist | \ sed -e 's#^\./#./${dist}/usr/include/#' >> ${METALOG} .if defined(LIBCOMPAT) ${IMAKEENV} ${MTREE} -C -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist | \ sed -e 's#^\./#./${dist}/usr/#' >> ${METALOG} .endif .endif .endfor -mkdir ${DESTDIR}/${DISTDIR}/base ${_+_}cd ${.CURDIR}/etc; ${CROSSENV} PATH=${TMPPATH} ${MAKE} \ METALOG=${METALOG} ${IMAKE_INSTALL} ${IMAKE_MTREE} \ DISTBASE=/base DESTDIR=${DESTDIR}/${DISTDIR}/base \ LOCAL_MTREE=${LOCAL_MTREE:Q} distrib-dirs ${INSTALL_SYMLINK} ${INSTALLFLAGS} usr/src/sys ${INSTALL_DDIR}/base/sys .endif ${_+_}cd ${.CURDIR}; ${IMAKE} re${.TARGET:S/world$//}; \ ${IMAKEENV} rm -rf ${INSTALLTMP} .if make(distributeworld) .for dist in ${EXTRA_DISTRIBUTIONS} find ${DESTDIR}/${DISTDIR}/${dist} -mindepth 1 -type d -empty -delete .endfor .if defined(NO_ROOT) .for dist in base ${EXTRA_DISTRIBUTIONS} @# For each file that exists in this dist, print the corresponding @# line from the METALOG. 
This relies on the fact that @# a line containing only the filename will sort immediately before @# the relevant mtree line. cd ${DESTDIR}/${DISTDIR}; \ find ./${dist} | sort -u ${METALOG} - | \ awk 'BEGIN { print "#${MTREE_MAGIC}" } !/ type=/ { file = $$1 } / type=/ { if ($$1 == file) { sub(/^\.\/${dist}\//, "./"); print } }' > \ ${DESTDIR}/${DISTDIR}/${dist}.meta .endfor .for dist in ${DEBUG_DISTRIBUTIONS} @# For each file that exists in this dist, print the corresponding @# line from the METALOG. This relies on the fact that @# a line containing only the filename will sort immediately before @# the relevant mtree line. cd ${DESTDIR}/${DISTDIR}; \ find ./${dist}/usr/lib/debug | sort -u ${METALOG} - | \ awk 'BEGIN { print "#${MTREE_MAGIC}" } !/ type=/ { file = $$1 } / type=/ { if ($$1 == file) { sub(/^\.\/${dist}\//, "./"); print } }' > \ ${DESTDIR}/${DISTDIR}/${dist}.debug.meta .endfor .endif .endif packageworld: .PHONY .for dist in base ${EXTRA_DISTRIBUTIONS} .if defined(NO_ROOT) ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - --exclude usr/lib/debug \ @${DESTDIR}/${DISTDIR}/${dist}.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}.txz .else ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - --exclude usr/lib/debug . | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}.txz .endif .endfor .for dist in ${DEBUG_DISTRIBUTIONS} . if defined(NO_ROOT) ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - @${DESTDIR}/${DISTDIR}/${dist}.debug.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}-dbg.txz . else ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvLf - usr/lib/debug | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}-dbg.txz . endif .endfor _sysent_dirs= sys/kern _sysent_dirs+= sys/compat/freebsd32 _sysent_dirs+= sys/amd64/linux \ sys/amd64/linux32 \ sys/arm64/linux \ sys/i386/linux sysent: .PHONY .for _dir in ${_sysent_dirs} ${_+_}${MAKE} -C ${.CURDIR}/${_dir} sysent .endfor # # reinstall # # If you have a build server, you can NFS mount the source and obj directories # and do a 'make reinstall' on the *client* to install new binaries from the # most recent server build. 
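# Illustrative client-side example (hostname and paths are placeholders):
#	mount buildserver:/usr/src /usr/src
#	mount buildserver:/usr/obj /usr/obj
#	cd /usr/src && make reinstall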
# restage reinstall: .MAKE .PHONY @echo "--------------------------------------------------------------" @echo ">>> Making hierarchy" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 \ LOCAL_MTREE=${LOCAL_MTREE:Q} hierarchy .if make(restage) @echo "--------------------------------------------------------------" @echo ">>> Making distribution" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 \ LOCAL_MTREE=${LOCAL_MTREE:Q} distribution .endif @echo @echo "--------------------------------------------------------------" @echo ">>> Installing everything started on `LC_ALL=C date`" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 install .if defined(LIBCOMPAT) ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 install${libcompat} .endif @echo "--------------------------------------------------------------" @echo ">>> Installing everything completed on `LC_ALL=C date`" @echo "--------------------------------------------------------------" redistribute: .MAKE .PHONY @echo "--------------------------------------------------------------" @echo ">>> Distributing everything" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 distribute .if defined(LIBCOMPAT) ${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 distribute${libcompat} \ DISTRIBUTION=lib${libcompat} .endif distrib-dirs distribution: .MAKE .PHONY ${_+_}cd ${.CURDIR}/etc; ${CROSSENV} PATH=${TMPPATH} ${MAKE} \ ${IMAKE_INSTALL} ${IMAKE_MTREE} METALOG=${METALOG} ${.TARGET} .if make(distribution) ${_+_}cd ${.CURDIR}; ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} -f Makefile.inc1 ${IMAKE_INSTALL} \ METALOG=${METALOG} MK_TESTS=no installconfig .endif # # buildkernel and installkernel # # Which kernels to build and/or install is specified by setting # KERNCONF. If not defined a GENERIC kernel is built/installed. # Only the existing (depending TARGET) config files are used # for building kernels and only the first of these is designated # as the one being installed. # # Note that we have to use TARGET instead of TARGET_ARCH when # we're in kernel-land. Since only TARGET_ARCH is (expected) to # be set to cross-build, we have to make sure TARGET is set # properly. .if defined(KERNFAST) NO_KERNELCLEAN= t NO_KERNELCONFIG= t NO_KERNELOBJ= t # Shortcut for KERNCONF=Blah -DKERNFAST is now KERNFAST=Blah .if !defined(KERNCONF) && ${KERNFAST} != "1" KERNCONF=${KERNFAST} .endif .endif .if ${TARGET_ARCH} == "powerpc64" KERNCONF?= GENERIC64 .else KERNCONF?= GENERIC .endif INSTKERNNAME?= kernel KERNSRCDIR?= ${.CURDIR}/sys KRNLCONFDIR= ${KERNSRCDIR}/${TARGET}/conf KRNLOBJDIR= ${OBJTOP}${KERNSRCDIR:C,^${.CURDIR},,} KERNCONFDIR?= ${KRNLCONFDIR} BUILDKERNELS= INSTALLKERNEL= .if defined(NO_INSTALLKERNEL) # All of the BUILDKERNELS loops start at index 1. 
BUILDKERNELS+= dummy .endif .for _kernel in ${KERNCONF} .if !defined(_MKSHOWCONFIG) && exists(${KERNCONFDIR}/${_kernel}) BUILDKERNELS+= ${_kernel} .if empty(INSTALLKERNEL) && !defined(NO_INSTALLKERNEL) INSTALLKERNEL= ${_kernel} .endif .else .if make(buildkernel) .error Missing KERNCONF ${KERNCONFDIR}/${_kernel} .endif .endif .endfor _cleankernobj_fast_depend_hack: .PHONY # 20180320 remove stale generated assym.s after renaming to .inc in r331254 @if [ -e "${OBJTOP}/sys/${KERNCONF}/assym.s" ]; then \ echo "Removing stale generated assym files"; \ rm -f ${OBJTOP}/sys/${KERNCONF}/assym.* \ ${OBJTOP}/sys/${KERNCONF}/.depend.assym.*; \ fi ${WMAKE_TGTS:N_worldtmp:Nbuild${libcompat}} ${.ALLTARGETS:M_*:N_worldtmp}: .MAKE .PHONY # record kernel(s) build time in seconds .if make(buildkernel) _BUILDKERNEL_START!= date '+%s' .endif # # buildkernel # # Builds all kernels defined by BUILDKERNELS. # buildkernel: .MAKE .PHONY .if empty(BUILDKERNELS:Ndummy) @echo "ERROR: Missing kernel configuration file(s) (${KERNCONF})."; \ false .endif @echo .for _kernel in ${BUILDKERNELS:Ndummy} @echo "--------------------------------------------------------------" @echo ">>> Kernel build for ${_kernel} started on `LC_ALL=C date`" @echo "--------------------------------------------------------------" @echo "===> ${_kernel}" mkdir -p ${KRNLOBJDIR} .if !defined(NO_KERNELCONFIG) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1: configuring the kernel" @echo "--------------------------------------------------------------" cd ${KRNLCONFDIR}; \ PATH=${TMPPATH} \ config ${CONFIGARGS} -d ${KRNLOBJDIR}/${_kernel} \ -I '${KERNCONFDIR}' '${KERNCONFDIR}/${_kernel}' .endif .if !defined(NO_CLEAN) && !defined(NO_KERNELCLEAN) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.1: cleaning up the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} ${CLEANDIR} .else ${_+_}cd ${.CURDIR}; ${WMAKE} _cleankernobj_fast_depend_hack .endif .if !defined(NO_KERNELOBJ) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.2: rebuilding the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} obj .endif @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.3: build tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${KTMAKE} kernel-tools @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3.1: building everything" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} all -DNO_MODULES_OBJ @echo "--------------------------------------------------------------" @echo ">>> Kernel build for ${_kernel} completed on `LC_ALL=C date`" @echo "--------------------------------------------------------------" .endfor @seconds=$$(($$(date '+%s') - ${_BUILDKERNEL_START})); \ echo -n ">>> Kernel(s) ${BUILDKERNELS} built in $$seconds seconds, "; \ echo "ncpu: $$(sysctl -n hw.ncpu)${.MAKE.JOBS:S/^/, make -j/}" @echo "--------------------------------------------------------------" NO_INSTALLEXTRAKERNELS?= yes # # installkernel, etc. 
# # Install the kernel defined by INSTALLKERNEL # installkernel installkernel.debug \ reinstallkernel reinstallkernel.debug: _installcheck_kernel .PHONY .if !defined(NO_INSTALLKERNEL) .if empty(INSTALLKERNEL) @echo "ERROR: No kernel \"${KERNCONF}\" to install."; \ false .endif @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${INSTALLKERNEL} on $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${INSTALLKERNEL}; \ ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} ${IMAKE_INSTALL} KERNEL=${INSTKERNNAME} ${.TARGET:S/kernel//} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${INSTALLKERNEL} completed on $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${_kernel} $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; \ ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} ${IMAKE_INSTALL} KERNEL=${INSTKERNNAME}.${_kernel} ${.TARGET:S/kernel//} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${_kernel} completed on $$(LC_ALL=C date)" @echo "--------------------------------------------------------------" .endfor .endif distributekernel distributekernel.debug: .PHONY .if !defined(NO_INSTALLKERNEL) .if empty(INSTALLKERNEL) @echo "ERROR: No kernel \"${KERNCONF}\" to install."; \ false .endif mkdir -p ${DESTDIR}/${DISTDIR} .if defined(NO_ROOT) @echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.premeta .endif ${_+_}cd ${KRNLOBJDIR}/${INSTALLKERNEL}; \ ${IMAKEENV} ${IMAKE_INSTALL:S/METALOG/kernel.premeta/} \ ${IMAKE_MTREE} PATH=${TMPPATH} ${MAKE} KERNEL=${INSTKERNNAME} \ DESTDIR=${INSTALL_DDIR}/kernel \ ${.TARGET:S/distributekernel/install/} .if defined(NO_ROOT) @sed -e 's|^./kernel|.|' ${DESTDIR}/${DISTDIR}/kernel.premeta > \ ${DESTDIR}/${DISTDIR}/kernel.meta .endif .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} .if defined(NO_ROOT) @echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta .endif ${_+_}cd ${KRNLOBJDIR}/${_kernel}; \ ${IMAKEENV} ${IMAKE_INSTALL:S/METALOG/kernel.${_kernel}.premeta/} \ ${IMAKE_MTREE} PATH=${TMPPATH} ${MAKE} \ KERNEL=${INSTKERNNAME}.${_kernel} \ DESTDIR=${INSTALL_DDIR}/kernel.${_kernel} \ ${.TARGET:S/distributekernel/install/} .if defined(NO_ROOT) @sed -e "s|^./kernel.${_kernel}|.|" \ ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta > \ ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta .endif .endfor .endif packagekernel: .PHONY .if defined(NO_ROOT) .if !defined(NO_INSTALLKERNEL) cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --exclude '*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.txz .endif .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --include '*/*/*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.meta | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel-dbg.txz .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --exclude '*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta | \ ${XZ_CMD} > 
${PACKAGEDIR}/kernel.${_kernel}.txz .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --include '*/*/*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}-dbg.txz .endif .endfor .endif .else .if !defined(NO_INSTALLKERNEL) cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --exclude '*.debug' . | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.txz .endif .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --include '*/*/*.debug' $$(eval find .) | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel-dbg.txz .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --exclude '*.debug' . | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.${_kernel}.txz .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --include '*/*/*.debug' $$(eval find .) | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}-dbg.txz .endif .endfor .endif .endif stagekernel: .PHONY ${_+_}${MAKE} -C ${.CURDIR} ${.MAKEFLAGS} distributekernel PORTSDIR?= /usr/ports WSTAGEDIR?= ${OBJTOP}/worldstage KSTAGEDIR?= ${OBJTOP}/kernelstage REPODIR?= ${OBJROOT}repo PKGSIGNKEY?= # empty .ORDER: stage-packages create-packages .ORDER: create-packages create-world-packages .ORDER: create-packages create-kernel-packages .ORDER: create-packages sign-packages _pkgbootstrap: .PHONY .if make(*package*) && !exists(${LOCALBASE}/sbin/pkg) @env ASSUME_ALWAYS_YES=YES pkg bootstrap .endif packages: .PHONY ${_+_}${MAKE} -C ${.CURDIR} PKG_VERSION=${PKG_VERSION} real-packages package-pkg: .PHONY rm -rf /tmp/ports.${TARGET} || : env ${WMAKEENV:Q} SRCDIR=${.CURDIR} PORTSDIR=${PORTSDIR} REVISION=${_REVISION} \ PKG_CMD=${PKG_CMD} PKG_VERSION=${PKG_VERSION} REPODIR=${REPODIR} \ WSTAGEDIR=${WSTAGEDIR} \ sh ${.CURDIR}/release/scripts/make-pkg-package.sh real-packages: stage-packages create-packages sign-packages .PHONY stage-packages-world: .PHONY @mkdir -p ${WSTAGEDIR} ${_+_}@cd ${.CURDIR}; \ ${MAKE} DESTDIR=${WSTAGEDIR} -DNO_ROOT stageworld stage-packages-kernel: .PHONY @mkdir -p ${KSTAGEDIR} ${_+_}@cd ${.CURDIR}; \ ${MAKE} DESTDIR=${KSTAGEDIR} -DNO_ROOT stagekernel stage-packages: .PHONY stage-packages-world stage-packages-kernel _repodir: .PHONY @mkdir -p ${REPODIR} create-packages-world: _pkgbootstrap _repodir .PHONY ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 \ DESTDIR=${WSTAGEDIR} \ PKG_VERSION=${PKG_VERSION} create-world-packages create-packages-kernel: _pkgbootstrap _repodir .PHONY ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 \ DESTDIR=${KSTAGEDIR} \ PKG_VERSION=${PKG_VERSION} DISTDIR=kernel \ create-kernel-packages create-packages: .PHONY create-packages-world create-packages-kernel create-world-packages: _pkgbootstrap .PHONY @rm -f ${WSTAGEDIR}/*.plist 2>/dev/null || : @cd ${WSTAGEDIR} ; \ env -i LC_COLLATE=C sort ${WSTAGEDIR}/${DISTDIR}/METALOG | \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk @for plist in ${WSTAGEDIR}/*.plist; do \ plist=$${plist##*/} ; \ pkgname=$${plist%.plist} ; \ echo "_PKGS+= $${pkgname}" ; \ done > ${WSTAGEDIR}/packages.mk ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 create-world-packages-jobs \ .MAKE.JOB.PREFIX= .if make(create-world-packages-jobs) .include "${WSTAGEDIR}/packages.mk" .endif create-world-packages-jobs: .PHONY .for pkgname in ${_PKGS} create-world-packages-jobs: create-world-package-${pkgname} create-world-package-${pkgname}: .PHONY @sh 
${SRCDIR}/release/packages/generate-ucl.sh -o ${pkgname} \ -s ${SRCDIR} -u ${WSTAGEDIR}/${pkgname}.ucl @awk -F\" ' \ /^name/ { printf("===> Creating %s-", $$2); next } \ /^version/ { print $$2; next } \ ' ${WSTAGEDIR}/${pkgname}.ucl @if [ "${pkgname}" == "runtime" ]; then \ sed -i '' -e "s/%VCS_REVISION%/${VCS_REVISION}/" ${WSTAGEDIR}/${pkgname}.ucl ; \ fi ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ create -M ${WSTAGEDIR}/${pkgname}.ucl \ -p ${WSTAGEDIR}/${pkgname}.plist \ -r ${WSTAGEDIR} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} .endfor _default_flavor= -default .if make(*package*) && exists(${KSTAGEDIR}/kernel.meta) . if ${MK_DEBUG_FILES} != "no" _debug=-debug . endif create-kernel-packages: .PHONY . for flavor in "" ${_debug} create-kernel-packages: create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},} create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},}: _pkgbootstrap .PHONY @cd ${KSTAGEDIR}/${DISTDIR} ; \ env -i LC_COLLATE=C sort ${KSTAGEDIR}/kernel.meta | \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk \ -v kernel=yes -v _kernconf=${INSTALLKERNEL} ; \ sed -e "s/%VERSION%/${PKG_VERSION}/" \ -e "s/%PKGNAME%/kernel-${INSTALLKERNEL:tl}${flavor}/" \ -e "s/%KERNELDIR%/kernel/" \ -e "s/%COMMENT%/FreeBSD ${INSTALLKERNEL} kernel ${flavor}/" \ -e "s/%DESC%/FreeBSD ${INSTALLKERNEL} kernel ${flavor}/" \ -e "s/ %VCS_REVISION%/${VCS_REVISION}/" \ ${SRCDIR}/release/packages/kernel.ucl \ > ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \ awk -F\" ' \ /name/ { printf("===> Creating %s-", $$2); next } \ /version/ {print $$2; next } ' \ ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ create -M ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl \ -p ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.plist \ -r ${KSTAGEDIR}/${DISTDIR} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} . endfor .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" . for _kernel in ${BUILDKERNELS:[2..-1]} . if exists(${KSTAGEDIR}/kernel.${_kernel}.meta) . if ${MK_DEBUG_FILES} != "no" _debug=-debug . endif . 
for flavor in "" ${_debug} create-kernel-packages: create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kernel} create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kernel}: _pkgbootstrap .PHONY @cd ${KSTAGEDIR}/kernel.${_kernel} ; \ env -i LC_COLLATE=C sort ${KSTAGEDIR}/kernel.${_kernel}.meta | \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk \ -v kernel=yes -v _kernconf=${_kernel} ; \ sed -e "s/%VERSION%/${PKG_VERSION}/" \ -e "s/%PKGNAME%/kernel-${_kernel:tl}${flavor}/" \ -e "s/%KERNELDIR%/kernel.${_kernel}/" \ -e "s/%COMMENT%/FreeBSD ${_kernel} kernel ${flavor}/" \ -e "s/%DESC%/FreeBSD ${_kernel} kernel ${flavor}/" \ -e "s/ %VCS_REVISION%/${VCS_REVISION}/" \ ${SRCDIR}/release/packages/kernel.ucl \ > ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \ awk -F\" ' \ /name/ { printf("===> Creating %s-", $$2); next } \ /version/ {print $$2; next } ' \ ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname -o ALLOW_BASE_SHLIBS=yes \ create -M ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl \ -p ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.plist \ -r ${KSTAGEDIR}/kernel.${_kernel} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} . endfor . endif . endfor .endif sign-packages: _pkgbootstrap .PHONY @[ -L "${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/latest" ] && \ unlink ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/latest ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname repo \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} \ ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI)/${PKG_VERSION} \ ${PKGSIGNKEY} ; \ cd ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/usr/bin/uname config ABI); \ ln -s ${PKG_VERSION} latest # # # checkworld # # Run test suite on installed world. # checkworld: .PHONY @if [ ! -x "${LOCALBASE}/bin/kyua" ]; then \ echo "You need kyua (devel/kyua) to run the test suite." | /usr/bin/fmt; \ exit 1; \ fi ${_+_}PATH="$$PATH:${LOCALBASE}/bin" kyua test -k ${TESTSBASE}/Kyuafile # # # doxygen # # Build the API documentation with doxygen # doxygen: .PHONY @if [ ! -x "${LOCALBASE}/bin/doxygen" ]; then \ echo "You need doxygen (devel/doxygen) to generate the API documentation of the kernel." | /usr/bin/fmt; \ exit 1; \ fi ${_+_}cd ${.CURDIR}/tools/kerneldoc/subsys; ${MAKE} obj all # # update # # Update the source tree(s), by running svn/svnup to update to the # latest copy. # update: .PHONY .if defined(SVN_UPDATE) @echo "--------------------------------------------------------------" @echo ">>> Updating ${.CURDIR} using Subversion" @echo "--------------------------------------------------------------" @(cd ${.CURDIR}; ${SVN_CMD} update ${SVNFLAGS}) .endif # # ------------------------------------------------------------------------ # # From here onwards are utility targets used by the 'make world' and # related targets. If your 'world' breaks, you may like to try to fix # the problem and manually run the following targets to attempt to # complete the build. Beware, this is *not* guaranteed to work, you # need to have a pretty good grip on the current state of the system # to attempt to manually finish it. If in doubt, 'make world' again. # # # legacy: Build compatibility shims for the next three targets. 
This is a # minimal set of tools and shims necessary to compensate for older systems # which don't have the APIs required by the targets built in bootstrap-tools, # build-tools or cross-tools. # # libnv and libl are both requirements for config(8), which is an unconditional # bootstrap-tool. _config_deps= lib/libnv usr.bin/lex/lib legacy: .PHONY .if ${BOOTSTRAPPING} < ${MINIMUM_SUPPORTED_OSREL} && ${BOOTSTRAPPING} != 0 @echo "ERROR: Source upgrades from versions prior to ${MINIMUM_SUPPORTED_REL} are not supported."; \ false .endif .for _tool in tools/build ${_config_deps} ${_+_}@${ECHODIR} "===> ${_tool} (obj,includes,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${WORLDTMP}/legacy includes; \ ${MAKE} DIRPRFX=${_tool}/ MK_INCLUDES=no all; \ ${MAKE} DIRPRFX=${_tool}/ MK_INCLUDES=no \ DESTDIR=${WORLDTMP}/legacy install .endfor # # bootstrap-tools: Build tools needed for compatibility. These are binaries that # are built to build other binaries in the system. However, the focus of these # binaries is usually quite narrow. Bootstrap tools use the host's compiler and # libraries, augmented by -legacy, in addition to the libraries built during # bootstrap-tools. # _bt= _bootstrap-tools # We want to run the build with only ${WORLDTMP} in $PATH to ensure we don't # accidentally run tools that are incompatible but happen to be in $PATH. # This is especially important when building on Linux/MacOS where many of the # programs used during the build accept different flags or generate different # output. On those platforms we only symlink the tools known to be compatible # (e.g. basic utilities such as mkdir) into ${WORLDTMP} and build all others # from the FreeBSD sources during the bootstrap-tools stage. # We want to build without the user's $PATH starting in the bootstrap-tools # phase so the tools used in that phase (ln, cp, etc) must have already been # linked to $WORLDTMP. The tools are listed in the _host_tools_to_symlink # variable in tools/build/Makefile and are linked during the legacy phase. # Since they could be Linux or MacOS binaries, too we must only use flags that # are portable across operating systems. # If BOOTSTRAP_ALL_TOOLS is set we will build all the required tools from the # current source tree. Otherwise we create a symlink to the version found in # $PATH during the bootstrap-tools stage. .if defined(BOOTSTRAP_ALL_TOOLS) # BOOTSTRAPPING will be set on the command line so we can't override it here. # Instead set BOOTSTRAPPING_OSRELDATE so that the value 0 is set ${BSARGS} BOOTSTRAPPING_OSRELDATE:= 0 .endif .if ${MK_GAMES} != "no" _strfile= usr.bin/fortune/strfile .endif .if ${MK_GCC} != "no" && ${MK_CXX} != "no" _gperf= gnu/usr.bin/gperf .endif .if ${MK_VT} != "no" _vtfontcvt= usr.bin/vtfontcvt .endif # If we are not building the bootstrap because BOOTSTRAPPING is sufficient # we symlink the host version to $WORLDTMP instead. By doing this we can also # detect when a bootstrap tool is being used without the required MK_FOO. # If you add a new bootstrap tool where we could also use the host version, # please ensure that you also add a .else case where you add the tool to the # _bootstrap_tools_links variable. .if ${BOOTSTRAPPING} < 1000033 _m4= usr.bin/m4 _lex= usr.bin/lex # Note: lex needs m4 to build but m4 also depends on lex. However, lex can be # bootstrapped so we build lex first. 
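# Purely illustrative sketch of the pattern described above (the tool "foo"
# and the version cutoff 1300999 are hypothetical): bootstrap the tool on old
# hosts, otherwise symlink the host binary into WORLDTMP:
#   .if ${BOOTSTRAPPING} < 1300999
#   _foo=	usr.bin/foo		# host foo lacks the needed feature
#   .else
#   _bootstrap_tools_links+=foo	# host foo is new enough; just link it
#   .endif
# (${_foo} would then also be listed in the bootstrap-tools loop below.)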
${_bt}-usr.bin/m4: ${_bt}-lib/libopenbsd ${_bt}-usr.bin/yacc ${_bt}-${_lex} _bt_m4_depend=${_bt}-${_m4} _bt_lex_depend=${_bt}-${_lex} ${_bt_m4_depend} .else _bootstrap_tools_links+=m4 lex .endif # ELF Tool Chain libraries are needed for ELF tools and dtrace tools. # r296685 fix cross-endian objcopy # r310724 fixed PR 215350, a crash in libdwarf with objects built by GCC 6.2. # r334881 added libdwarf constants used by ctfconvert. # r338478 fixed a crash in objcopy for mips64el objects # r339083 libelf: correct mips64el test to use ELF header # r348347 Add missing powerpc64 relocation support to libdwarf .if ${BOOTSTRAPPING} < 1300030 _elftoolchain_libs= lib/libelf lib/libdwarf ${_bt}-lib/libelf: ${_bt_m4_depend} ${_bt}-lib/libdwarf: ${_bt_m4_depend} .endif # r245440 mtree -N support added # r313404 requires sha384.h for libnetbsd, added to libmd in r292782 .if ${BOOTSTRAPPING} < 1100093 _nmtree= lib/libmd \ lib/libnetbsd \ usr.sbin/nmtree ${_bt}-lib/libnetbsd: ${_bt}-lib/libmd ${_bt}-usr.sbin/nmtree: ${_bt}-lib/libnetbsd .else _bootstrap_tools_links+=mtree .endif # r246097: log addition login.conf.db, passwd, pwd.db, and spwd.db with cat -l .if ${BOOTSTRAPPING} < 1000027 _cat= bin/cat .else _bootstrap_tools_links+=cat .endif # r277259 crunchide: Correct 64-bit section header offset # r281674 crunchide: always include both 32- and 64-bit ELF support .if ${BOOTSTRAPPING} < 1100078 _crunchide= usr.sbin/crunch/crunchide .else _bootstrap_tools_links+=crunchide .endif # r285986 crunchen: use STRIPBIN rather than STRIP # 1100113: Support MK_AUTO_OBJ # 1200006: META_MODE fixes .if ${BOOTSTRAPPING} < 1100078 || \ (${MK_AUTO_OBJ} == "yes" && ${BOOTSTRAPPING} < 1100114) || \ (${MK_META_MODE} == "yes" && ${BOOTSTRAPPING} < 1200006) _crunchgen= usr.sbin/crunch/crunchgen .else _bootstrap_tools_links+=crunchgen .endif # r296926 -P keymap search path, MFC to stable/10 in r298297 .if ${BOOTSTRAPPING} < 1003501 || \ (${BOOTSTRAPPING} >= 1100000 && ${BOOTSTRAPPING} < 1100103) _kbdcontrol= usr.sbin/kbdcontrol .else _bootstrap_tools_links+=kbdcontrol .endif _yacc= lib/liby \ usr.bin/yacc ${_bt}-usr.bin/yacc: ${_bt}-lib/liby .if ${MK_BSNMP} != "no" _gensnmptree= usr.sbin/bsnmpd/gensnmptree .endif .if ${MK_LOCALES} != "no" _localedef= usr.bin/localedef .endif # We need to build tblgen when we're building clang or lld, either as # bootstrap tools, or as the part of the normal build. .if ${MK_CLANG_BOOTSTRAP} != "no" || ${MK_CLANG} != "no" || \ ${MK_LLD_BOOTSTRAP} != "no" || ${MK_LLD} != "no" _clang_tblgen= \ lib/clang/libllvmminimal \ usr.bin/clang/llvm-tblgen \ usr.bin/clang/clang-tblgen \ usr.bin/clang/lldb-tblgen # XXX: lldb-tblgen is not needed, if top-level MK_LLDB=no ${_bt}-usr.bin/clang/clang-tblgen: ${_bt}-lib/clang/libllvmminimal ${_bt}-usr.bin/clang/llvm-tblgen: ${_bt}-lib/clang/libllvmminimal ${_bt}-usr.bin/clang/lldb-tblgen: ${_bt}-lib/clang/libllvmminimal .endif # Default to building the GPL DTC, but build the BSDL one if users explicitly # request it. 
_dtc= usr.bin/dtc .if ${MK_GPL_DTC} != "no" _dtc= gnu/usr.bin/dtc .endif .if ${MK_LOCALES} != "no" _localedef= usr.bin/localedef .endif .if ${MK_KERBEROS} != "no" _kerberos5_bootstrap_tools= \ kerberos5/tools/make-roken \ kerberos5/lib/libroken \ kerberos5/lib/libvers \ kerberos5/tools/asn1_compile \ kerberos5/tools/slc \ usr.bin/compile_et .ORDER: ${_kerberos5_bootstrap_tools:C/^/${_bt}-/g} .for _tool in ${_kerberos5_bootstrap_tools} ${_bt}-${_tool}: ${_bt}-usr.bin/yacc ${_bt_lex_depend} .endfor .endif ${_bt}-usr.bin/mandoc: ${_bt}-lib/libopenbsd # The tools listed in _basic_bootstrap_tools will generally not be # bootstrapped unless BOOTSTRAP_ALL_TOOL is set. However, when building on a # Linux or MacOS host the host versions are incompatible so we need to build # them from the source tree. Usually the link name will be the same as the subdir, # but some directories such as grep or test install multiple binaries. In that # case we use the _basic_bootstrap_tools_multilink variable which is a list of # subdirectory and comma-separated list of files. _basic_bootstrap_tools_multilink=usr.bin/grep grep,egrep,fgrep _basic_bootstrap_tools_multilink+=bin/test test,[ # bootstrap tools needed by buildworld: _basic_bootstrap_tools=usr.bin/awk usr.bin/cut bin/expr usr.bin/gencat \ usr.bin/join usr.bin/mktemp bin/rmdir usr.bin/sed usr.bin/sort \ usr.bin/truncate usr.bin/tsort # elf2aout is required for sparc64 build _basic_bootstrap_tools+=usr.bin/elf2aout # file2c is required for building usr.sbin/config: _basic_bootstrap_tools+=usr.bin/file2c # uuencode/uudecode required for share/tabset _basic_bootstrap_tools+=usr.bin/uuencode usr.bin/uudecode # xargs is required by mkioctls _basic_bootstrap_tools+=usr.bin/xargs # cap_mkdb is required for share/termcap: _basic_bootstrap_tools+=usr.bin/cap_mkdb # ldd is required for installcheck (TODO: just always use /usr/bin/ldd instead?) _basic_bootstrap_tools+=usr.bin/ldd # services_mkdb/pwd_mkdb are required for installworld: _basic_bootstrap_tools+=usr.sbin/services_mkdb usr.sbin/pwd_mkdb # sysctl/chflags are required for installkernel: _basic_bootstrap_tools+=sbin/sysctl bin/chflags # mkfifo is used by sys/conf/newvers.sh _basic_bootstrap_tools+=usr.bin/mkfifo .if ${MK_AMD} != "no" # unifdef is only used by usr.sbin/amd/libamu/Makefile _basic_bootstrap_tools+=usr.bin/unifdef .endif .if ${MK_BOOT} != "no" _basic_bootstrap_tools+=bin/dd # xz/unxz is used by EFI _basic_bootstrap_tools_multilink+=usr.bin/xz xz,unxz # md5 is used by boot/beri (and possibly others) _basic_bootstrap_tools+=sbin/md5 .if defined(BOOTSTRAP_ALL_TOOLS) ${_bt}-sbin/md5: ${_bt}-lib/libmd .endif .endif .if ${MK_ZONEINFO} != "no" _basic_bootstrap_tools+=usr.sbin/zic usr.sbin/tzsetup .endif .if defined(BOOTSTRAP_ALL_TOOLS) _other_bootstrap_tools+=${_basic_bootstrap_tools} .for _subdir _links in ${_basic_bootstrap_tools_multilink} _other_bootstrap_tools+=${_subdir} .endfor ${_bt}-usr.bin/awk: ${_bt_lex_depend} ${_bt}-usr.bin/yacc ${_bt}-bin/expr: ${_bt_lex_depend} ${_bt}-usr.bin/yacc # If we are bootstrapping file2c, we have to build it before config: ${_bt}-usr.sbin/config: ${_bt}-usr.bin/file2c ${_bt_lex_depend} # Note: no symlink to make/bmake in the !BOOTSTRAP_ALL_TOOLS case here since # the links to make/bmake make links will have already have been created in the # `make legacy` step. Not adding a link to make is important on non-FreeBSD # since "make" will usually point to GNU make there. 
_other_bootstrap_tools+=usr.bin/bmake .else # All tools in _basic_bootstrap_tools have the same name as the subdirectory # so we can use :T to get the name of the symlinks that we need to create. _bootstrap_tools_links+=${_basic_bootstrap_tools:T} .for _subdir _links in ${_basic_bootstrap_tools_multilink} _bootstrap_tools_links+=${_links:S/,/ /g} .endfor .endif # defined(BOOTSTRAP_ALL_TOOLS) # Link the tools that we need for building but don't need to bootstrap because # the host version is known to be compatible into ${WORLDTMP}/legacy # We do this before building any of the bootstrap tools in case they depend on # the presence of any of the links (e.g. as m4/lex/awk) ${_bt}-links: .PHONY .for _tool in ${_bootstrap_tools_links} ${_bt}-link-${_tool}: .PHONY .MAKE @if [ ! -e "${WORLDTMP}/legacy/bin/${_tool}" ]; then \ source_path=`which ${_tool}`; \ if [ ! -e "$${source_path}" ] ; then \ echo "Cannot find host tool '${_tool}'"; false; \ fi; \ ln -sfnv "$${source_path}" "${WORLDTMP}/legacy/bin/${_tool}"; \ fi ${_bt}-links: ${_bt}-link-${_tool} .endfor bootstrap-tools: ${_bt}-links .PHONY # Please document (add comment) why something is in 'bootstrap-tools'. # Try to bound the building of the bootstrap-tool to just the # FreeBSD versions that need the tool built at this stage of the build. .for _tool in \ ${_clang_tblgen} \ ${_kerberos5_bootstrap_tools} \ ${_strfile} \ ${_gperf} \ ${_dtc} \ ${_cat} \ ${_kbdcontrol} \ ${_elftoolchain_libs} \ usr.bin/lorder \ lib/libopenbsd \ usr.bin/mandoc \ usr.bin/rpcgen \ ${_yacc} \ ${_m4} \ ${_lex} \ ${_other_bootstrap_tools} \ usr.bin/xinstall \ ${_gensnmptree} \ usr.sbin/config \ ${_crunchide} \ ${_crunchgen} \ ${_nmtree} \ ${_vtfontcvt} \ ${_localedef} ${_bt}-${_tool}: ${_bt}-links .PHONY .MAKE ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ if [ "${_tool}" = "usr.bin/lex" ]; then \ ${MAKE} DIRPRFX=${_tool}/ bootstrap; \ fi; \ ${MAKE} DIRPRFX=${_tool}/ all; \ ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${WORLDTMP}/legacy install bootstrap-tools: ${_bt}-${_tool} .endfor # # build-tools: Build special purpose build tools # .if !defined(NO_SHARE) && ${MK_SYSCONS} != "no" _share= share/syscons/scrnmaps .endif .if ${MK_GCC} != "no" _gcc_tools= gnu/usr.bin/cc/cc_tools .endif .if ${MK_RESCUE} != "no" # rescue includes programs that have build-tools targets _rescue=rescue/rescue .endif .if ${MK_TCSH} != "no" _tcsh=bin/csh .endif .if ${MK_FILE} != "no" _libmagic=lib/libmagic .endif .if ${MK_PMC} != "no" && \ (${TARGET_ARCH} == "amd64" || ${TARGET_ARCH} == "i386") _jevents=lib/libpmc/pmu-events .endif # kernel-toolchain skips _cleanobj, so handle cleaning up previous # build-tools directories if needed. 
.if !defined(NO_CLEAN) && make(kernel-toolchain) _bt_clean= ${CLEANDIR} .endif .for _tool in \ ${_tcsh} \ bin/sh \ ${LOCAL_TOOL_DIRS} \ ${_jevents} \ lib/ncurses/ncurses \ lib/ncurses/ncursesw \ ${_rescue} \ ${_share} \ usr.bin/awk \ ${_libmagic} \ usr.bin/mkesdb_static \ usr.bin/mkcsmapper_static \ usr.bin/vi/catalog \ ${_gcc_tools} build-tools_${_tool}: .PHONY ${_+_}@${ECHODIR} "===> ${_tool} (${_bt_clean:D${_bt_clean},}obj,build-tools)"; \ cd ${.CURDIR}/${_tool}; \ if [ -n "${_bt_clean}" ]; then ${MAKE} DIRPRFX=${_tool}/ ${_bt_clean}; fi; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ ${MAKE} DIRPRFX=${_tool}/ build-tools build-tools: build-tools_${_tool} .endfor # # kernel-tools: Build kernel-building tools # kernel-tools: .PHONY mkdir -p ${WORLDTMP}/usr ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${WORLDTMP}/usr >/dev/null # # cross-tools: All the tools needed to build the rest of the system after # we get done with the earlier stages. It is the last set of tools needed # to begin building the target binaries. # .if ${TARGET_ARCH} != ${MACHINE_ARCH} || ${BUILD_WITH_STRICT_TMPPATH} != 0 .if ${TARGET_ARCH} == "amd64" || ${TARGET_ARCH} == "i386" _btxld= usr.sbin/btxld .endif .endif # Rebuild ctfconvert and ctfmerge to avoid difficult-to-diagnose failures # resulting from missing bug fixes or ELF Toolchain updates. .if ${MK_CDDL} != "no" _dtrace_tools= cddl/lib/libctf cddl/usr.bin/ctfconvert \ cddl/usr.bin/ctfmerge .endif # If we're given an XAS, don't build binutils. .if ${XAS:M/*} == "" .if ${MK_BINUTILS_BOOTSTRAP} != "no" _binutils= gnu/usr.bin/binutils .endif .if ${MK_ELFTOOLCHAIN_BOOTSTRAP} != "no" _elftctools= lib/libelftc \ lib/libpe \ usr.bin/objcopy \ usr.bin/nm \ usr.bin/size \ usr.bin/strings # These are not required by the build, but can be useful for developers who # cross-build on a FreeBSD 10 host: _elftctools+= usr.bin/addr2line .endif .elif ${TARGET_ARCH} != ${MACHINE_ARCH} && ${MK_ELFTOOLCHAIN_BOOTSTRAP} != "no" # If cross-building with an external binutils we still need to build strip for # the target (for at least crunchide). _elftctools= lib/libelftc \ lib/libpe \ usr.bin/objcopy .endif .if ${MK_CLANG_BOOTSTRAP} != "no" _clang= usr.bin/clang .endif .if ${MK_LLD_BOOTSTRAP} != "no" _lld= usr.bin/clang/lld .endif .if ${MK_CLANG_BOOTSTRAP} != "no" || ${MK_LLD_BOOTSTRAP} != "no" _clang_libs= lib/clang .endif .if ${MK_GCC_BOOTSTRAP} != "no" _gcc= gnu/usr.bin/cc .endif .if ${MK_USB} != "no" _usb_tools= stand/usb/tools .endif .if ${BUILD_WITH_STRICT_TMPPATH} != 0 || defined(BOOTSTRAP_ALL_TOOLS) _ar=usr.bin/ar .endif cross-tools: .MAKE .PHONY .for _tool in \ ${LOCAL_XTOOL_DIRS} \ ${_ar} \ ${_clang_libs} \ ${_clang} \ ${_lld} \ ${_binutils} \ ${_elftctools} \ ${_dtrace_tools} \ ${_gcc} \ ${_btxld} \ ${_usb_tools} ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ ${MAKE} DIRPRFX=${_tool}/ all; \ ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${WORLDTMP} install .endfor # # native-xtools is the current target for qemu-user cross builds of ports # via poudriere and the imgact_binmisc kernel module. # This target merely builds a toolchan/sysroot, then builds the tools it wants # with the options it wants in a special MAKEOBJDIRPREFIX, using the toolchain # already built. It then installs the static tools to NXBDESTDIR for Poudriere # to pickup. 
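# Illustrative invocation (the target/arch values and jail path are examples
# only); native-xtools-install places the static tools under ${DESTDIR}${NXTP},
# i.e. <DESTDIR>/nxb-bin by default:
#   make native-xtools NXB_TARGET=mips NXB_TARGET_ARCH=mips64
#   make native-xtools-install DESTDIR=/path/to/target/jail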
# NXBOBJROOT= ${OBJROOT}${MACHINE}.${MACHINE_ARCH}/nxb/ NXBOBJTOP= ${NXBOBJROOT}${NXB_TARGET}.${NXB_TARGET_ARCH} NXTP?= /nxb-bin .if ${NXTP:N/*} .error NXTP variable should be an absolute path .endif NXBDESTDIR?= ${DESTDIR}${NXTP} # This is the list of tools to be built/installed as static and where # appropriate to build for the given TARGET.TARGET_ARCH. NXBDIRS+= \ bin/cat \ bin/chmod \ bin/cp \ ${_tcsh} \ bin/echo \ bin/expr \ bin/hostname \ bin/ln \ bin/ls \ bin/mkdir \ bin/mv \ bin/ps \ bin/realpath \ bin/rm \ bin/rmdir \ bin/sh \ bin/sleep \ sbin/md5 \ sbin/sysctl \ usr.bin/addr2line \ usr.bin/ar \ usr.bin/awk \ usr.bin/basename \ usr.bin/bmake \ usr.bin/bzip2 \ usr.bin/cmp \ usr.bin/diff \ usr.bin/dirname \ usr.bin/objcopy \ usr.bin/env \ usr.bin/fetch \ usr.bin/find \ usr.bin/grep \ usr.bin/gzip \ usr.bin/head \ usr.bin/id \ usr.bin/lex \ usr.bin/limits \ usr.bin/lorder \ usr.bin/mandoc \ usr.bin/mktemp \ usr.bin/mt \ usr.bin/nm \ usr.bin/patch \ usr.bin/readelf \ usr.bin/sed \ usr.bin/size \ usr.bin/sort \ usr.bin/strings \ usr.bin/tar \ usr.bin/touch \ usr.bin/tr \ usr.bin/true \ usr.bin/uniq \ usr.bin/unzip \ usr.bin/wc \ usr.bin/xargs \ usr.bin/xinstall \ usr.bin/xz \ usr.bin/yacc \ usr.sbin/chown SUBDIR_DEPEND_usr.bin/clang= lib/clang .if ${MK_CLANG} != "no" NXBDIRS+= lib/clang NXBDIRS+= usr.bin/clang .endif .if ${MK_GCC} != "no" NXBDIRS+= gnu/usr.bin/cc .endif .if ${MK_BINUTILS} != "no" NXBDIRS+= gnu/usr.bin/binutils .endif # XXX: native-xtools passes along ${NXBDIRS} in SUBDIR_OVERRIDE that needs # to be evaluated after NXBDIRS is set. .if make(install) && !empty(SUBDIR_OVERRIDE) SUBDIR= ${SUBDIR_OVERRIDE} .endif NXBMAKEARGS+= \ OBJTOP=${NXBOBJTOP:Q} \ OBJROOT=${NXBOBJROOT:Q} \ MAKEOBJDIRPREFIX= \ -DNO_SHARED \ -DNO_CPU_CFLAGS \ -DNO_PIC \ SSP_CFLAGS= \ MK_CASPER=no \ MK_CLANG_EXTRAS=no \ MK_CLANG_FULL=no \ MK_CTF=no \ MK_DEBUG_FILES=no \ MK_GDB=no \ MK_HTML=no \ MK_LLDB=no \ MK_MAN=no \ MK_MAN_UTILS=yes \ MK_OFED=no \ MK_OPENSSH=no \ MK_PROFILE=no \ MK_RETPOLINE=no \ MK_SENDMAIL=no \ MK_SVNLITE=no \ MK_TESTS=no \ MK_WARNS=no \ MK_ZFS=no .if make(native-xtools*) && \ (!defined(NXB_TARGET) || !defined(NXB_TARGET_ARCH)) .error Missing NXB_TARGET / NXB_TARGET_ARCH .endif # For 'toolchain' we want to produce native binaries that themselves generate # native binaries. NXBTMAKE= ${NXBMAKEENV} ${MAKE} ${NXBMAKEARGS:N-DNO_PIC:N-DNO_SHARED} \ TARGET=${MACHINE} TARGET_ARCH=${MACHINE_ARCH} # For 'everything' we want to produce native binaries (hence -target to # be MACHINE) that themselves generate TARGET.TARGET_ARCH binaries. # TARGET/TARGET_ARCH are still passed along from user. # # Use the toolchain we create as an external toolchain. .if ${USING_SYSTEM_COMPILER} == "yes" || ${XCC:N${CCACHE_BIN}:M/*} NXBMAKE+= XCC="${XCC}" \ XCXX="${XCXX}" \ XCPP="${XCPP}" .else NXBMAKE+= XCC="${NXBOBJTOP}/tmp/usr/bin/cc" \ XCXX="${NXBOBJTOP}/tmp/usr/bin/c++" \ XCPP="${NXBOBJTOP}/tmp/usr/bin/cpp" .endif NXBMAKE+= ${NXBMAKEENV} ${MAKE} -f Makefile.inc1 ${NXBMAKEARGS} \ TARGET=${NXB_TARGET} TARGET_ARCH=${NXB_TARGET_ARCH} \ TARGET_TRIPLE=${MACHINE_TRIPLE:Q} # NXBDIRS is improperly based on MACHINE rather than NXB_TARGET. Need to # invoke a sub-make to reevaluate MK_GCC, etc, for NXBDIRS. NXBMAKE+= SUBDIR_OVERRIDE='$${NXBDIRS:M*}' # Need to avoid the -isystem logic when using clang as an external toolchain # even if the TARGET being built for wants GCC. 
NXBMAKE+= WANT_COMPILER_TYPE='$${X_COMPILER_TYPE}' native-xtools: .PHONY ${_+_}cd ${.CURDIR}; ${NXBTMAKE} _cleanobj MK_GCC=yes # Build the bootstrap/host/cross tools that produce native binaries # Pass along MK_GCC=yes to ensure GCC-needed build tools are built. # We don't quite know what the NXB_TARGET wants so just build it. ${_+_}cd ${.CURDIR}; ${NXBTMAKE} kernel-toolchain MK_GCC=yes # Populate includes/libraries sysroot that produce native binaries. # This is split out from 'toolchain' above mostly so that target LLVM # libraries have a proper LLVM_DEFAULT_TARGET_TRIPLE without # polluting the cross-compiler build. The LLVM/GCC libs are skipped # here to avoid the problem but are kept in 'toolchain' so that # needed build tools are built. ${_+_}cd ${.CURDIR}; ${NXBTMAKE} _includes MK_CLANG=no MK_GCC=no ${_+_}cd ${.CURDIR}; ${NXBTMAKE} _libraries MK_CLANG=no MK_GCC=no # Clean out improper TARGET=MACHINE files ${_+_}cd ${.CURDIR}/gnu/usr.bin/cc/cc_tools; ${NXBTMAKE} cleandir .if !defined(NO_OBJWALK) ${_+_}cd ${.CURDIR}; ${NXBMAKE} _obj .endif ${_+_}cd ${.CURDIR}; ${NXBMAKE} everything @echo ">> native-xtools done. Use 'make native-xtools-install' to install to a given DESTDIR" native-xtools-install: .PHONY mkdir -p ${NXBDESTDIR}/bin ${NXBDESTDIR}/sbin ${NXBDESTDIR}/usr ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${NXBDESTDIR}/usr >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${NXBDESTDIR}/usr/include >/dev/null ${_+_}cd ${.CURDIR}; ${NXBMAKE} \ DESTDIR=${NXBDESTDIR} \ -DNO_ROOT \ install # # hierarchy - ensure that all the needed directories are present # hierarchy hier: .MAKE .PHONY ${_+_}cd ${.CURDIR}/etc; ${HMAKE} distrib-dirs # # libraries - build all libraries, and install them under ${DESTDIR}. # # The list of libraries with dependents (${_prebuild_libs}) and their # interdependencies (__L) are built automatically by the # ${.CURDIR}/tools/make_libdeps.sh script. # libraries: .MAKE .PHONY ${_+_}cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 _prereq_libs; \ ${MAKE} -f Makefile.inc1 _startup_libs; \ ${MAKE} -f Makefile.inc1 _prebuild_libs; \ ${MAKE} -f Makefile.inc1 _generic_libs # # static libgcc.a prerequisite for shared libc # _prereq_libs= lib/libcompiler_rt .if ${MK_SSP} != "no" _prereq_libs+= gnu/lib/libssp/libssp_nonshared .endif # These dependencies are not automatically generated: # # gnu/lib/csu, gnu/lib/libgcc, lib/csu and lib/libc must be built before # all shared libraries for ELF. 
# _startup_libs= lib/csu .if ${MK_BSD_CRTBEGIN} == "no" _startup_libs+= gnu/lib/csu .endif _startup_libs+= lib/libcompiler_rt _startup_libs+= lib/libc _startup_libs+= lib/libc_nonshared .if ${MK_LIBCPLUSPLUS} != "no" _startup_libs+= lib/libcxxrt .endif .if ${MK_LLVM_LIBUNWIND} != "no" _prereq_libs+= lib/libgcc_eh lib/libgcc_s _startup_libs+= lib/libgcc_eh lib/libgcc_s lib/libgcc_s__L: lib/libc__L lib/libgcc_s__L: lib/libc_nonshared__L .if ${MK_LIBCPLUSPLUS} != "no" lib/libcxxrt__L: lib/libgcc_s__L .endif .else # MK_LLVM_LIBUNWIND == no _prereq_libs+= gnu/lib/libgcc _startup_libs+= gnu/lib/libgcc gnu/lib/libgcc__L: lib/libc__L gnu/lib/libgcc__L: lib/libc_nonshared__L .if ${MK_LIBCPLUSPLUS} != "no" lib/libcxxrt__L: gnu/lib/libgcc__L .endif .endif _prebuild_libs= ${_kerberos5_lib_libasn1} \ ${_kerberos5_lib_libhdb} \ ${_kerberos5_lib_libheimbase} \ ${_kerberos5_lib_libheimntlm} \ ${_libsqlite3} \ ${_kerberos5_lib_libheimipcc} \ ${_kerberos5_lib_libhx509} ${_kerberos5_lib_libkrb5} \ ${_kerberos5_lib_libroken} \ ${_kerberos5_lib_libwind} \ lib/libbz2 ${_libcom_err} lib/libcrypt \ lib/libelf lib/libexpat \ lib/libfigpar \ ${_lib_libgssapi} \ lib/libkiconv lib/libkvm lib/liblzma lib/libmd lib/libnv \ lib/libzstd \ ${_lib_casper} \ lib/ncurses/ncurses lib/ncurses/ncursesw \ lib/libopie lib/libpam/libpam ${_lib_libthr} \ ${_lib_libradius} lib/libsbuf lib/libtacplus \ lib/libgeom \ ${_cddl_lib_libumem} ${_cddl_lib_libnvpair} \ ${_cddl_lib_libuutil} \ ${_cddl_lib_libavl} \ ${_cddl_lib_libzfs_core} ${_cddl_lib_libzfs} \ ${_cddl_lib_libctf} \ lib/libufs \ lib/libutil lib/libpjdlog ${_lib_libypclnt} lib/libz lib/msun \ ${_secure_lib_libcrypto} ${_secure_lib_libssl} \ ${_lib_libldns} ${_secure_lib_libssh} .if ${MK_GNUCXX} != "no" _prebuild_libs+= gnu/lib/libstdc++ gnu/lib/libsupc++ gnu/lib/libstdc++__L: lib/msun__L gnu/lib/libsupc++__L: gnu/lib/libstdc++__L .endif .if ${MK_DIALOG} != "no" _prebuild_libs+= gnu/lib/libdialog gnu/lib/libdialog__L: lib/msun__L lib/ncurses/ncursesw__L .endif .if ${MK_LIBCPLUSPLUS} != "no" _prebuild_libs+= lib/libc++ .endif lib/libgeom__L: lib/libexpat__L lib/libkvm__L: lib/libelf__L .if ${MK_LIBTHR} != "no" _lib_libthr= lib/libthr .endif .if ${MK_RADIUS_SUPPORT} != "no" _lib_libradius= lib/libradius .endif .if ${MK_OFED} != "no" _prebuild_libs+= \ lib/ofed/libibverbs \ lib/ofed/libibmad \ lib/ofed/libibumad \ lib/ofed/complib \ lib/ofed/libmlx5 lib/ofed/libibmad__L: lib/ofed/libibumad__L lib/ofed/complib__L: lib/libthr__L lib/ofed/libmlx5__L: lib/ofed/libibverbs__L lib/libthr__L .endif .if ${MK_CASPER} != "no" _lib_casper= lib/libcasper .endif lib/libpjdlog__L: lib/libutil__L lib/libcasper__L: lib/libnv__L lib/liblzma__L: lib/libthr__L +lib/libzstd__L: lib/libthr__L _generic_libs= ${_cddl_lib} gnu/lib ${_kerberos5_lib} lib ${_secure_lib} usr.bin/lex/lib .if ${MK_IPFILTER} != "no" _generic_libs+= sbin/ipf/libipf .endif .for _DIR in ${LOCAL_LIB_DIRS} .if ${_DIR} == ".WAIT" || (empty(_generic_libs:M${_DIR}) && exists(${.CURDIR}/${_DIR}/Makefile)) _generic_libs+= ${_DIR} .endif .endfor lib/libopie__L lib/libtacplus__L: lib/libmd__L .if ${MK_CDDL} != "no" _cddl_lib_libumem= cddl/lib/libumem _cddl_lib_libnvpair= cddl/lib/libnvpair _cddl_lib_libavl= cddl/lib/libavl _cddl_lib_libuutil= cddl/lib/libuutil .if ${MK_ZFS} != "no" _cddl_lib_libzfs_core= cddl/lib/libzfs_core _cddl_lib_libzfs= cddl/lib/libzfs cddl/lib/libzfs_core__L: cddl/lib/libnvpair__L cddl/lib/libzfs__L: cddl/lib/libzfs_core__L lib/msun__L lib/libutil__L cddl/lib/libzfs__L: lib/libthr__L lib/libmd__L lib/libz__L 
cddl/lib/libumem__L cddl/lib/libzfs__L: cddl/lib/libuutil__L cddl/lib/libavl__L lib/libgeom__L lib/libbe__L: cddl/lib/libzfs__L .endif _cddl_lib_libctf= cddl/lib/libctf _cddl_lib= cddl/lib cddl/lib/libctf__L: lib/libz__L .endif # cddl/lib/libdtrace requires lib/libproc and lib/librtld_db; it's only built # on select architectures though (see cddl/lib/Makefile) .if ${MACHINE_CPUARCH} != "sparc64" _prebuild_libs+= lib/libprocstat lib/libproc lib/librtld_db lib/libprocstat__L: lib/libelf__L lib/libkvm__L lib/libutil__L lib/libproc__L: lib/libprocstat__L lib/librtld_db__L: lib/libprocstat__L .endif .if ${MK_CRYPT} != "no" .if ${MK_OPENSSL} != "no" _secure_lib_libcrypto= secure/lib/libcrypto _secure_lib_libssl= secure/lib/libssl lib/libradius__L secure/lib/libssl__L: secure/lib/libcrypto__L secure/lib/libcrypto__L: lib/libthr__L .if ${MK_LDNS} != "no" _lib_libldns= lib/libldns lib/libldns__L: secure/lib/libssl__L .endif .if ${MK_OPENSSH} != "no" _secure_lib_libssh= secure/lib/libssh secure/lib/libssh__L: lib/libz__L secure/lib/libcrypto__L lib/libcrypt__L .if ${MK_LDNS} != "no" secure/lib/libssh__L: lib/libldns__L .endif .if ${MK_GSSAPI} != "no" && ${MK_KERBEROS_SUPPORT} != "no" secure/lib/libssh__L: lib/libgssapi__L kerberos5/lib/libkrb5__L \ kerberos5/lib/libhx509__L kerberos5/lib/libasn1__L lib/libcom_err__L \ lib/libmd__L kerberos5/lib/libroken__L .endif .endif .endif _secure_lib= secure/lib .endif .if ${MK_KERBEROS} != "no" kerberos5/lib/libasn1__L: lib/libcom_err__L kerberos5/lib/libroken__L kerberos5/lib/libhdb__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ kerberos5/lib/libkrb5__L kerberos5/lib/libroken__L \ kerberos5/lib/libwind__L lib/libsqlite3__L kerberos5/lib/libheimntlm__L: secure/lib/libcrypto__L kerberos5/lib/libkrb5__L \ kerberos5/lib/libroken__L lib/libcom_err__L kerberos5/lib/libhx509__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ secure/lib/libcrypto__L kerberos5/lib/libroken__L kerberos5/lib/libwind__L kerberos5/lib/libkrb5__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ lib/libcrypt__L secure/lib/libcrypto__L kerberos5/lib/libhx509__L \ kerberos5/lib/libroken__L kerberos5/lib/libwind__L \ kerberos5/lib/libheimbase__L kerberos5/lib/libheimipcc__L kerberos5/lib/libroken__L: lib/libcrypt__L kerberos5/lib/libwind__L: kerberos5/lib/libroken__L lib/libcom_err__L kerberos5/lib/libheimbase__L: lib/libthr__L kerberos5/lib/libheimipcc__L: kerberos5/lib/libroken__L kerberos5/lib/libheimbase__L lib/libthr__L .endif lib/libsqlite3__L: lib/libthr__L .if ${MK_GSSAPI} != "no" _lib_libgssapi= lib/libgssapi .endif .if ${MK_KERBEROS} != "no" _kerberos5_lib= kerberos5/lib _kerberos5_lib_libasn1= kerberos5/lib/libasn1 _kerberos5_lib_libhdb= kerberos5/lib/libhdb _kerberos5_lib_libheimbase= kerberos5/lib/libheimbase _kerberos5_lib_libkrb5= kerberos5/lib/libkrb5 _kerberos5_lib_libhx509= kerberos5/lib/libhx509 _kerberos5_lib_libroken= kerberos5/lib/libroken _kerberos5_lib_libheimntlm= kerberos5/lib/libheimntlm _libsqlite3= lib/libsqlite3 _kerberos5_lib_libheimipcc= kerberos5/lib/libheimipcc _kerberos5_lib_libwind= kerberos5/lib/libwind _libcom_err= lib/libcom_err .endif .if ${MK_NIS} != "no" _lib_libypclnt= lib/libypclnt .endif .if ${MK_OPENSSL} == "no" lib/libradius__L: lib/libmd__L .endif lib/libproc__L: \ ${_cddl_lib_libctf:D${_cddl_lib_libctf}__L} lib/libelf__L lib/librtld_db__L lib/libutil__L .if ${MK_CXX} != "no" .if ${MK_LIBCPLUSPLUS} != "no" lib/libproc__L: lib/libcxxrt__L .else # This implies MK_GNUCXX != "no"; see lib/libproc lib/libproc__L: gnu/lib/libsupc++__L .endif .endif .for 
_lib in ${_prereq_libs} ${_lib}__PL: .PHONY .MAKE .if !defined(_MKSHOWCONFIG) && exists(${.CURDIR}/${_lib}) ${_+_}@${ECHODIR} "===> ${_lib} (obj,all,install)"; \ cd ${.CURDIR}/${_lib}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ obj; fi; \ ${MAKE} MK_TESTS=no MK_PROFILE=no -DNO_PIC \ DIRPRFX=${_lib}/ all; \ ${MAKE} MK_TESTS=no MK_PROFILE=no -DNO_PIC \ DIRPRFX=${_lib}/ install .endif .endfor .for _lib in ${_startup_libs} ${_prebuild_libs} ${_generic_libs} ${_lib}__L: .PHONY .MAKE .if !defined(_MKSHOWCONFIG) && exists(${.CURDIR}/${_lib}) ${_+_}@${ECHODIR} "===> ${_lib} (obj,all,install)"; \ cd ${.CURDIR}/${_lib}; \ if [ -z "${NO_OBJWALK}" ]; then ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ obj; fi; \ ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ all; \ ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ install .endif .endfor _prereq_libs: ${_prereq_libs:S/$/__PL/} _startup_libs: ${_startup_libs:S/$/__L/} _prebuild_libs: ${_prebuild_libs:S/$/__L/} _generic_libs: ${_generic_libs:S/$/__L/} # Enable SUBDIR_PARALLEL when not calling 'make all', unless called from # 'everything' with _PARALLEL_SUBDIR_OK set. This is because it is unlikely # that running 'make all' from the top-level, especially with a SUBDIR_OVERRIDE # or LOCAL_DIRS set, will have a reliable build if SUBDIRs are built in # parallel. This is safe for the world stage of buildworld though since it has # already built libraries in a proper order and installed includes into # WORLDTMP. Special handling is done for SUBDIR ordering for 'install*' to # avoid trashing a system if it crashes mid-install. .if !make(all) || defined(_PARALLEL_SUBDIR_OK) SUBDIR_PARALLEL= .endif .include .if make(check-old) || make(check-old-dirs) || \ make(check-old-files) || make(check-old-libs) || \ make(delete-old) || make(delete-old-dirs) || \ make(delete-old-files) || make(delete-old-libs) # # check for / delete old files section # .include "ObsoleteFiles.inc" OLD_LIBS_MESSAGE="Please be sure no application still uses those libraries, \ else you can not start such an application. Consult UPDATING for more \ information regarding how to cope with the removal/revision bump of a \ specific library." .if !defined(BATCH_DELETE_OLD_FILES) RM_I=-i .else RM_I=-v .endif delete-old-files: .PHONY @echo ">>> Removing old files (only deletes safe to delete libs)" # Ask for every old file if the user really wants to remove it. # It's annoying, but better safe than sorry. # NB: We cannot pass the list of OLD_FILES as a parameter because the # argument list will get too long. Using .for/.endfor make "loops" will make # the Makefile parser segfault. @exec 3<&0; \ cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_FILES -V "OLD_FILES:Musr/share/*.gz:R" | xargs -n1 | sort | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ chflags noschg "${DESTDIR}/$${file}" 2>/dev/null || true; \ rm ${RM_I} "${DESTDIR}/$${file}" <&3; \ fi; \ for ext in debug symbols; do \ if ! [ -e "${DESTDIR}/$${file}" ] && [ -f \ "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ rm ${RM_I} "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" \ <&3; \ fi; \ done; \ done # Remove catpages without corresponding manpages. @exec 3<&0; \ find ${DESTDIR}/usr/share/man/cat* ! -type d 2>/dev/null | sort | \ sed -ep -e's:${DESTDIR}/usr/share/man/cat:${DESTDIR}/usr/share/man/man:' | \ while read catpage; do \ read manpage; \ if [ ! 
-e "$${manpage}" ]; then \ rm ${RM_I} $${catpage} <&3; \ fi; \ done @echo ">>> Old files removed" check-old-files: .PHONY @echo ">>> Checking for old files" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_FILES -V "OLD_FILES:Musr/share/*.gz:R" | xargs -n1 | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ echo "${DESTDIR}/$${file}"; \ fi; \ for ext in debug symbols; do \ if [ -f "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}"; \ fi; \ done; \ done | sort # Check for catpages without corresponding manpages. @find ${DESTDIR}/usr/share/man/cat* ! -type d 2>/dev/null | \ sed -ep -e's:${DESTDIR}/usr/share/man/cat:${DESTDIR}/usr/share/man/man:' | \ while read catpage; do \ read manpage; \ if [ ! -e "$${manpage}" ]; then \ echo $${catpage}; \ fi; \ done | sort delete-old-libs: .PHONY @echo ">>> Removing old libraries" @echo "${OLD_LIBS_MESSAGE}" | fmt @exec 3<&0; \ cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_LIBS | xargs -n1 | sort | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ chflags noschg "${DESTDIR}/$${file}" 2>/dev/null || true; \ rm ${RM_I} "${DESTDIR}/$${file}" <&3; \ fi; \ for ext in debug symbols; do \ if ! [ -e "${DESTDIR}/$${file}" ] && [ -f \ "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ rm ${RM_I} "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" \ <&3; \ fi; \ done; \ done @echo ">>> Old libraries removed" check-old-libs: .PHONY @echo ">>> Checking for old libraries" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_LIBS | xargs -n1 | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ echo "${DESTDIR}/$${file}"; \ fi; \ for ext in debug symbols; do \ if [ -f "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}"; \ fi; \ done; \ done | sort delete-old-dirs: .PHONY @echo ">>> Removing old directories" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_DIRS | xargs -n1 | sort -r | \ while read dir; do \ if [ -d "${DESTDIR}/$${dir}" ]; then \ rmdir -v "${DESTDIR}/$${dir}" || true; \ elif [ -L "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ if [ -d "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ rmdir -v "${DESTDIR}${DEBUGDIR}/$${dir}" || true; \ elif [ -L "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ done @echo ">>> Old directories removed" check-old-dirs: .PHONY @echo ">>> Checking for old directories" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_DIRS | xargs -n1 | sort -r | \ while read dir; do \ if [ -d "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir}"; \ elif [ -L "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ if [ -d "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir}"; \ elif [ -L "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ done delete-old: delete-old-files delete-old-dirs .PHONY @echo "To remove old libraries run '${MAKE_CMD} delete-old-libs'." 
check-old: check-old-files check-old-libs check-old-dirs .PHONY @echo "To remove old files and directories run '${MAKE_CMD} delete-old'." @echo "To remove old libraries run '${MAKE_CMD} delete-old-libs'." .endif # # showconfig - show build configuration. # showconfig: .PHONY @(${MAKE} -n -f ${.CURDIR}/sys/conf/kern.opts.mk -V dummy -dg1 UPDATE_DEPENDFILE=no NO_OBJ=yes; \ ${MAKE} -n -f ${.CURDIR}/share/mk/src.opts.mk -V dummy -dg1 UPDATE_DEPENDFILE=no NO_OBJ=yes) 2>&1 | grep ^MK_ | sort -u .if !empty(KRNLOBJDIR) && !empty(KERNCONF) DTBOUTPUTPATH= ${KRNLOBJDIR}/${KERNCONF}/ .if !defined(FDT_DTS_FILE) || empty(FDT_DTS_FILE) .if !defined(_MKSHOWCONFIG) && exists(${KERNCONFDIR}/${KERNCONF}) FDT_DTS_FILE!= awk 'BEGIN {FS="="} /^makeoptions[[:space:]]+FDT_DTS_FILE/ {print $$2}' \ '${KERNCONFDIR}/${KERNCONF}' ; echo .endif .endif .endif .if !defined(DTBOUTPUTPATH) || !exists(${DTBOUTPUTPATH}) DTBOUTPUTPATH= ${.CURDIR} .endif # # Build 'standalone' Device Tree Blob # builddtb: .PHONY @PATH=${TMPPATH} MACHINE=${TARGET} \ ${.CURDIR}/sys/tools/fdt/make_dtb.sh ${.CURDIR}/sys \ "${FDT_DTS_FILE}" ${DTBOUTPUTPATH} ############### # cleanworld # In the following, the first 'rm' in a series will usually remove all # files and directories. If it does not, then there are probably some # files with file flags set, so this unsets them and tries the 'rm' a # second time. There are situations where this target will be cleaning # some directories via more than one method, but that duplication is # needed to correctly handle all the possible situations. Removing all # files without file flags set in the first 'rm' instance saves time, # because 'chflags' will need to operate on fewer files afterwards. # # It is expected that BW_CANONICALOBJDIR == the CANONICALOBJDIR as would be # created by bsd.obj.mk, except that we don't want to .include that file # in this makefile. We don't do a cleandir walk if MK_AUTO_OBJ is yes # since it is not possible for files to land in the wrong place. # .if make(cleanworld) BW_CANONICALOBJDIR:=${OBJTOP}/ .elif make(cleanuniverse) BW_CANONICALOBJDIR:=${OBJROOT} .if ${MK_UNIFIED_OBJDIR} == "no" .error ${.TARGETS} only supported with WITH_UNIFIED_OBJDIR enabled. 
.endif .endif cleanworld cleanuniverse: .PHONY .if !empty(BW_CANONICALOBJDIR) && exists(${BW_CANONICALOBJDIR}) && \ ${.CURDIR:tA} != ${BW_CANONICALOBJDIR:tA} -rm -rf ${BW_CANONICALOBJDIR}* -chflags -R 0 ${BW_CANONICALOBJDIR} rm -rf ${BW_CANONICALOBJDIR}* .endif .if make(cleanworld) && ${MK_AUTO_OBJ} == "no" && \ (empty(BW_CANONICALOBJDIR) || ${.CURDIR:tA} == ${BW_CANONICALOBJDIR:tA}) .if ${.CURDIR} == ${.OBJDIR} || ${.CURDIR}/obj == ${.OBJDIR} # To be safe in this case, fall back to a 'make cleandir' ${_+_}@cd ${.CURDIR}; ${MAKE} cleandir .endif .endif .if ${TARGET} == ${MACHINE} && ${TARGET_ARCH} == ${MACHINE_ARCH} XDEV_CPUTYPE?=${CPUTYPE} .else XDEV_CPUTYPE?=${TARGET_CPUTYPE} .endif NOFUN=-DNO_FSCHG MK_HTML=no -DNO_LINT \ MK_MAN=no MK_NLS=no MK_PROFILE=no \ MK_KERBEROS=no MK_RESCUE=no MK_TESTS=no MK_WARNS=no \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ CPUTYPE=${XDEV_CPUTYPE} XDDIR=${TARGET_ARCH}-freebsd XDTP?=/usr/${XDDIR} .if ${XDTP:N/*} .error XDTP variable should be an absolute path .endif CDBOBJROOT= ${OBJROOT}${MACHINE}.${MACHINE_ARCH}/xdev/ CDBOBJTOP= ${CDBOBJROOT}${XDDIR} CDBENV= \ INSTALL="sh ${.CURDIR}/tools/install.sh" CDENV= ${CDBENV} \ TOOLS_PREFIX=${XDTP} CDMAKEARGS= \ OBJTOP=${CDBOBJTOP:Q} \ OBJROOT=${CDBOBJROOT:Q} CD2MAKEARGS= ${CDMAKEARGS} .if ${WANT_COMPILER_TYPE} == gcc || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == gcc) # GCC requires -isystem and -L when using a cross-compiler. --sysroot # won't set header path and -L is used to ensure the base library path # is added before the port PREFIX library path. CD2CFLAGS+= -isystem ${XDDESTDIR}/usr/include -L${XDDESTDIR}/usr/lib # GCC requires -B to find /usr/lib/crti.o when using a cross-compiler # combined with --sysroot. CD2CFLAGS+= -B${XDDESTDIR}/usr/lib # Force using libc++ for external GCC. .if defined(X_COMPILER_TYPE) && \ ${X_COMPILER_TYPE} == gcc && ${X_COMPILER_VERSION} >= 40800 CD2CXXFLAGS+= -isystem ${XDDESTDIR}/usr/include/c++/v1 -std=c++11 \ -nostdinc++ .endif .endif CD2CFLAGS+= --sysroot=${XDDESTDIR}/ CD2ENV=${CDENV} CC="${CC} ${CD2CFLAGS}" CXX="${CXX} ${CD2CXXFLAGS} ${CD2CFLAGS}" \ CPP="${CPP} ${CD2CFLAGS}" \ MACHINE=${TARGET} MACHINE_ARCH=${TARGET_ARCH} CDTMP= ${OBJTOP}/${XDDIR}/tmp CDMAKE=${CDENV} PATH=${CDTMP}/usr/bin:${PATH} ${MAKE} ${CDMAKEARGS} ${NOFUN} CD2MAKE=${CD2ENV} PATH=${CDTMP}/usr/bin:${XDDESTDIR}/usr/bin:${PATH} \ ${MAKE} ${CD2MAKEARGS} ${NOFUN} .if ${MK_META_MODE} != "no" # Don't rebuild build-tools targets during normal build. 
CD2MAKE+= BUILD_TOOLS_META=.NOMETA .endif XDDESTDIR=${DESTDIR}${XDTP} .ORDER: xdev-build xdev-install xdev-links xdev: xdev-build xdev-install .PHONY .ORDER: _xb-worldtmp _xb-bootstrap-tools _xb-build-tools _xb-cross-tools xdev-build: _xb-worldtmp _xb-bootstrap-tools _xb-build-tools _xb-cross-tools .PHONY _xb-worldtmp: .PHONY mkdir -p ${CDTMP}/usr ${WORLDTMP_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${CDTMP}/usr >/dev/null _xb-bootstrap-tools: .PHONY .for _tool in \ ${_clang_tblgen} \ ${_gperf} \ ${_yacc} ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${CDMAKE} DIRPRFX=${_tool}/ obj; fi; \ ${CDMAKE} DIRPRFX=${_tool}/ all; \ ${CDMAKE} DIRPRFX=${_tool}/ DESTDIR=${CDTMP} install .endfor _xb-build-tools: .PHONY ${_+_}@cd ${.CURDIR}; \ ${CDBENV} ${MAKE} ${CDMAKEARGS} -f Makefile.inc1 ${NOFUN} build-tools XDEVDIRS= \ ${_clang_libs} \ ${_lld} \ ${_binutils} \ ${_elftctools} \ usr.bin/ar \ ${_clang} \ ${_gcc} _xb-cross-tools: .PHONY .for _tool in ${XDEVDIRS} ${_+_}@${ECHODIR} "===> xdev ${_tool} (obj,all)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJWALK}" ]; then ${CDMAKE} DIRPRFX=${_tool}/ obj; fi; \ ${CDMAKE} DIRPRFX=${_tool}/ all .endfor _xi-mtree: .PHONY ${_+_}@${ECHODIR} "mtree populating ${XDDESTDIR}" mkdir -p ${XDDESTDIR} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.root.dist \ -p ${XDDESTDIR} >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${XDDESTDIR}/usr >/dev/null ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${XDDESTDIR}/usr/include >/dev/null .if defined(LIBCOMPAT) ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${XDDESTDIR}/usr >/dev/null .endif .if ${MK_TESTS} != "no" mkdir -p ${XDDESTDIR}${TESTSBASE} ${DESTDIR_MTREE} -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${XDDESTDIR}${TESTSBASE} >/dev/null .endif .ORDER: xdev-build _xi-mtree _xi-cross-tools _xi-includes _xi-libraries xdev-install: xdev-build _xi-mtree _xi-cross-tools _xi-includes _xi-libraries .PHONY _xi-cross-tools: .PHONY @echo "_xi-cross-tools" .for _tool in ${XDEVDIRS} ${_+_}@${ECHODIR} "===> xdev ${_tool} (install)"; \ cd ${.CURDIR}/${_tool}; \ ${CDMAKE} DIRPRFX=${_tool}/ install DESTDIR=${XDDESTDIR} .endfor _xi-includes: .PHONY .if !defined(NO_OBJWALK) ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 _obj \ DESTDIR=${XDDESTDIR} .endif ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 includes \ DESTDIR=${XDDESTDIR} _xi-libraries: .PHONY ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 libraries \ DESTDIR=${XDDESTDIR} xdev-links: .PHONY ${_+_}cd ${XDDESTDIR}/usr/bin; \ mkdir -p ../../../../usr/bin; \ for i in *; do \ ln -sf ../../${XDTP}/usr/bin/$$i \ ../../../../usr/bin/${XDDIR}-$$i; \ ln -sf ../../${XDTP}/usr/bin/$$i \ ../../../../usr/bin/${XDDIR}${_REVISION}-$$i; \ done Index: projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c (revision 352586) +++ projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c (revision 352587) @@ -1,3855 +1,3853 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Igor Kozhukhov */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" #include #include #include #include #ifdef __FreeBSD__ extern int zfs_ioctl_version; #endif /* in libzfs_dataset.c */ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); /* We need to use something for ENODATA. */ #define ENODATA EIDRM static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *, const char *); static int guid_to_name(libzfs_handle_t *, const char *, uint64_t, boolean_t, char *); static const zio_cksum_t zero_cksum = { 0 }; typedef struct dedup_arg { int inputfd; int outputfd; libzfs_handle_t *dedup_hdl; } dedup_arg_t; typedef struct progress_arg { zfs_handle_t *pa_zhp; int pa_fd; boolean_t pa_parsable; boolean_t pa_astitle; uint64_t pa_size; } progress_arg_t; typedef struct dataref { uint64_t ref_guid; uint64_t ref_object; uint64_t ref_offset; } dataref_t; typedef struct dedup_entry { struct dedup_entry *dde_next; zio_cksum_t dde_chksum; uint64_t dde_prop; dataref_t dde_ref; } dedup_entry_t; #define MAX_DDT_PHYSMEM_PERCENT 20 #define SMALLEST_POSSIBLE_MAX_DDT_MB 128 typedef struct dedup_table { dedup_entry_t **dedup_hash_array; umem_cache_t *ddecache; uint64_t max_ddt_size; /* max dedup table size in bytes */ uint64_t cur_ddt_size; /* current dedup table size in bytes */ uint64_t ddt_count; int numhashbits; boolean_t ddt_full; } dedup_table_t; static int high_order_bit(uint64_t n) { int count; for (count = 0; n != 0; count++) n >>= 1; return (count); } static size_t ssread(void *buf, size_t len, FILE *stream) { size_t outlen; if ((outlen = fread(buf, len, 1, stream)) == 0) return (0); return (outlen); } static void ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp, zio_cksum_t *cs, uint64_t prop, dataref_t *dr) { dedup_entry_t *dde; if (ddt->cur_ddt_size >= ddt->max_ddt_size) { if (ddt->ddt_full == B_FALSE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Dedup table full. 
Deduplication will continue " "with existing table entries")); ddt->ddt_full = B_TRUE; } return; } if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT)) != NULL) { assert(*ddepp == NULL); dde->dde_next = NULL; dde->dde_chksum = *cs; dde->dde_prop = prop; dde->dde_ref = *dr; *ddepp = dde; ddt->cur_ddt_size += sizeof (dedup_entry_t); ddt->ddt_count++; } } /* * Using the specified dedup table, do a lookup for an entry with * the checksum cs. If found, return the block's reference info * in *dr. Otherwise, insert a new entry in the dedup table, using * the reference information specified by *dr. * * return value: true - entry was found * false - entry was not found */ static boolean_t ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs, uint64_t prop, dataref_t *dr) { uint32_t hashcode; dedup_entry_t **ddepp; hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits); for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL; ddepp = &((*ddepp)->dde_next)) { if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) && (*ddepp)->dde_prop == prop) { *dr = (*ddepp)->dde_ref; return (B_TRUE); } } ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr); return (B_FALSE); } static int dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, zio_cksum_t *zc, int outfd) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); (void) fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } (void) fletcher_4_incremental_native( &drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { (void) fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } /* * This function is started in a separate thread when the dedup option * has been requested. The main send thread determines the list of * snapshots to be included in the send stream and makes the ioctl calls * for each one. But instead of having the ioctl send the output to the * the output fd specified by the caller of zfs_send()), the * ioctl is told to direct the output to a pipe, which is read by the * alternate thread running THIS function. This function does the * dedup'ing by: * 1. building a dedup table (the DDT) * 2. doing checksums on each data block and inserting a record in the DDT * 3. looking for matching checksums, and * 4. sending a DRR_WRITE_BYREF record instead of a write record whenever * a duplicate block is found. * The output of this function then goes to the output fd requested * by the caller of zfs_send(). */ static void * cksummer(void *arg) { dedup_arg_t *dda = arg; char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE); dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; FILE *ofp; int outfd; dedup_table_t ddt; zio_cksum_t stream_cksum; uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); uint64_t numbuckets; ddt.max_ddt_size = MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100, SMALLEST_POSSIBLE_MAX_DDT_MB << 20); numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t)); /* * numbuckets must be a power of 2. Increase number to * a power of 2 if necessary. 
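 * For example, a computed count of 1000 buckets would be rounded up to 1 << high_order_bit(1000) == 1024.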
*/ if (!ISP2(numbuckets)) numbuckets = 1 << high_order_bit(numbuckets); ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *)); ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *); ddt.numhashbits = high_order_bit(numbuckets) - 1; ddt.ddt_full = B_FALSE; outfd = dda->outputfd; ofp = fdopen(dda->inputfd, "r"); while (ssread(drr, sizeof (*drr), ofp) != 0) { /* * kernel filled in checksum, we are going to write same * record, but need to regenerate checksum. */ if (drr->drr_type != DRR_BEGIN) { bzero(&drr->drr_u.drr_checksum.drr_checksum, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } switch (drr->drr_type) { case DRR_BEGIN: { struct drr_begin *drrb = &drr->drr_u.drr_begin; int fflags; int sz = 0; ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); /* set the DEDUP feature flag for this stream */ fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); fflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); if (drr->drr_payloadlen != 0) { sz = drr->drr_payloadlen; if (sz > SPA_MAXBLOCKSIZE) { buf = zfs_realloc(dda->dedup_hdl, buf, SPA_MAXBLOCKSIZE, sz); } (void) ssread(buf, sz, ofp); if (ferror(stdin)) perror("fread"); } if (dump_record(drr, buf, sz, &stream_cksum, outfd) != 0) goto out; break; } case DRR_END: { struct drr_end *drre = &drr->drr_u.drr_end; /* use the recalculated checksum */ drre->drr_checksum = stream_cksum; if (dump_record(drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; break; } case DRR_OBJECT: { struct drr_object *drro = &drr->drr_u.drr_object; if (drro->drr_bonuslen > 0) { (void) ssread(buf, P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), ofp); } if (dump_record(drr, buf, P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), &stream_cksum, outfd) != 0) goto out; break; } case DRR_SPILL: { struct drr_spill *drrs = &drr->drr_u.drr_spill; (void) ssread(buf, drrs->drr_length, ofp); if (dump_record(drr, buf, drrs->drr_length, &stream_cksum, outfd) != 0) goto out; break; } case DRR_FREEOBJECTS: { if (dump_record(drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; break; } case DRR_WRITE: { struct drr_write *drrw = &drr->drr_u.drr_write; dataref_t dataref; uint64_t payload_size; payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); (void) ssread(buf, payload_size, ofp); /* * Use the existing checksum if it's dedup-capable, * else calculate a SHA256 checksum for it. 
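 * (The existing checksum is reused only when it is non-zero and DRR_IS_DEDUP_CAPABLE() reports the record as dedup-capable; otherwise a SHA256 of the payload is computed here.)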
*/ if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum, zero_cksum) || !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) { SHA256_CTX ctx; zio_cksum_t tmpsha256; SHA256Init(&ctx); SHA256Update(&ctx, buf, payload_size); SHA256Final(&tmpsha256, &ctx); drrw->drr_key.ddk_cksum.zc_word[0] = BE_64(tmpsha256.zc_word[0]); drrw->drr_key.ddk_cksum.zc_word[1] = BE_64(tmpsha256.zc_word[1]); drrw->drr_key.ddk_cksum.zc_word[2] = BE_64(tmpsha256.zc_word[2]); drrw->drr_key.ddk_cksum.zc_word[3] = BE_64(tmpsha256.zc_word[3]); drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256; drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP; } dataref.ref_guid = drrw->drr_toguid; dataref.ref_object = drrw->drr_object; dataref.ref_offset = drrw->drr_offset; if (ddt_update(dda->dedup_hdl, &ddt, &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop, &dataref)) { dmu_replay_record_t wbr_drr = {0}; struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref; /* block already present in stream */ wbr_drr.drr_type = DRR_WRITE_BYREF; wbr_drrr->drr_object = drrw->drr_object; wbr_drrr->drr_offset = drrw->drr_offset; wbr_drrr->drr_length = drrw->drr_logical_size; wbr_drrr->drr_toguid = drrw->drr_toguid; wbr_drrr->drr_refguid = dataref.ref_guid; wbr_drrr->drr_refobject = dataref.ref_object; wbr_drrr->drr_refoffset = dataref.ref_offset; wbr_drrr->drr_checksumtype = drrw->drr_checksumtype; wbr_drrr->drr_checksumflags = drrw->drr_checksumtype; wbr_drrr->drr_key.ddk_cksum = drrw->drr_key.ddk_cksum; wbr_drrr->drr_key.ddk_prop = drrw->drr_key.ddk_prop; if (dump_record(&wbr_drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; } else { /* block not previously seen */ if (dump_record(drr, buf, payload_size, &stream_cksum, outfd) != 0) goto out; } break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &drr->drr_u.drr_write_embedded; (void) ssread(buf, P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp); if (dump_record(drr, buf, P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), &stream_cksum, outfd) != 0) goto out; break; } case DRR_FREE: { if (dump_record(drr, NULL, 0, &stream_cksum, outfd) != 0) goto out; break; } default: (void) fprintf(stderr, "INVALID record type 0x%x\n", drr->drr_type); /* should never happen, so assert */ assert(B_FALSE); } } out: umem_cache_destroy(ddt.ddecache); free(ddt.dedup_hash_array); free(buf); (void) fclose(ofp); return (NULL); } /* * Routines for dealing with the AVL tree of fs-nvlists */ typedef struct fsavl_node { avl_node_t fn_node; nvlist_t *fn_nvfs; char *fn_snapname; uint64_t fn_guid; } fsavl_node_t; static int fsavl_compare(const void *arg1, const void *arg2) { const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; return (AVL_CMP(fn1->fn_guid, fn2->fn_guid)); } /* * Given the GUID of a snapshot, find its containing filesystem and * (optionally) name. 
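 * The AVL tree is keyed on the snapshot guid (see fsavl_compare()), so only an exact guid match is returned.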
*/ static nvlist_t * fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname) { fsavl_node_t fn_find; fsavl_node_t *fn; fn_find.fn_guid = snapguid; fn = avl_find(avl, &fn_find, NULL); if (fn) { if (snapname) *snapname = fn->fn_snapname; return (fn->fn_nvfs); } return (NULL); } static void fsavl_destroy(avl_tree_t *avl) { fsavl_node_t *fn; void *cookie; if (avl == NULL) return; cookie = NULL; while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL) free(fn); avl_destroy(avl); free(avl); } /* * Given an nvlist, produce an avl tree of snapshots, ordered by guid */ static avl_tree_t * fsavl_create(nvlist_t *fss) { avl_tree_t *fsavl; nvpair_t *fselem = NULL; if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL) return (NULL); avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t), offsetof(fsavl_node_t, fn_node)); while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) { nvlist_t *nvfs, *snaps; nvpair_t *snapelem = NULL; VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); while ((snapelem = nvlist_next_nvpair(snaps, snapelem)) != NULL) { fsavl_node_t *fn; uint64_t guid; VERIFY(0 == nvpair_value_uint64(snapelem, &guid)); if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) { fsavl_destroy(fsavl); return (NULL); } fn->fn_nvfs = nvfs; fn->fn_snapname = nvpair_name(snapelem); fn->fn_guid = guid; /* * Note: if there are multiple snaps with the * same GUID, we ignore all but one. */ if (avl_find(fsavl, fn, NULL) == NULL) avl_add(fsavl, fn); else free(fn); } } return (fsavl); } /* * Routines for dealing with the giant nvlist of fs-nvlists, etc. */ typedef struct send_data { /* * assigned inside every recursive call, * restored from *_save on return: * * guid of fromsnap snapshot in parent dataset * txg of fromsnap snapshot in current dataset * txg of tosnap snapshot in current dataset */ uint64_t parent_fromsnap_guid; uint64_t fromsnap_txg; uint64_t tosnap_txg; /* the nvlists get accumulated during depth-first traversal */ nvlist_t *parent_snaps; nvlist_t *fss; nvlist_t *snapprops; /* send-receive configuration, does not change during traversal */ const char *fsname; const char *fromsnap; const char *tosnap; boolean_t recursive; boolean_t verbose; /* * The header nvlist is of the following format: * { * "tosnap" -> string * "fromsnap" -> string (if incremental) * "fss" -> { * id -> { * * "name" -> string (full name; for debugging) * "parentfromsnap" -> number (guid of fromsnap in parent) * * "props" -> { name -> value (only if set here) } * "snaps" -> { name (lastname) -> number (guid) } * "snapprops" -> { name (lastname) -> { name -> value } } * * "origin" -> number (guid) (if clone) * "sent" -> boolean (not on-disk) * } * } * } * */ } send_data_t; static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv); static int send_iterate_snap(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; char *snapname; nvlist_t *nv; snapname = strrchr(zhp->zfs_name, '@')+1; if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping snapshot %s because it was created " "after the destination snapshot (%s)\n"), zhp->zfs_name, sd->tosnap); } zfs_close(zhp); return (0); } VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid)); /* * NB: if there is no fromsnap here (it's a newly created fs in * an incremental replication), we will substitute the tosnap. 
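 * (Either way, the matching snapshot's guid becomes parent_fromsnap_guid for this dataset's children.)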
*/ if ((sd->fromsnap && strcmp(snapname, sd->fromsnap) == 0) || (sd->parent_fromsnap_guid == 0 && sd->tosnap && strcmp(snapname, sd->tosnap) == 0)) { sd->parent_fromsnap_guid = guid; } VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); send_iterate_prop(zhp, nv); VERIFY(0 == nvlist_add_nvlist(sd->snapprops, snapname, nv)); nvlist_free(nv); zfs_close(zhp); return (0); } static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(zhp->zfs_props, elem)) != NULL) { char *propname = nvpair_name(elem); zfs_prop_t prop = zfs_name_to_prop(propname); nvlist_t *propnv; if (!zfs_prop_user(propname)) { /* * Realistically, this should never happen. However, * we want the ability to add DSL properties without * needing to make incompatible version changes. We * need to ignore unknown properties to allow older * software to still send datasets containing these * properties, with the unknown properties elided. */ if (prop == ZPROP_INVAL) continue; if (zfs_prop_readonly(prop)) continue; } verify(nvpair_value_nvlist(elem, &propnv) == 0); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { char *source; uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) continue; /* * May have no source before SPA_VERSION_RECVD_PROPS, * but is still modifiable. */ if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) == 0) { if ((strcmp(source, zhp->zfs_name) != 0) && (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } } else { char *source; if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) != 0) continue; if ((strcmp(source, zhp->zfs_name) != 0) && (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } if (zfs_prop_user(propname) || zfs_prop_get_type(prop) == PROP_TYPE_STRING) { char *value; verify(nvlist_lookup_string(propnv, ZPROP_VALUE, &value) == 0); VERIFY(0 == nvlist_add_string(nv, propname, value)); } else { uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); VERIFY(0 == nvlist_add_uint64(nv, propname, value)); } } } /* * returns snapshot creation txg * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t txg = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (txg); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG); zfs_close(zhp); } } return (txg); } /* * recursively generate nvlists describing datasets. See comment * for the data structure send_data_t above for description of contents * of the nvlist. 
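 * Per-dataset state (parent_fromsnap_guid and the from/to txg bounds) is saved on entry and restored on exit, so recursing into children does not affect siblings.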
*/ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; nvlist_t *nvfs, *nv; int rv = 0; uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; uint64_t fromsnap_txg_save = sd->fromsnap_txg; uint64_t tosnap_txg_save = sd->tosnap_txg; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t fromsnap_txg, tosnap_txg; char guidstring[64]; fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); if (fromsnap_txg != 0) sd->fromsnap_txg = fromsnap_txg; tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); if (tosnap_txg != 0) sd->tosnap_txg = tosnap_txg; /* * on the send side, if the current dataset does not have tosnap, * perform two additional checks: * * - skip sending the current dataset if it was created later than * the parent tosnap * - return error if the current dataset was created earlier than * the parent tosnap */ if (sd->tosnap != NULL && tosnap_txg == 0) { if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping dataset %s: snapshot %s does " "not exist\n"), zhp->zfs_name, sd->tosnap); } } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s%s: snapshot %s@%s does not " "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? dgettext(TEXT_DOMAIN, " recursively") : "", zhp->zfs_name, sd->tosnap); rv = -1; } goto out; } VERIFY(0 == nvlist_alloc(&nvfs, NV_UNIQUE_NAME, 0)); VERIFY(0 == nvlist_add_string(nvfs, "name", zhp->zfs_name)); VERIFY(0 == nvlist_add_uint64(nvfs, "parentfromsnap", sd->parent_fromsnap_guid)); if (zhp->zfs_dmustats.dds_origin[0]) { zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); if (origin == NULL) { rv = -1; goto out; } VERIFY(0 == nvlist_add_uint64(nvfs, "origin", origin->zfs_dmustats.dds_guid)); } /* iterate over props */ VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); send_iterate_prop(zhp, nv); VERIFY(0 == nvlist_add_nvlist(nvfs, "props", nv)); nvlist_free(nv); /* iterate over snaps, and set sd->parent_fromsnap_guid */ sd->parent_fromsnap_guid = 0; VERIFY(0 == nvlist_alloc(&sd->parent_snaps, NV_UNIQUE_NAME, 0)); VERIFY(0 == nvlist_alloc(&sd->snapprops, NV_UNIQUE_NAME, 0)); (void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd); VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps)); VERIFY(0 == nvlist_add_nvlist(nvfs, "snapprops", sd->snapprops)); nvlist_free(sd->parent_snaps); nvlist_free(sd->snapprops); /* add this fs to nvlist */ (void) snprintf(guidstring, sizeof (guidstring), "0x%llx", (longlong_t)guid); VERIFY(0 == nvlist_add_nvlist(sd->fss, guidstring, nvfs)); nvlist_free(nvfs); /* iterate over children */ if (sd->recursive) rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); out: sd->parent_fromsnap_guid = parent_fromsnap_guid_save; sd->fromsnap_txg = fromsnap_txg_save; sd->tosnap_txg = tosnap_txg_save; zfs_close(zhp); return (rv); } static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, const char *tosnap, boolean_t recursive, boolean_t verbose, nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; int error; zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (EZFS_BADTYPE); VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); sd.fsname = fsname; sd.fromsnap = fromsnap; sd.tosnap = tosnap; sd.recursive = recursive; sd.verbose = verbose; if ((error = send_iterate_fs(zhp, &sd)) != 0) { 
nvlist_free(sd.fss); if (avlp != NULL) *avlp = NULL; *nvlp = NULL; return (error); } if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { nvlist_free(sd.fss); *nvlp = NULL; return (EZFS_NOMEM); } *nvlp = sd.fss; return (0); } /* * Routines specific to "zfs send" */ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t verbose, dryrun, parsable, progress, embed_data, std_out; boolean_t progressastitle; boolean_t large_block, compress; int outfd; boolean_t err; nvlist_t *fss; nvlist_t *snapholds; avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; nvlist_t *debugnv; char holdtag[ZFS_MAX_DATASET_NAME_LEN]; int cleanup_fd; uint64_t size; } send_dump_data_t; static int estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, boolean_t fromorigin, enum lzc_send_flags flags, uint64_t *sizep) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_guid = 1; /* estimate flag */ zc.zc_flags = flags; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot estimate space for '%s'"), zhp->zfs_name); switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } *sizep = zc.zc_objset_type; return (0); } /* * Dumps a backup of the given snapshot (incremental from fromsnap if it's not * NULL) to the file descriptor specified by outfd. 
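 * The heavy lifting is done by the ZFS_IOC_SEND ioctl; this wrapper fills in the zfs_cmd_t, records debug info in debugnv, and maps errno values to libzfs errors.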
*/ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, boolean_t fromorigin, int outfd, enum lzc_send_flags flags, nvlist_t *debugnv) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *thisdbg; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = outfd; zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_flags = flags; VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); if (fromsnap && fromsnap[0] != '\0') { VERIFY(0 == nvlist_add_string(thisdbg, "fromsnap", fromsnap)); } if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno)); if (debugnv) { VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); } nvlist_free(thisdbg); switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: #ifdef illumos case ENOSTR: #endif case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } if (debugnv) VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); nvlist_free(thisdbg); return (0); } static void gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) { assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); /* * zfs_send() only sets snapholds for sends that need them, * e.g. replication and doall. */ if (sdd->snapholds == NULL) return; fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; zfs_cmd_t zc = { 0 }; zfs_handle_t *zhp = pa->pa_zhp; libzfs_handle_t *hdl = zhp->zfs_hdl; unsigned long long bytes, total; char buf[16]; time_t t; struct tm *tm; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (!pa->pa_parsable && !pa->pa_astitle) (void) fprintf(stderr, "TIME SENT SNAPSHOT\n"); /* * Print the progress from ZFS_IOC_SEND_PROGRESS every second. 
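 * zc_cookie carries the send file descriptor into the ioctl and the number of bytes written so far on the way back out.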
*/ for (;;) { (void) sleep(1); zc.zc_cookie = pa->pa_fd; if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) return ((void *)-1); (void) time(&t); tm = localtime(&t); bytes = zc.zc_cookie; if (pa->pa_astitle) { int pct; if (pa->pa_size > bytes) pct = 100 * bytes / pa->pa_size; else pct = 100; setproctitle("sending %s (%d%%: %llu/%llu)", zhp->zfs_name, pct, bytes, pa->pa_size); } else if (pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, bytes, zhp->zfs_name); } else { zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm->tm_hour, tm->tm_min, tm->tm_sec, buf, zhp->zfs_name); } } } static void send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, uint64_t size, boolean_t parsable) { if (parsable) { if (fromsnap != NULL) { (void) fprintf(fout, "incremental\t%s\t%s", fromsnap, tosnap); } else { (void) fprintf(fout, "full\t%s", tosnap); } } else { if (fromsnap != NULL) { if (strchr(fromsnap, '@') == NULL && strchr(fromsnap, '#') == NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from @%s to %s"), fromsnap, tosnap); } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from %s to %s"), fromsnap, tosnap); } } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full send of %s"), tosnap); } } if (parsable) { (void) fprintf(fout, "\t%llu", (longlong_t)size); } else if (size != 0) { char buf[16]; zfs_nicenum(size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, " estimated size is %s"), buf); } (void) fprintf(fout, "\n"); } static int dump_snapshot(zfs_handle_t *zhp, void *arg) { send_dump_data_t *sdd = arg; progress_arg_t pa = { 0 }; pthread_t tid; char *thissnap; enum lzc_send_flags flags = 0; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; FILE *fout = sdd->std_out ? stdout : stderr; uint64_t size = 0; err = 0; thissnap = strchr(zhp->zfs_name, '@') + 1; isfromsnap = (sdd->fromsnap != NULL && strcmp(sdd->fromsnap, thissnap) == 0); if (!sdd->seenfrom && isfromsnap) { gather_holds(zhp, sdd); sdd->seenfrom = B_TRUE; (void) strcpy(sdd->prevsnap, thissnap); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (0); } if (sdd->seento || !sdd->seenfrom) { zfs_close(zhp); return (0); } istosnap = (strcmp(sdd->tosnap, thissnap) == 0); if (istosnap) sdd->seento = B_TRUE; if (sdd->large_block) flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (sdd->embed_data) flags |= LZC_SEND_FLAG_EMBED_DATA; if (sdd->compress) flags |= LZC_SEND_FLAG_COMPRESS; if (!sdd->doall && !isfromsnap && !istosnap) { if (sdd->replicate) { char *snapname; nvlist_t *snapprops; /* * Filter out all intermediate snapshots except origin * snapshots needed to replicate clones. */ nvlist_t *nvfs = fsavl_find(sdd->fsavl, zhp->zfs_dmustats.dds_guid, &snapname); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snapprops", &snapprops)); VERIFY(0 == nvlist_lookup_nvlist(snapprops, thissnap, &snapprops)); exclude = !nvlist_exists(snapprops, "is_clone_origin"); } else { exclude = B_TRUE; } } /* * If a filter function exists, call it to determine whether * this snapshot will be sent. */ if (exclude || (sdd->filter_cb != NULL && sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { /* * This snapshot is filtered out. 
Don't send it, and don't * set prevsnap_obj, so it will be as if this snapshot didn't * exist, and the next accepted snapshot will be sent as * an incremental from the last accepted one, or as the * first (and full) snapshot in the case of a replication, * non-incremental send. */ zfs_close(zhp); return (0); } gather_holds(zhp, sdd); fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); - if (sdd->progress && sdd->dryrun) { + if (sdd->verbose || sdd->progress) { (void) estimate_ioctl(zhp, sdd->prevsnap_obj, fromorigin, flags, &size); sdd->size += size; - } - if (sdd->verbose) { send_print_verbose(fout, zhp->zfs_name, sdd->prevsnap[0] ? sdd->prevsnap : NULL, size, sdd->parsable); } if (!sdd->dryrun) { /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (sdd->progress) { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; pa.pa_size = sdd->size; pa.pa_astitle = sdd->progressastitle; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { zfs_close(zhp); return (err); } } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); if (sdd->progress) { (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); } } (void) strcpy(sdd->prevsnap, thissnap); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (err); } static int dump_filesystem(zfs_handle_t *zhp, void *arg) { int rv = 0; send_dump_data_t *sdd = arg; boolean_t missingfrom = B_FALSE; zfs_cmd_t zc = { 0 }; (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); sdd->err = B_TRUE; return (0); } if (sdd->replicate && sdd->fromsnap) { /* * If this fs does not have fromsnap, and we're doing * recursive, we need to send a full stream from the * beginning (or an incremental from the origin if this * is a clone). If we're doing non-recursive, then let * them get the error. */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->fromsnap); if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { missingfrom = B_TRUE; } } sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; sdd->prevsnap_obj = 0; if (sdd->fromsnap == NULL || missingfrom) sdd->seenfrom = B_TRUE; rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg); if (!sdd->seenfrom) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) does not exist\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { if (sdd->fromsnap) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) " "is not earlier than it\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: " "could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); } sdd->err = B_TRUE; } return (rv); } static int dump_filesystems(zfs_handle_t *rzhp, void *arg) { send_dump_data_t *sdd = arg; nvpair_t *fspair; boolean_t needagain, progress; if (!sdd->replicate) return (dump_filesystem(rzhp, sdd)); /* Mark the clone origin snapshots. 
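 * An "is_clone_origin" boolean is added to each origin's snapprops so dump_snapshot() will not filter those snapshots out.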
*/ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *nvfs; uint64_t origin_guid = 0; VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs)); (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); if (origin_guid != 0) { char *snapname; nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, &snapname); if (origin_nv != NULL) { nvlist_t *snapprops; VERIFY(0 == nvlist_lookup_nvlist(origin_nv, "snapprops", &snapprops)); VERIFY(0 == nvlist_lookup_nvlist(snapprops, snapname, &snapprops)); VERIFY(0 == nvlist_add_boolean( snapprops, "is_clone_origin")); } } } again: needagain = progress = B_FALSE; for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist, *parent_nv; char *fsname; zfs_handle_t *zhp; int err; uint64_t origin_guid = 0; uint64_t parent_guid = 0; VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); if (nvlist_lookup_boolean(fslist, "sent") == 0) continue; VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); (void) nvlist_lookup_uint64(fslist, "parentfromsnap", &parent_guid); if (parent_guid != 0) { parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); if (!nvlist_exists(parent_nv, "sent")) { /* parent has not been sent; skip this one */ needagain = B_TRUE; continue; } } if (origin_guid != 0) { nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL); if (origin_nv != NULL && !nvlist_exists(origin_nv, "sent")) { /* * origin has not been sent yet; * skip this clone. */ needagain = B_TRUE; continue; } } zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); err = dump_filesystem(zhp, sdd); VERIFY(nvlist_add_boolean(fslist, "sent") == 0); progress = B_TRUE; zfs_close(zhp); if (err) return (err); } if (needagain) { assert(progress); goto again; } /* clean out the sent flags in case we reuse this fss */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist; VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); (void) nvlist_remove_all(fslist, "sent"); } return (0); } nvlist_t * zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) { unsigned int version; int nread; unsigned long long checksum, packed_len; /* * Decode token header, which is: * -- * Note that the only supported token version is 1. 
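 * (Per the sscanf() below the header is "<version>-<checksum>-<length>-", with the checksum and length in hex, followed by the hex-encoded payload.) */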
*/ nread = sscanf(token, "%u-%llx-%llx-", &version, &checksum, &packed_len); if (nread != 3) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid format)")); return (NULL); } if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid version %u)"), version); return (NULL); } /* convert hexadecimal representation to binary */ token = strrchr(token, '-') + 1; int len = strlen(token) / 2; unsigned char *compressed = zfs_alloc(hdl, len); for (int i = 0; i < len; i++) { nread = sscanf(token + i * 2, "%2hhx", compressed + i); if (nread != 1) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt " "(payload is not hex-encoded)")); return (NULL); } } /* verify checksum */ zio_cksum_t cksum; fletcher_4_native(compressed, len, NULL, &cksum); if (cksum.zc_word[0] != checksum) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (incorrect checksum)")); return (NULL); } /* uncompress */ void *packed = zfs_alloc(hdl, packed_len); uLongf packed_len_long = packed_len; if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || packed_len_long != packed_len) { free(packed); free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (decompression failed)")); return (NULL); } /* unpack nvlist */ nvlist_t *nv; int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); free(packed); free(compressed); if (error != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (nvlist_unpack failed)")); return (NULL); } return (nv); } int zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, const char *resume_token) { char errbuf[1024]; char *toname; char *fromname = NULL; uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; zfs_handle_t *zhp; int error = 0; char name[ZFS_MAX_DATASET_NAME_LEN]; enum lzc_send_flags lzc_flags = 0; uint64_t size = 0; FILE *fout = (flags->verbose && flags->dryrun) ? 
stdout : stderr; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); nvlist_t *resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist */ return (zfs_error(hdl, EZFS_FAULT, errbuf)); } if (flags->verbose) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "resume token contents:\n")); nvlist_print(fout, resume_nvl); } if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt")); return (zfs_error(hdl, EZFS_FAULT, errbuf)); } fromguid = 0; (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok")) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags->compress || nvlist_exists(resume_nvl, "compressok")) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) { if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is no longer the same snapshot used in " "the initial send"), toname); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' used in the initial send no longer exists"), toname); } return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to access '%s'"), name); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } if (fromguid != 0) { if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source %#llx no longer exists"), (longlong_t)fromguid); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } fromname = name; } - if (flags->progress) { + if (flags->progress || flags->verbose) { error = lzc_send_space(zhp->zfs_name, fromname, lzc_flags, &size); if (error == 0) size = MAX(0, (int64_t)(size - bytes)); } if (flags->verbose) { send_print_verbose(fout, zhp->zfs_name, fromname, size, flags->parsable); } if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. 
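 * The thread runs send_progress_thread() and is cancelled and joined once lzc_send_resume() returns.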
*/ if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; pa.pa_size = size; pa.pa_astitle = flags->progressastitle; error = pthread_create(&tid, NULL, send_progress_thread, &pa); if (error != 0) { zfs_close(zhp); return (error); } } error = lzc_send_resume(zhp->zfs_name, fromname, outfd, lzc_flags, resumeobj, resumeoff); if (flags->progress) { (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); } char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); zfs_close(zhp); switch (error) { case 0: return (0); case EXDEV: case ENOENT: case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: #ifdef illumos case ENOSTR: #endif case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } zfs_close(zhp); return (error); } /* * Generate a send stream for the dataset identified by the argument zhp. * * The content of the send stream is the snapshot identified by * 'tosnap'. Incremental streams are requested in two ways: * - from the snapshot identified by "fromsnap" (if non-null) or * - from the origin of the dataset identified by zhp, which must * be a clone. In this case, "fromsnap" is null and "fromorigin" * is TRUE. * * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) * if "replicate" is set. If "doall" is set, dump all the intermediate * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" * case too. If "props" is set, send properties. */ int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { char errbuf[1024]; send_dump_data_t sdd = { 0 }; int err = 0; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; pthread_t tid = 0; int pipefd[2]; dedup_arg_t dda = { 0 }; int featureflags = 0; FILE *fout; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); if (fromsnap && fromsnap[0] == '\0') { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "zero-length incremental source")); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) { uint64_t version; version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); if (version >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } if (flags->dedup && !flags->dryrun) { featureflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); if ((err = pipe(pipefd)) != 0) { zfs_error_aux(zhp->zfs_hdl, strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); } dda.outputfd = outfd; dda.inputfd = pipefd[1]; dda.dedup_hdl = zhp->zfs_hdl; if ((err = pthread_create(&tid, NULL, cksummer, &dda)) != 0) { (void) close(pipefd[0]); (void) close(pipefd[1]); zfs_error_aux(zhp->zfs_hdl, strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } if (flags->replicate || flags->doall || flags->props) { dmu_replay_record_t drr = { 0 }; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { 0 }; if (flags->replicate || flags->props) { nvlist_t *hdrnv; VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); if (fromsnap) { VERIFY(0 == nvlist_add_string(hdrnv, "fromsnap", fromsnap)); } VERIFY(0 == 
nvlist_add_string(hdrnv, "tosnap", tosnap)); if (!flags->replicate) { VERIFY(0 == nvlist_add_boolean(hdrnv, "not_recursive")); } err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, fromsnap, tosnap, flags->replicate, flags->verbose, &fss, &fsavl); if (err) goto err_out; VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); err = nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0); if (debugnvp) *debugnvp = hdrnv; else nvlist_free(hdrnv); if (err) goto stderr_out; } if (!flags->dryrun) { /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. drr_versioninfo, DMU_COMPOUNDSTREAM); DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. drr_versioninfo, featureflags); (void) snprintf(drr.drr_u.drr_begin.drr_toname, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", zhp->zfs_name, tosnap); drr.drr_payloadlen = buflen; err = dump_record(&drr, packbuf, buflen, &zc, outfd); free(packbuf); if (err != 0) goto stderr_out; /* write end record */ bzero(&drr, sizeof (drr)); drr.drr_type = DRR_END; drr.drr_u.drr_end.drr_checksum = zc; err = write(outfd, &drr, sizeof (drr)); if (err == -1) { err = errno; goto stderr_out; } err = 0; } } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; if (tid != 0) sdd.outfd = pipefd[0]; else sdd.outfd = outfd; sdd.replicate = flags->replicate; sdd.doall = flags->doall; sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; sdd.verbose = flags->verbose; sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.progressastitle = flags->progressastitle; sdd.dryrun = flags->dryrun; sdd.large_block = flags->largeblock; sdd.embed_data = flags->embed_data; sdd.compress = flags->compress; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; if (sdd.verbose && sdd.dryrun) sdd.std_out = B_TRUE; fout = sdd.std_out ? stdout : stderr; /* * Some flags require that we place user holds on the datasets that are * being sent so they don't get destroyed during the send. We can skip * this step if the pool is imported read-only since the datasets cannot * be destroyed. */ if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), ZPOOL_PROP_READONLY, NULL) && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS && (flags->doall || flags->replicate)) { ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); if (sdd.cleanup_fd < 0) { err = errno; goto stderr_out; } sdd.snapholds = fnvlist_alloc(); } else { sdd.cleanup_fd = -1; sdd.snapholds = NULL; } - if (flags->progress || sdd.snapholds != NULL) { + if (flags->progress || flags->verbose || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output * or to gather snapshot hold's before generating any data, * then do a non-verbose real run to generate the streams. */ sdd.dryrun = B_TRUE; err = dump_filesystems(zhp, &sdd); if (err != 0) goto stderr_out; if (flags->verbose) { if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)sdd.size); } else { char buf[16]; zfs_nicenum(sdd.size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } } /* Ensure no snaps found is treated as an error. */ if (!sdd.seento) { err = ENOENT; goto err_out; } /* Skip the second run if dryrun was requested. 
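 * (Jumping to err_out still releases fss, fsavl and any gathered snapholds.)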
*/ if (flags->dryrun) goto err_out; if (sdd.snapholds != NULL) { err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); if (err != 0) goto stderr_out; fnvlist_free(sdd.snapholds); sdd.snapholds = NULL; } sdd.dryrun = B_FALSE; sdd.verbose = B_FALSE; } err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); nvlist_free(fss); /* Ensure no snaps found is treated as an error. */ if (err == 0 && !sdd.seento) err = ENOENT; if (tid != 0) { if (err != 0) (void) pthread_cancel(tid); (void) close(pipefd[0]); (void) pthread_join(tid, NULL); } if (sdd.cleanup_fd != -1) { VERIFY(0 == close(sdd.cleanup_fd)); sdd.cleanup_fd = -1; } if (!flags->dryrun && (flags->replicate || flags->doall || flags->props)) { /* * write final end record. NB: want to do this even if * there was some error, because it might not be totally * failed. */ dmu_replay_record_t drr = { 0 }; drr.drr_type = DRR_END; if (write(outfd, &drr, sizeof (drr)) == -1) { return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } } return (err || sdd.err); stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: fsavl_destroy(fsavl); nvlist_free(fss); fnvlist_free(sdd.snapholds); if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); if (tid != 0) { (void) pthread_cancel(tid); (void) close(pipefd[0]); (void) pthread_join(tid, NULL); } return (err); } int zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t flags) { int err = 0; libzfs_handle_t *hdl = zhp->zfs_hdl; enum lzc_send_flags lzc_flags = 0; FILE *fout = (flags.verbose && flags.dryrun) ? stdout : stderr; char errbuf[1024]; if (flags.largeblock) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags.embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags.compress) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (flags.verbose) { uint64_t size = 0; err = lzc_send_space(zhp->zfs_name, from, lzc_flags, &size); if (err == 0) { send_print_verbose(fout, zhp->zfs_name, from, size, flags.parsable); } else { (void) fprintf(stderr, "Cannot estimate send size: " "%s\n", strerror(errno)); } } if (flags.dryrun) return (err); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); err = lzc_send(zhp->zfs_name, from, fd, lzc_flags); if (err != 0) { switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: case ESRCH: if (lzc_exists(zhp->zfs_name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), from); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "target is busy; if a filesystem, " "it must not be mounted")); return (zfs_error(hdl, EZFS_BUSY, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: #ifdef illumos case ENOSTR: #endif case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (err != 0); } /* * Routines specific to "zfs recv" */ static int recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, boolean_t byteswap, zio_cksum_t *zc) { char *cp = buf; int rv; int len = ilen; assert(ilen <= SPA_MAXBLOCKSIZE); do { rv = read(fd, cp, len); cp += rv; len -= rv; } while (rv > 0); if (rv < 0 || len != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to read from stream")); return (zfs_error(hdl, EZFS_BADSTREAM, 
dgettext(TEXT_DOMAIN, "cannot receive"))); } if (zc) { if (byteswap) (void) fletcher_4_incremental_byteswap(buf, ilen, zc); else (void) fletcher_4_incremental_native(buf, ilen, zc); } return (0); } static int recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, boolean_t byteswap, zio_cksum_t *zc) { char *buf; int err; buf = zfs_alloc(hdl, len); if (buf == NULL) return (ENOMEM); err = recv_read(hdl, fd, buf, len, byteswap, zc); if (err != 0) { free(buf); return (err); } err = nvlist_unpack(buf, len, nvp, 0); free(buf); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (malformed nvlist)")); return (EINVAL); } return (0); } static int recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, int baselen, char *newname, recvflags_t *flags) { static int seq; int err; prop_changelist_t *clp; zfs_handle_t *zhp; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (tryname) { (void) strcpy(newname, tryname); if (flags->verbose) { (void) printf("attempting rename %s to %s\n", name, newname); } err = lzc_rename(name, newname); if (err == 0) changelist_rename(clp, name, tryname); } else { err = ENOENT; } if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { seq++; (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%.*srecv-%u-%u", baselen, name, getpid(), seq); if (flags->verbose) { (void) printf("failed - trying rename %s to %s\n", name, newname); } err = lzc_rename(name, newname); if (err == 0) changelist_rename(clp, name, newname); if (err && flags->verbose) { (void) printf("failed (%u) - " "will try again on next pass\n", errno); } err = EAGAIN; } else if (flags->verbose) { if (err == 0) (void) printf("success\n"); else (void) printf("failed (%u)\n", errno); } (void) changelist_postfix(clp); changelist_free(clp); return (err); } static int recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, char *newname, recvflags_t *flags) { int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; boolean_t defer = B_FALSE; int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS) defer = B_TRUE; zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (flags->verbose) (void) printf("attempting destroy %s\n", name); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, name); err = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { err = lzc_destroy(name); } if (err == 0) { if (flags->verbose) (void) printf("success\n"); changelist_remove(clp, name); } (void) changelist_postfix(clp); changelist_free(clp); /* * Deferred destroy might destroy the snapshot or only mark it to be * destroyed later, and it returns success in either case. 
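 * If the snapshot still exists after a deferred destroy, fall back to renaming it out of the way instead.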
*/ if (err != 0 || (defer && zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT))) { err = recv_rename(hdl, name, NULL, baselen, newname, flags); } return (err); } typedef struct guid_to_name_data { uint64_t guid; boolean_t bookmark_ok; char *name; char *skip; } guid_to_name_data_t; static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; const char *slash; int err; if (gtnd->skip != NULL && (slash = strrchr(zhp->zfs_name, '/')) != NULL && strcmp(slash + 1, gtnd->skip) == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); if (err != EEXIST && gtnd->bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } /* * Attempt to find the local dataset associated with this guid. In the case of * multiple matches, we attempt to find the "best" match by searching * progressively larger portions of the hierarchy. This allows one to send a * tree of datasets individually and guarantee that we will find the source * guid within that hierarchy, even if there are multiple matches elsewhere. */ static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, char *name) { char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; gtnd.guid = guid; gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; /* * Search progressively larger portions of the hierarchy, starting * with the filesystem specified by 'parent'. This will * select the "most local" version of the origin snapshot in the case * that there are multiple matching snapshots in the system. */ (void) strlcpy(pname, parent, sizeof (pname)); char *cp = strrchr(pname, '@'); if (cp == NULL) cp = strchr(pname, '\0'); for (; cp != NULL; cp = strrchr(pname, '/')) { /* Chop off the last component and open the parent */ *cp = '\0'; zfs_handle_t *zhp = make_dataset_handle(hdl, pname); if (zhp == NULL) continue; int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd); if (err != EEXIST) err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd); if (err != EEXIST && bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, &gtnd); zfs_close(zhp); if (err == EEXIST) return (0); /* * Remember the last portion of the dataset so we skip it next * time through (as we've already searched that portion of the * hierarchy). */ gtnd.skip = strrchr(pname, '/') + 1; } return (ENOENT); } /* * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if * guid1 is after guid2. 
*/ static int created_before(libzfs_handle_t *hdl, avl_tree_t *avl, uint64_t guid1, uint64_t guid2) { nvlist_t *nvfs; char *fsname, *snapname; char buf[ZFS_MAX_DATASET_NAME_LEN]; int rv; zfs_handle_t *guid1hdl, *guid2hdl; uint64_t create1, create2; if (guid2 == 0) return (0); if (guid1 == 0) return (1); nvfs = fsavl_find(avl, guid1, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid2hdl == NULL) { zfs_close(guid1hdl); return (-1); } create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); if (create1 < create2) rv = -1; else if (create1 > create2) rv = +1; else rv = 0; zfs_close(guid1hdl); zfs_close(guid2hdl); return (rv); } static int recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, nvlist_t *renamed) { nvlist_t *local_nv, *deleted = NULL; avl_tree_t *local_avl; nvpair_t *fselem, *nextfselem; char *fromsnap; char newname[ZFS_MAX_DATASET_NAME_LEN]; char guidname[32]; int error; boolean_t needagain, progress, recursive; char *s1, *s2; VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (flags->dryrun) return (0); again: needagain = progress = B_FALSE; VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0)); if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, recursive, B_FALSE, &local_nv, &local_avl)) != 0) return (error); /* * Process deletes and renames */ for (fselem = nvlist_next_nvpair(local_nv, NULL); fselem; fselem = nextfselem) { nvlist_t *nvfs, *snaps; nvlist_t *stream_nvfs = NULL; nvpair_t *snapelem, *nextsnapelem; uint64_t fromguid = 0; uint64_t originguid = 0; uint64_t stream_originguid = 0; uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; char *fsname, *stream_fsname; nextfselem = nvlist_next_nvpair(local_nv, fselem); VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap", &parent_fromsnap_guid)); (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); /* * First find the stream's fs, so we can check for * a different origin (due to "zfs promote") */ for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { uint64_t thisguid; VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); if (stream_nvfs != NULL) break; } /* check for promote */ (void) nvlist_lookup_uint64(stream_nvfs, "origin", &stream_originguid); if (stream_nvfs && originguid != stream_originguid) { switch (created_before(hdl, local_avl, stream_originguid, originguid)) { case 1: { /* promote it! 
*/ zfs_cmd_t zc = { 0 }; nvlist_t *origin_nvfs; char *origin_fsname; if (flags->verbose) (void) printf("promoting %s\n", fsname); origin_nvfs = fsavl_find(local_avl, originguid, NULL); VERIFY(0 == nvlist_lookup_string(origin_nvfs, "name", &origin_fsname)); (void) strlcpy(zc.zc_value, origin_fsname, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); error = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); if (error == 0) progress = B_TRUE; break; } default: break; case -1: fsavl_destroy(local_avl); nvlist_free(local_nv); return (-1); } /* * We had/have the wrong origin, therefore our * list of snapshots is wrong. Need to handle * them on the next pass. */ needagain = B_TRUE; continue; } for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nextsnapelem) { uint64_t thisguid; char *stream_snapname; nvlist_t *found, *props; nextsnapelem = nvlist_next_nvpair(snaps, snapelem); VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); found = fsavl_find(stream_avl, thisguid, &stream_snapname); /* check for delete */ if (found == NULL) { char name[ZFS_MAX_DATASET_NAME_LEN]; if (!flags->force) continue; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); error = recv_destroy(hdl, name, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%" PRIu64, thisguid); nvlist_add_boolean(deleted, guidname); continue; } stream_nvfs = found; if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops", &props) && 0 == nvlist_lookup_nvlist(props, stream_snapname, &props)) { zfs_cmd_t zc = { 0 }; zc.zc_cookie = B_TRUE; /* received */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", fsname, nvpair_name(snapelem)); if (zcmd_write_src_nvlist(hdl, &zc, props) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } } /* check for different snapname */ if (strcmp(nvpair_name(snapelem), stream_snapname) != 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; char tryname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); (void) snprintf(tryname, sizeof (name), "%s@%s", fsname, stream_snapname); error = recv_rename(hdl, name, tryname, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; } if (strcmp(stream_snapname, fromsnap) == 0) fromguid = thisguid; } /* check for delete */ if (stream_nvfs == NULL) { if (!flags->force) continue; error = recv_destroy(hdl, fsname, strlen(tofs)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%" PRIu64, parent_fromsnap_guid); nvlist_add_boolean(deleted, guidname); continue; } if (fromguid == 0) { if (flags->verbose) { (void) printf("local fs %s does not have " "fromsnap (%s in stream); must have " "been deleted locally; ignoring\n", fsname, fromsnap); } continue; } VERIFY(0 == nvlist_lookup_string(stream_nvfs, "name", &stream_fsname)); VERIFY(0 == nvlist_lookup_uint64(stream_nvfs, "parentfromsnap", &stream_parent_fromsnap_guid)); s1 = strrchr(fsname, '/'); s2 = strrchr(stream_fsname, '/'); /* * Check if we're going to rename based on parent guid change * and the current parent guid was also deleted. If it was then * rename will fail and is likely unneeded, so avoid this and * force an early retry to determine the new * parent_fromsnap_guid. 
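* (The 'deleted' nvlist records, keyed by guid, the snapshots and filesystems destroyed earlier in this pass.)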
*/ if (stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) { sprintf(guidname, "%" PRIu64, parent_fromsnap_guid); if (nvlist_exists(deleted, guidname)) { progress = B_TRUE; needagain = B_TRUE; goto doagain; } } /* * Check for rename. If the exact receive path is specified, it * does not count as a rename, but we still need to check the * datasets beneath it. */ if ((stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || ((flags->isprefix || strcmp(tofs, fsname) != 0) && (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAX_DATASET_NAME_LEN]; parent = fsavl_find(local_avl, stream_parent_fromsnap_guid, NULL); /* * NB: parent might not be found if we used the * tosnap for stream_parent_fromsnap_guid, * because the parent is a newly-created fs; * we'll be able to rename it after we recv the * new fs. */ if (parent != NULL) { char *pname; VERIFY(0 == nvlist_lookup_string(parent, "name", &pname)); (void) snprintf(tryname, sizeof (tryname), "%s%s", pname, strrchr(stream_fsname, '/')); } else { tryname[0] = '\0'; if (flags->verbose) { (void) printf("local fs %s new parent " "not found\n", fsname); } } newname[0] = '\0'; error = recv_rename(hdl, fsname, tryname, strlen(tofs)+1, newname, flags); if (renamed != NULL && newname[0] != '\0') { VERIFY(0 == nvlist_add_boolean(renamed, newname)); } if (error) needagain = B_TRUE; else progress = B_TRUE; } } doagain: fsavl_destroy(local_avl); nvlist_free(local_nv); nvlist_free(deleted); if (needagain && progress) { /* do another pass to fix up temporary names */ if (flags->verbose) (void) printf("another pass:\n"); goto again; } return (needagain); } static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, char **top_zfs, int cleanup_fd, uint64_t *action_handlep) { nvlist_t *stream_nv = NULL; avl_tree_t *stream_avl = NULL; char *fromsnap = NULL; char *sendsnap = NULL; char *cp; char tofs[ZFS_MAX_DATASET_NAME_LEN]; char sendfs[ZFS_MAX_DATASET_NAME_LEN]; char errbuf[1024]; dmu_replay_record_t drre; int error; boolean_t anyerr = B_FALSE; boolean_t softerr = B_FALSE; boolean_t recursive; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == DMU_COMPOUNDSTREAM); /* * Read in the nvlist from the stream. */ if (drr->drr_payloadlen != 0) { error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, &stream_nv, flags->byteswap, zc); if (error) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } } recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (recursive && strchr(destname, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot stream")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } /* * Read in the end record and verify checksum. 
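* The END record's checksum must match the running checksum accumulated while reading the BEGIN record and its nvlist payload.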
*/ if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), flags->byteswap, NULL))) goto out; if (flags->byteswap) { drre.drr_type = BSWAP_32(drre.drr_type); drre.drr_u.drr_end.drr_checksum.zc_word[0] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); drre.drr_u.drr_end.drr_checksum.zc_word[1] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); drre.drr_u.drr_end.drr_checksum.zc_word[2] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); drre.drr_u.drr_end.drr_checksum.zc_word[3] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); } if (drre.drr_type != DRR_END) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incorrect header checksum")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); if (drr->drr_payloadlen != 0) { nvlist_t *stream_fss; VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", &stream_fss)); if ((stream_avl = fsavl_create(stream_fss)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "couldn't allocate avl tree")); error = zfs_error(hdl, EZFS_NOMEM, errbuf); goto out; } if (fromsnap != NULL && recursive) { nvlist_t *renamed = NULL; nvpair_t *pair = NULL; (void) strlcpy(tofs, destname, sizeof (tofs)); if (flags->isprefix) { struct drr_begin *drrb = &drr->drr_u.drr_begin; int i; if (flags->istail) { cp = strrchr(drrb->drr_toname, '/'); if (cp == NULL) { (void) strlcat(tofs, "/", sizeof (tofs)); i = 0; } else { i = (cp - drrb->drr_toname); } } else { i = strcspn(drrb->drr_toname, "/@"); } /* zfs_receive_one() will create_parents() */ (void) strlcat(tofs, &drrb->drr_toname[i], sizeof (tofs)); *strchr(tofs, '@') = '\0'; } if (!flags->dryrun && !flags->nomount) { VERIFY(0 == nvlist_alloc(&renamed, NV_UNIQUE_NAME, 0)); } softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, renamed); /* Unmount renamed filesystems before receiving. */ while ((pair = nvlist_next_nvpair(renamed, pair)) != NULL) { zfs_handle_t *zhp; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, nvpair_name(pair), ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, 0); zfs_close(zhp); if (clp != NULL) { softerr |= changelist_prefix(clp); changelist_free(clp); } } } nvlist_free(renamed); } } /* * Get the fs specified by the first path in the stream (the top level * specified by 'zfs send') and pass it to each invocation of * zfs_receive_one(). */ (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, sizeof (sendfs)); if ((cp = strchr(sendfs, '@')) != NULL) { *cp = '\0'; /* * Find the "sendsnap", the final snapshot in a replication * stream. zfs_receive_one() handles certain errors * differently, depending on if the contained stream is the * last one or not. */ sendsnap = (cp + 1); } /* Finally, receive each contained stream */ do { /* * we should figure out if it has a recoverable * error, in which case do a recv_skip() and drive on. * Note, if we fail due to already having this guid, * zfs_receive_one() will take care of it (ie, * recv_skip() and return 0). */ error = zfs_receive_impl(hdl, destname, NULL, flags, fd, sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, action_handlep, sendsnap); if (error == ENODATA) { error = 0; break; } anyerr |= error; } while (error == 0); if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) { /* * Now that we have the fs's they sent us, try the * renames again. 
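* (The first pass may have left datasets under temporary names because their new parents had not been received yet.)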
*/ softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, NULL); } out: fsavl_destroy(stream_avl); nvlist_free(stream_nv); if (softerr) error = -2; if (anyerr) error = -1; return (error); } static void trunc_prop_errs(int truncated) { ASSERT(truncated != 0); if (truncated == 1) (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "1 more property could not be set\n")); else (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "%d more properties could not be set\n"), truncated); } static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive:")); /* XXX would be great to use lseek if possible... */ drr = buf; while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), byteswap, NULL) == 0) { if (byteswap) drr->drr_type = BSWAP_32(drr->drr_type); switch (drr->drr_type) { case DRR_BEGIN: if (drr->drr_payloadlen != 0) { (void) recv_read(hdl, fd, buf, drr->drr_payloadlen, B_FALSE, NULL); } break; case DRR_END: free(buf); return (0); case DRR_OBJECT: if (byteswap) { drr->drr_u.drr_object.drr_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_bonuslen); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_object.drr_bonuslen, 8), B_FALSE, NULL); break; case DRR_WRITE: if (byteswap) { drr->drr_u.drr_write.drr_logical_size = BSWAP_64( drr->drr_u.drr_write.drr_logical_size); drr->drr_u.drr_write.drr_compressed_size = BSWAP_64( drr->drr_u.drr_write.drr_compressed_size); } uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_SPILL: if (byteswap) { drr->drr_u.drr_spill.drr_length = BSWAP_64(drr->drr_u.drr_spill.drr_length); } (void) recv_read(hdl, fd, buf, drr->drr_u.drr_spill.drr_length, B_FALSE, NULL); break; case DRR_WRITE_EMBEDDED: if (byteswap) { drr->drr_u.drr_write_embedded.drr_psize = BSWAP_32(drr->drr_u.drr_write_embedded. drr_psize); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, 8), B_FALSE, NULL); break; case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } free(buf); return (-1); } static void recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, boolean_t resumable) { char target_fs[ZFS_MAX_DATASET_NAME_LEN]; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "checksum mismatch or incomplete stream")); if (!resumable) return; (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); *strchr(target_fs, '@') = '\0'; zfs_handle_t *zhp = zfs_open(hdl, target_fs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return; char token_buf[ZFS_MAXPROPLEN]; int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (error == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "checksum mismatch or incomplete stream.\n" "Partially received snapshot is saved.\n" "A resuming stream can be generated on the sending " "system by running:\n" " zfs send -t %s"), token_buf); } zfs_close(zhp); } /* * Restores a backup of tosnap from the file descriptor specified by infd. 
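* This handles a single substream; compound (replication) streams are broken up by zfs_receive_package() and reach this function through zfs_receive_impl().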
*/ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, uint64_t *action_handlep, const char *finalsnap) { zfs_cmd_t zc = { 0 }; time_t begin_time; int ioctl_err, ioctl_errno, err; char *cp; struct drr_begin *drrb = &drr->drr_u.drr_begin; char errbuf[1024]; char prop_errbuf[1024]; const char *chopprefix; boolean_t newfs = B_FALSE; boolean_t stream_wantsnewfs; uint64_t parent_snapguid = 0; prop_changelist_t *clp = NULL; nvlist_t *snapprops_nvlist = NULL; zprop_errflags_t prop_errflags; boolean_t recursive; char *snapname = NULL; begin_time = time(NULL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (stream_avl != NULL) { nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, &snapname); nvlist_t *props; int ret; (void) nvlist_lookup_uint64(fs, "parentfromsnap", &parent_snapguid); err = nvlist_lookup_nvlist(fs, "props", &props); if (err) VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); if (flags->canmountoff) { VERIFY(0 == nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0)); } ret = zcmd_write_src_nvlist(hdl, &zc, props); if (err) nvlist_free(props); if (0 == nvlist_lookup_nvlist(fs, "snapprops", &props)) { VERIFY(0 == nvlist_lookup_nvlist(props, snapname, &snapprops_nvlist)); } if (ret != 0) return (-1); } cp = NULL; /* * Determine how much of the snapshot name stored in the stream * we are going to tack on to the name they specified on the * command line, and how much we are going to chop off. * * If they specified a snapshot, chop the entire name stored in * the stream. */ if (flags->istail) { /* * A filesystem was specified with -e. We want to tack on only * the tail of the sent snapshot path. */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -e")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } chopprefix = strrchr(sendfs, '/'); if (chopprefix == NULL) { /* * The tail is the poolname, so we need to * prepend a path separator. */ int len = strlen(drrb->drr_toname); cp = malloc(len + 2); cp[0] = '/'; (void) strcpy(&cp[1], drrb->drr_toname); chopprefix = cp; } else { chopprefix = drrb->drr_toname + (chopprefix - sendfs); } } else if (flags->isprefix) { /* * A filesystem was specified with -d. We want to tack on * everything but the first element of the sent snapshot path * (all but the pool name). */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -d")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } chopprefix = strchr(drrb->drr_toname, '/'); if (chopprefix == NULL) chopprefix = strchr(drrb->drr_toname, '@'); } else if (strchr(tosnap, '@') == NULL) { /* * If a filesystem was specified without -d or -e, we want to * tack on everything after the fs specified by 'zfs send'. */ chopprefix = drrb->drr_toname + strlen(sendfs); } else { /* A snapshot was specified as an exact path (no -d or -e). 
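* In that case nothing from the stream's snapshot name is appended; the destination is used exactly as given on the command line.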
*/ if (recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot " "stream")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); } ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); ASSERT(chopprefix > drrb->drr_toname); ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname)); ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || chopprefix[0] == '\0'); /* * Determine name of destination snapshot, store in zc_value. */ (void) strcpy(zc.zc_value, tosnap); (void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value)); #ifdef __FreeBSD__ if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) zfs_ioctl_version = get_zfs_ioctl_version(); /* * For forward compatibility hide tosnap in zc_value */ if (zfs_ioctl_version < ZFS_IOCVER_LZC) (void) strcpy(zc.zc_value + strlen(zc.zc_value) + 1, tosnap); #endif free(cp); if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) { zcmd_free_nvlists(&zc); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* * Determine the name of the origin snapshot, store in zc_string. */ if (originsnap) { (void) strncpy(zc.zc_string, originsnap, sizeof (zc.zc_string)); if (flags->verbose) (void) printf("using provided clone origin %s\n", zc.zc_string); } else if (drrb->drr_flags & DRR_FLAG_CLONE) { if (guid_to_name(hdl, zc.zc_value, drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), zc.zc_value); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } if (flags->verbose) (void) printf("found clone origin %s\n", zc.zc_string); } boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING; stream_wantsnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; if (stream_wantsnewfs) { /* * if the parent fs does not exist, look for it based on * the parent snap GUID */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive new filesystem stream")); (void) strcpy(zc.zc_name, zc.zc_value); cp = strrchr(zc.zc_name, '/'); if (cp) *cp = '\0'; if (cp && !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { char suffix[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(suffix, strrchr(zc.zc_value, '/')); if (guid_to_name(hdl, zc.zc_name, parent_snapguid, B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, suffix); } } } else { /* * If the fs does not exist, look for it based on the * fromsnap GUID. */ if (resuming) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive resume stream")); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive incremental stream")); } (void) strcpy(zc.zc_name, zc.zc_value); *strchr(zc.zc_name, '@') = '\0'; /* * If the exact receive path was specified and this is the * topmost path in the stream, then if the fs does not exist we * should look no further. 
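* Otherwise we also try to locate the destination by the stream's fromsnap guid using guid_to_name().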
*/ if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + strlen(sendfs)) != '\0' && *chopprefix != '@')) && !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { char snap[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(snap, strchr(zc.zc_value, '@')); if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid, B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, snap); } } } (void) strcpy(zc.zc_name, zc.zc_value); *strchr(zc.zc_name, '@') = '\0'; if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { zfs_handle_t *zhp; /* * Destination fs exists. It must be one of these cases: * - an incremental send stream * - the stream specifies a new fs (full stream or clone) * and they want us to blow away the existing fs (and * have therefore specified -F and removed any snapshots) * - we are resuming a failed receive. */ if (stream_wantsnewfs) { if (!flags->force) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' exists\n" "must specify -F to overwrite it"), zc.zc_name); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has snapshots (eg. %s)\n" "must destroy them to overwrite it"), zc.zc_name); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } } if ((zhp = zfs_open(hdl, zc.zc_name, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { zcmd_free_nvlists(&zc); return (-1); } if (stream_wantsnewfs && zhp->zfs_dmustats.dds_origin[0]) { zcmd_free_nvlists(&zc); zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' is a clone\n" "must destroy it to overwrite it"), zc.zc_name); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && stream_wantsnewfs) { /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0); if (clp == NULL) { zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } if (changelist_prefix(clp) != 0) { changelist_free(clp); zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } } /* * If we are resuming a newfs, set newfs here so that we will * mount it if the recv succeeds this time. We can tell * that it was a newfs on the first recv because the fs * itself will be inconsistent (if the fs existed when we * did the first recv, we would have received it into * .../%recv). */ if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) newfs = B_TRUE; zfs_close(zhp); } else { /* * Destination filesystem does not exist. Therefore we better * be creating a new filesystem (either from a full backup, or * a clone). It would therefore be invalid if the user * specified only the pool name (i.e. if the destination name * contained no slash character). */ if (!stream_wantsnewfs || (cp = strrchr(zc.zc_name, '/')) == NULL) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' does not exist"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* * Trim off the final dataset component so we perform the * recvbackup ioctl to the filesystems's parent. 
*/ *cp = '\0'; if (flags->isprefix && !flags->istail && !flags->dryrun && create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) { zcmd_free_nvlists(&zc); return (zfs_error(hdl, EZFS_BADRESTORE, errbuf)); } newfs = B_TRUE; } zc.zc_begin_record = *drr_noswap; zc.zc_cookie = infd; zc.zc_guid = flags->force; zc.zc_resumable = flags->resumable; if (flags->verbose) { (void) printf("%s %s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", drrb->drr_fromguid ? "incremental" : "full", drrb->drr_toname, zc.zc_value); (void) fflush(stdout); } if (flags->dryrun) { zcmd_free_nvlists(&zc); return (recv_skip(hdl, infd, flags->byteswap)); } zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf; zc.zc_nvlist_dst_size = sizeof (prop_errbuf); zc.zc_cleanup_fd = cleanup_fd; zc.zc_action_handle = *action_handlep; err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc); ioctl_errno = errno; prop_errflags = (zprop_errflags_t)zc.zc_obj; if (err == 0) { nvlist_t *prop_errors; VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, zc.zc_nvlist_dst_size, &prop_errors, 0)); nvpair_t *prop_err = NULL; while ((prop_err = nvlist_next_nvpair(prop_errors, prop_err)) != NULL) { char tbuf[1024]; zfs_prop_t prop; int intval; prop = zfs_name_to_prop(nvpair_name(prop_err)); (void) nvpair_value_int32(prop_err, &intval); if (strcmp(nvpair_name(prop_err), ZPROP_N_MORE_ERRORS) == 0) { trunc_prop_errs(intval); break; } else if (snapname == NULL || finalsnap == NULL || strcmp(finalsnap, snapname) == 0 || strcmp(nvpair_name(prop_err), zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { /* * Skip the special case of, for example, * "refquota", errors on intermediate * snapshots leading up to a final one. * That's why we have all of the checks above. * * See zfs_ioctl.c's extract_delay_props() for * a list of props which can fail on * intermediate snapshots, but shouldn't * affect the overall receive. */ (void) snprintf(tbuf, sizeof (tbuf), dgettext(TEXT_DOMAIN, "cannot receive %s property on %s"), nvpair_name(prop_err), zc.zc_name); zfs_setprop_error(hdl, prop, intval, tbuf); } } nvlist_free(prop_errors); } zc.zc_nvlist_dst = 0; zc.zc_nvlist_dst_size = 0; zcmd_free_nvlists(&zc); if (err == 0 && snapprops_nvlist) { zfs_cmd_t zc2 = { 0 }; (void) strcpy(zc2.zc_name, zc.zc_value); zc2.zc_cookie = B_TRUE; /* received */ if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2); zcmd_free_nvlists(&zc2); } } if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { /* * It may be that this snapshot already exists, * in which case we want to consume & ignore it * rather than failing. */ avl_tree_t *local_avl; nvlist_t *local_nv, *fs; cp = strchr(zc.zc_value, '@'); /* * XXX Do this faster by just iterating over snaps in * this fs. Also if zc_value does not exist, we will * get a strange "does not exist" error message. 
*/ *cp = '\0'; if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE, B_FALSE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); fsavl_destroy(local_avl); nvlist_free(local_nv); if (fs != NULL) { if (flags->verbose) { (void) printf("snap %s already exists; " "ignoring\n", zc.zc_value); } err = ioctl_err = recv_skip(hdl, infd, flags->byteswap); } } *cp = '@'; } if (ioctl_err != 0) { switch (ioctl_errno) { case ENODEV: cp = strchr(zc.zc_value, '@'); *cp = '\0'; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "most recent snapshot of %s does not\n" "match incremental source"), zc.zc_value); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); *cp = '@'; break; case ETXTBSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s has been modified\n" "since most recent snapshot"), zc.zc_name); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); break; case EEXIST: cp = strchr(zc.zc_value, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination already exists")); (void) zfs_error_fmt(hdl, EZFS_EXISTS, dgettext(TEXT_DOMAIN, "cannot restore to %s"), zc.zc_value); *cp = '@'; break; case EINVAL: (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to receive this stream.")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EDQUOT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s space quota exceeded"), zc.zc_name); (void) zfs_error(hdl, EZFS_NOSPC, errbuf); break; default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); } } /* * Mount the target filesystem (if created). Also mount any * children of the target filesystem if we did a replication * receive (indicated by stream_avl being non-NULL). */ cp = strchr(zc.zc_value, '@'); if (cp && (ioctl_err == 0 || !newfs)) { zfs_handle_t *h; *cp = '\0'; h = zfs_open(hdl, zc.zc_value, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (h != NULL) { if (h->zfs_type == ZFS_TYPE_VOLUME) { *cp = '@'; } else if (newfs || stream_avl) { /* * Track the first/top of hierarchy fs, * for mounting and sharing later. 
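* (zfs_receive() mounts and shares *top_zfs once the entire stream has been processed.)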
*/ if (top_zfs && *top_zfs == NULL) *top_zfs = zfs_strdup(hdl, zc.zc_value); } zfs_close(h); } *cp = '@'; } if (clp) { if (!flags->nomount) err |= changelist_postfix(clp); changelist_free(clp); } if (prop_errflags & ZPROP_ERR_NOCLEAR) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to clear unreceived properties on %s"), zc.zc_name); (void) fprintf(stderr, "\n"); } if (prop_errflags & ZPROP_ERR_NORESTORE) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to restore original properties on %s"), zc.zc_name); (void) fprintf(stderr, "\n"); } if (err || ioctl_err) return (-1); *action_handlep = zc.zc_action_handle; if (flags->verbose) { char buf1[64]; char buf2[64]; uint64_t bytes = zc.zc_cookie; time_t delta = time(NULL) - begin_time; if (delta == 0) delta = 1; zfs_nicenum(bytes, buf1, sizeof (buf1)); zfs_nicenum(bytes/delta, buf2, sizeof (buf1)); (void) printf("received %sB stream in %lu seconds (%sB/sec)\n", buf1, delta, buf2); } return (0); } static int zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, uint64_t *action_handlep, const char *finalsnap) { int err; dmu_replay_record_t drr, drr_noswap; struct drr_begin *drrb = &drr.drr_u.drr_begin; char errbuf[1024]; zio_cksum_t zcksum = { 0 }; uint64_t featureflags; int hdrtype; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); if (flags->isprefix && !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " "(%s) does not exist"), tosnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } if (originsnap && !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " "(%s) does not exist"), originsnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* read in the BEGIN record */ if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, &zcksum))) return (err); if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) { /* It's the double end record at the end of a package */ return (ENODATA); } /* the kernel needs the non-byteswapped begin record */ drr_noswap = drr; flags->byteswap = B_FALSE; if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { /* * We computed the checksum in the wrong byteorder in * recv_read() above; do it again correctly. 
*/ bzero(&zcksum, sizeof (zio_cksum_t)); (void) fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); flags->byteswap = B_TRUE; drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_flags = BSWAP_32(drrb->drr_flags); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad magic number)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); if (!DMU_STREAM_SUPPORTED(featureflags) || (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream has unsupported feature, feature flags = %lx"), featureflags); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } if (strchr(drrb->drr_toname, '@') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad snapshot name)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN]; if (sendfs == NULL) { /* * We were not called from zfs_receive_package(). Get * the fs specified by 'zfs send'. */ char *cp; (void) strlcpy(nonpackage_sendfs, drr.drr_u.drr_begin.drr_toname, sizeof (nonpackage_sendfs)); if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) *cp = '\0'; sendfs = nonpackage_sendfs; VERIFY(finalsnap == NULL); } return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, action_handlep, finalsnap)); } else { assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM); return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, &zcksum, top_zfs, cleanup_fd, action_handlep)); } } /* * Restores a backup of tosnap from the file descriptor specified by infd. * Return 0 on total success, -2 if some things couldn't be * destroyed/renamed/promoted, -1 if some things couldn't be received. * (-1 will override -2, if -1 and the resumable flag was specified the * transfer can be resumed if the sending side supports it). 
*/ int zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, recvflags_t *flags, int infd, avl_tree_t *stream_avl) { char *top_zfs = NULL; int err; int cleanup_fd; uint64_t action_handle = 0; char *originsnap = NULL; if (props) { err = nvlist_lookup_string(props, "origin", &originsnap); if (err && err != ENOENT) return (err); } cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); VERIFY(cleanup_fd >= 0); err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, stream_avl, &top_zfs, cleanup_fd, &action_handle, NULL); VERIFY(0 == close(cleanup_fd)); if (err == 0 && !flags->nomount && top_zfs) { zfs_handle_t *zhp; prop_changelist_t *clp; zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, CL_GATHER_MOUNT_ALWAYS, 0); zfs_close(zhp); if (clp != NULL) { /* mount and share received datasets */ err = changelist_postfix(clp); changelist_free(clp); } } if (zhp == NULL || clp == NULL || err) err = -1; } if (top_zfs) free(top_zfs); return (err); } Index: projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs (revision 352586) +++ projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs (revision 352587) Property changes on: projects/clang900-import/cddl/contrib/opensolaris/lib/libzfs ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl/contrib/opensolaris/lib/libzfs:r352537-352586 Index: projects/clang900-import/cddl/contrib/opensolaris =================================================================== --- projects/clang900-import/cddl/contrib/opensolaris (revision 352586) +++ projects/clang900-import/cddl/contrib/opensolaris (revision 352587) Property changes on: projects/clang900-import/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl/contrib/opensolaris:r352537-352586 Index: projects/clang900-import/cddl =================================================================== --- projects/clang900-import/cddl (revision 352586) +++ projects/clang900-import/cddl (revision 352587) Property changes on: projects/clang900-import/cddl ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl:r352537-352586 Index: projects/clang900-import/contrib/ntp/ntpd/ntpd.c =================================================================== --- projects/clang900-import/contrib/ntp/ntpd/ntpd.c (revision 352586) +++ projects/clang900-import/contrib/ntp/ntpd/ntpd.c (revision 352587) @@ -1,1760 +1,1760 @@ /* * ntpd.c - main program for the fixed point NTP daemon */ #ifdef HAVE_CONFIG_H # include #endif #include "ntp_machine.h" #include "ntpd.h" #include "ntp_io.h" #include "ntp_stdlib.h" #include #include "ntp_config.h" #include "ntp_syslog.h" #include "ntp_assert.h" #include "isc/error.h" #include "isc/strerror.h" #include "isc/formatcheck.h" #include "iosignal.h" #ifdef SIM # include "ntpsim.h" #endif #include "ntp_libopts.h" #include "ntpd-opts.h" /* there's a short treatise below what the thread stuff is for. * [Bug 2954] enable the threading warm-up only for Linux. 
*/ #if defined(HAVE_PTHREADS) && HAVE_PTHREADS && !defined(NO_THREADS) # ifdef HAVE_PTHREAD_H # include # endif # if defined(linux) # define NEED_PTHREAD_WARMUP # endif #endif #ifdef HAVE_UNISTD_H # include #endif #ifdef HAVE_SYS_STAT_H # include #endif #include #ifdef HAVE_SYS_PARAM_H # include #endif #ifdef HAVE_SYS_SIGNAL_H # include #else # include #endif #ifdef HAVE_SYS_IOCTL_H # include #endif /* HAVE_SYS_IOCTL_H */ #if defined(HAVE_RTPRIO) # ifdef HAVE_SYS_LOCK_H # include # endif # include #else # ifdef HAVE_PLOCK # ifdef HAVE_SYS_LOCK_H # include # endif # endif #endif #if defined(HAVE_SCHED_SETSCHEDULER) # ifdef HAVE_SCHED_H # include # else # ifdef HAVE_SYS_SCHED_H # include # endif # endif #endif #if defined(HAVE_SYS_MMAN_H) # include #endif #ifdef HAVE_TERMIOS_H # include #endif #ifdef SYS_DOMAINOS # include #endif /* SYS_DOMAINOS */ #include "recvbuff.h" #include "ntp_cmdargs.h" #if 0 /* HMS: I don't think we need this. 961223 */ #ifdef LOCK_PROCESS # ifdef SYS_SOLARIS # include # else # include # endif #endif #endif #ifdef SYS_WINNT # include "ntservice.h" #endif #ifdef _AIX # include #endif /* _AIX */ #ifdef SCO5_CLOCK # include #endif #ifdef HAVE_DROPROOT # include # include # include #ifdef HAVE_LINUX_CAPABILITIES # include # include #endif /* HAVE_LINUX_CAPABILITIES */ #if defined(HAVE_PRIV_H) && defined(HAVE_SOLARIS_PRIVS) # include #endif /* HAVE_PRIV_H */ #if defined(HAVE_TRUSTEDBSD_MAC) # include #endif /* HAVE_TRUSTEDBSD_MAC */ #endif /* HAVE_DROPROOT */ #if defined (LIBSECCOMP) && (KERN_SECCOMP) /* # include */ # include # include #endif /* LIBSECCOMP and KERN_SECCOMP */ #ifdef HAVE_DNSREGISTRATION # include DNSServiceRef mdns; #endif #ifdef HAVE_SETPGRP_0 # define ntp_setpgrp(x, y) setpgrp() #else # define ntp_setpgrp(x, y) setpgrp(x, y) #endif #ifdef HAVE_SOLARIS_PRIVS # define LOWPRIVS "basic,sys_time,net_privaddr,proc_setid,!proc_info,!proc_session,!proc_exec" static priv_set_t *lowprivs = NULL; static priv_set_t *highprivs = NULL; #endif /* HAVE_SOLARIS_PRIVS */ /* * Scheduling priority we run at */ #define NTPD_PRIO (-12) int priority_done = 2; /* 0 - Set priority */ /* 1 - priority is OK where it is */ /* 2 - Don't set priority */ /* 1 and 2 are pretty much the same */ int listen_to_virtual_ips = TRUE; /* * No-fork flag. If set, we do not become a background daemon. */ int nofork; /* Fork by default */ #ifdef HAVE_DNSREGISTRATION /* * mDNS registration flag. If set, we attempt to register with the mDNS system, but only * after we have synched the first time. If the attempt fails, then try again once per * minute for up to 5 times. After all, we may be starting before mDNS. */ int mdnsreg = FALSE; int mdnstries = 5; #endif /* HAVE_DNSREGISTRATION */ #ifdef HAVE_DROPROOT int droproot; int root_dropped; char *user; /* User to switch to */ char *group; /* group to switch to */ const char *chrootdir; /* directory to chroot to */ uid_t sw_uid; gid_t sw_gid; struct group *gr; struct passwd *pw; #endif /* HAVE_DROPROOT */ #ifdef HAVE_WORKING_FORK int waitsync_fd_to_close = -1; /* -w/--wait-sync */ #endif /* * Version declaration */ extern const char *Version; char const *progname; int was_alarmed; #ifdef DECL_SYSCALL /* * We put this here, since the argument profile is syscall-specific */ extern int syscall (int, ...); #endif /* DECL_SYSCALL */ #if !defined(SIM) && defined(SIGDIE1) static volatile int signalled = 0; static volatile int signo = 0; /* In an ideal world, 'finish_safe()' would declared as noreturn... 
*/ static void finish_safe (int); static RETSIGTYPE finish (int); #endif #if !defined(SIM) && defined(HAVE_WORKING_FORK) static int wait_child_sync_if (int, long); #endif #if !defined(SIM) && !defined(SYS_WINNT) # ifdef DEBUG static RETSIGTYPE moredebug (int); static RETSIGTYPE lessdebug (int); # else /* !DEBUG follows */ static RETSIGTYPE no_debug (int); # endif /* !DEBUG */ #endif /* !SIM && !SYS_WINNT */ #ifndef WORK_FORK int saved_argc; char ** saved_argv; #endif #ifndef SIM int ntpdmain (int, char **); static void set_process_priority (void); static void assertion_failed (const char *, int, isc_assertiontype_t, const char *) __attribute__ ((__noreturn__)); static void library_fatal_error (const char *, int, const char *, va_list) ISC_FORMAT_PRINTF(3, 0); static void library_unexpected_error(const char *, int, const char *, va_list) ISC_FORMAT_PRINTF(3, 0); #endif /* !SIM */ /* Bug2332 unearthed a problem in the interaction of reduced user * privileges, the limits on memory usage and some versions of the * pthread library on Linux systems. The 'pthread_cancel()' function and * likely some others need to track the stack of the thread involved, * and uses a function that comes from GCC (--> libgcc_s.so) to do * this. Unfortunately the developers of glibc decided to load the * library on demand, which speeds up program start but can cause * trouble here: Due to all the things NTPD does to limit its resource * usage, this deferred load of libgcc_s does not always work once the * restrictions are in effect. * * One way out of this was attempting a forced link against libgcc_s * when possible because it makes the library available immediately * without deferred load. (The symbol resolution would still be dynamic * and on demand, but the code would already be in the process image.) * * This is a tricky thing to do, since it's not necessary everywhere, * not possible everywhere, has shown to break the build of other * programs in the NTP suite and is now generally frowned upon. * * So we take a different approach here: We creat a worker thread that does * actually nothing except waiting for cancellation and cancel it. If * this is done before all the limitations are put in place, the * machinery is pre-heated and all the runtime stuff should be in place * and useable when needed. * * This uses only the standard pthread API and should work with all * implementations of pthreads. It is not necessary everywhere, but it's * cheap enough to go on nearly unnoticed. * * Addendum: Bug 2954 showed that the assumption that this should work * with all OS is wrong -- at least FreeBSD bombs heavily. */ #ifdef NEED_PTHREAD_WARMUP /* simple thread function: sleep until cancelled, just to exercise * thread cancellation. */ static void* my_pthread_warmup_worker( void *thread_args) { (void)thread_args; for (;;) sleep(10); return NULL; } /* pre-heat threading: create a thread and cancel it, just to exercise * thread cancellation. 
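* (As explained in the longer comment above, this pulls libgcc_s into the process image before resource limits and privilege drops can make its on-demand load fail.)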
*/ static void my_pthread_warmup(void) { pthread_t thread; pthread_attr_t thr_attr; int rc; pthread_attr_init(&thr_attr); #if defined(HAVE_PTHREAD_ATTR_GETSTACKSIZE) && \ defined(HAVE_PTHREAD_ATTR_SETSTACKSIZE) && \ defined(PTHREAD_STACK_MIN) { size_t ssmin = 32*1024; /* 32kB should be minimum */ if (ssmin < PTHREAD_STACK_MIN) ssmin = PTHREAD_STACK_MIN; rc = pthread_attr_setstacksize(&thr_attr, ssmin); if (0 != rc) msyslog(LOG_ERR, "my_pthread_warmup: pthread_attr_setstacksize() -> %s", strerror(rc)); } #endif rc = pthread_create( &thread, &thr_attr, my_pthread_warmup_worker, NULL); pthread_attr_destroy(&thr_attr); if (0 != rc) { msyslog(LOG_ERR, "my_pthread_warmup: pthread_create() -> %s", strerror(rc)); } else { pthread_cancel(thread); pthread_join(thread, NULL); } } #endif /*defined(NEED_PTHREAD_WARMUP)*/ #ifdef NEED_EARLY_FORK static void dummy_callback(void) { return; } static void fork_nonchroot_worker(void) { getaddrinfo_sometime("localhost", "ntp", NULL, INITIAL_DNS_RETRY, (gai_sometime_callback)&dummy_callback, NULL); } #endif /* NEED_EARLY_FORK */ void parse_cmdline_opts( int * pargc, char ***pargv ) { static int parsed; static int optct; if (!parsed) optct = ntpOptionProcess(&ntpdOptions, *pargc, *pargv); parsed = 1; *pargc -= optct; *pargv += optct; } #ifdef SIM int main( int argc, char *argv[] ) { progname = argv[0]; parse_cmdline_opts(&argc, &argv); #ifdef DEBUG debug = OPT_VALUE_SET_DEBUG_LEVEL; DPRINTF(1, ("%s\n", Version)); #endif return ntpsim(argc, argv); } #else /* !SIM follows */ #ifdef NO_MAIN_ALLOWED CALL(ntpd,"ntpd",ntpdmain); #else /* !NO_MAIN_ALLOWED follows */ #ifndef SYS_WINNT int main( int argc, char *argv[] ) { return ntpdmain(argc, argv); } #endif /* !SYS_WINNT */ #endif /* !NO_MAIN_ALLOWED */ #endif /* !SIM */ #ifdef _AIX /* * OK. AIX is different than solaris in how it implements plock(). * If you do NOT adjust the stack limit, you will get the MAXIMUM * stack size allocated and PINNED with you program. To check the * value, use ulimit -a. * * To fix this, we create an automatic variable and set our stack limit * to that PLUS 32KB of extra space (we need some headroom). * * This subroutine gets the stack address. * * Grover Davidson and Matt Ladendorf * */ static char * get_aix_stack(void) { char ch; return (&ch); } /* * Signal handler for SIGDANGER. */ static void catch_danger(int signo) { msyslog(LOG_INFO, "ntpd: setpgid(): %m"); /* Make the system believe we'll free something, but don't do it! */ return; } #endif /* _AIX */ /* * Set the process priority */ #ifndef SIM static void set_process_priority(void) { # ifdef DEBUG if (debug > 1) msyslog(LOG_DEBUG, "set_process_priority: %s: priority_done is <%d>", ((priority_done) ? 
"Leave priority alone" : "Attempt to set priority" ), priority_done); # endif /* DEBUG */ # if defined(HAVE_SCHED_SETSCHEDULER) if (!priority_done) { extern int config_priority_override, config_priority; int pmax, pmin; struct sched_param sched; pmax = sched_get_priority_max(SCHED_FIFO); sched.sched_priority = pmax; if ( config_priority_override ) { pmin = sched_get_priority_min(SCHED_FIFO); if ( config_priority > pmax ) sched.sched_priority = pmax; else if ( config_priority < pmin ) sched.sched_priority = pmin; else sched.sched_priority = config_priority; } if ( sched_setscheduler(0, SCHED_FIFO, &sched) == -1 ) msyslog(LOG_ERR, "sched_setscheduler(): %m"); else ++priority_done; } # endif /* HAVE_SCHED_SETSCHEDULER */ # ifdef HAVE_RTPRIO # ifdef RTP_SET if (!priority_done) { struct rtprio srtp; srtp.type = RTP_PRIO_REALTIME; /* was: RTP_PRIO_NORMAL */ srtp.prio = 0; /* 0 (hi) -> RTP_PRIO_MAX (31,lo) */ if (rtprio(RTP_SET, getpid(), &srtp) < 0) msyslog(LOG_ERR, "rtprio() error: %m"); else ++priority_done; } # else /* !RTP_SET follows */ if (!priority_done) { if (rtprio(0, 120) < 0) msyslog(LOG_ERR, "rtprio() error: %m"); else ++priority_done; } # endif /* !RTP_SET */ # endif /* HAVE_RTPRIO */ # if defined(NTPD_PRIO) && NTPD_PRIO != 0 # ifdef HAVE_ATT_NICE if (!priority_done) { errno = 0; if (-1 == nice (NTPD_PRIO) && errno != 0) msyslog(LOG_ERR, "nice() error: %m"); else ++priority_done; } # endif /* HAVE_ATT_NICE */ # ifdef HAVE_BSD_NICE if (!priority_done) { if (-1 == setpriority(PRIO_PROCESS, 0, NTPD_PRIO)) msyslog(LOG_ERR, "setpriority() error: %m"); else ++priority_done; } # endif /* HAVE_BSD_NICE */ # endif /* NTPD_PRIO && NTPD_PRIO != 0 */ if (!priority_done) msyslog(LOG_ERR, "set_process_priority: No way found to improve our priority"); } #endif /* !SIM */ #if !defined(SIM) && !defined(SYS_WINNT) /* * Detach from terminal (much like daemon()) * Nothe that this function calls exit() */ # ifdef HAVE_WORKING_FORK static void detach_from_terminal( int pipe_fds[2], long wait_sync, const char *logfilename ) { int rc; int exit_code; # if !defined(HAVE_SETSID) && !defined (HAVE_SETPGID) && defined(TIOCNOTTY) int fid; # endif # ifdef _AIX struct sigaction sa; # endif rc = fork(); if (-1 == rc) { exit_code = (errno) ? errno : -1; msyslog(LOG_ERR, "fork: %m"); exit(exit_code); } if (rc > 0) { /* parent */ exit_code = wait_child_sync_if(pipe_fds[0], wait_sync); exit(exit_code); } /* * child/daemon * close all open files excepting waitsync_fd_to_close. * msyslog() unreliable until after init_logging(). */ closelog(); if (syslog_file != NULL) { fclose(syslog_file); syslog_file = NULL; syslogit = TRUE; } close_all_except(waitsync_fd_to_close); INSIST(0 == open("/dev/null", 0) && 1 == dup2(0, 1) \ && 2 == dup2(0, 2)); init_logging(progname, 0, TRUE); /* we lost our logfile (if any) daemonizing */ setup_logfile(logfilename); # ifdef SYS_DOMAINOS { uid_$t puid; status_$t st; proc2_$who_am_i(&puid); proc2_$make_server(&puid, &st); } # endif /* SYS_DOMAINOS */ # ifdef HAVE_SETSID if (setsid() == (pid_t)-1) msyslog(LOG_ERR, "setsid(): %m"); # elif defined(HAVE_SETPGID) if (setpgid(0, 0) == -1) msyslog(LOG_ERR, "setpgid(): %m"); # else /* !HAVE_SETSID && !HAVE_SETPGID follows */ # ifdef TIOCNOTTY fid = open("/dev/tty", 2); if (fid >= 0) { ioctl(fid, (u_long)TIOCNOTTY, NULL); close(fid); } # endif /* TIOCNOTTY */ ntp_setpgrp(0, getpid()); # endif /* !HAVE_SETSID && !HAVE_SETPGID */ # ifdef _AIX /* Don't get killed by low-on-memory signal. 
*/ sa.sa_handler = catch_danger; sigemptyset(&sa.sa_mask); sa.sa_flags = SA_RESTART; sigaction(SIGDANGER, &sa, NULL); # endif /* _AIX */ return; } # endif /* HAVE_WORKING_FORK */ #ifdef HAVE_DROPROOT /* * Map user name/number to user ID */ static int map_user( ) { char *endp; if (isdigit((unsigned char)*user)) { sw_uid = (uid_t)strtoul(user, &endp, 0); if (*endp != '\0') goto getuser; if ((pw = getpwuid(sw_uid)) != NULL) { free(user); user = estrdup(pw->pw_name); sw_gid = pw->pw_gid; } else { errno = 0; msyslog(LOG_ERR, "Cannot find user ID %s", user); return 0; } } else { getuser: errno = 0; if ((pw = getpwnam(user)) != NULL) { sw_uid = pw->pw_uid; sw_gid = pw->pw_gid; } else { if (errno) msyslog(LOG_ERR, "getpwnam(%s) failed: %m", user); else msyslog(LOG_ERR, "Cannot find user `%s'", user); return 0; } } return 1; } /* * Map group name/number to group ID */ static int map_group(void) { char *endp; if (isdigit((unsigned char)*group)) { sw_gid = (gid_t)strtoul(group, &endp, 0); if (*endp != '\0') goto getgroup; } else { getgroup: if ((gr = getgrnam(group)) != NULL) { sw_gid = gr->gr_gid; } else { errno = 0; msyslog(LOG_ERR, "Cannot find group `%s'", group); return 0; } } return 1; } static int set_group_ids(void) { if (user && initgroups(user, sw_gid)) { msyslog(LOG_ERR, "Cannot initgroups() to user `%s': %m", user); return 0; } if (group && setgid(sw_gid)) { msyslog(LOG_ERR, "Cannot setgid() to group `%s': %m", group); return 0; } if (group && setegid(sw_gid)) { msyslog(LOG_ERR, "Cannot setegid() to group `%s': %m", group); return 0; } if (group) { if (0 != setgroups(1, &sw_gid)) { msyslog(LOG_ERR, "setgroups(1, %d) failed: %m", sw_gid); return 0; } } else if (pw) if (0 != initgroups(pw->pw_name, pw->pw_gid)) { msyslog(LOG_ERR, "initgroups(<%s>, %d) filed: %m", pw->pw_name, pw->pw_gid); return 0; } return 1; } static int set_user_ids(void) { if (user && setuid(sw_uid)) { msyslog(LOG_ERR, "Cannot setuid() to user `%s': %m", user); return 0; } if (user && seteuid(sw_uid)) { msyslog(LOG_ERR, "Cannot seteuid() to user `%s': %m", user); return 0; } return 1; } /* * Change (effective) user and group IDs, also initialize the supplementary group access list */ int set_user_group_ids(void); int set_user_group_ids(void) { /* If the the user was already mapped, no need to map it again */ if ((NULL != user) && (0 == sw_uid)) { if (0 == map_user()) exit (-1); } /* same applies for the group */ if ((NULL != group) && (0 == sw_gid)) { if (0 == map_group()) exit (-1); } if (getegid() != sw_gid && 0 == set_group_ids()) return 0; if (geteuid() != sw_uid && 0 == set_user_ids()) return 0; return 1; } #endif /* HAVE_DROPROOT */ #endif /* !SIM */ /* * Main program. Initialize us, disconnect us from the tty if necessary, * and loop waiting for I/O and/or timer expiries. 
*/ #ifndef SIM int ntpdmain( int argc, char *argv[] ) { l_fp now; struct recvbuf *rbuf; const char * logfilename; # ifdef HAVE_UMASK mode_t uv; # endif # if defined(HAVE_GETUID) && !defined(MPE) /* MPE lacks the concept of root */ uid_t uid; # endif # if defined(HAVE_WORKING_FORK) long wait_sync = 0; int pipe_fds[2]; int rc; int exit_code; # endif /* HAVE_WORKING_FORK*/ # ifdef SCO5_CLOCK int fd; int zero; # endif # ifdef NEED_PTHREAD_WARMUP my_pthread_warmup(); # endif # ifdef HAVE_UMASK uv = umask(0); if (uv) umask(uv); else umask(022); # endif saved_argc = argc; saved_argv = argv; progname = argv[0]; initializing = TRUE; /* mark that we are initializing */ parse_cmdline_opts(&argc, &argv); # ifdef DEBUG debug = OPT_VALUE_SET_DEBUG_LEVEL; # ifdef HAVE_SETLINEBUF setlinebuf(stdout); # endif # endif if (HAVE_OPT(NOFORK) || HAVE_OPT(QUIT) # ifdef DEBUG || debug # endif || HAVE_OPT(SAVECONFIGQUIT)) nofork = TRUE; init_logging(progname, NLOG_SYNCMASK, TRUE); /* honor -l/--logfile option to log to a file */ if (HAVE_OPT(LOGFILE)) { logfilename = OPT_ARG(LOGFILE); syslogit = FALSE; change_logfile(logfilename, FALSE); } else { logfilename = NULL; if (nofork) msyslog_term = TRUE; if (HAVE_OPT(SAVECONFIGQUIT)) syslogit = FALSE; } msyslog(LOG_NOTICE, "%s: Starting", Version); { int i; char buf[1024]; /* Secret knowledge of msyslog buf length */ char *cp = buf; /* Note that every arg has an initial space character */ snprintf(cp, sizeof(buf), "Command line:"); cp += strlen(cp); for (i = 0; i < saved_argc ; ++i) { snprintf(cp, sizeof(buf) - (cp - buf), " %s", saved_argv[i]); cp += strlen(cp); } msyslog(LOG_INFO, "%s", buf); } /* * Install trap handlers to log errors and assertion failures. * Default handlers print to stderr which doesn't work if detached. */ isc_assertion_setcallback(assertion_failed); isc_error_setfatal(library_fatal_error); isc_error_setunexpected(library_unexpected_error); /* MPE lacks the concept of root */ # if defined(HAVE_GETUID) && !defined(MPE) uid = getuid(); if (uid && !HAVE_OPT( SAVECONFIGQUIT ) # if defined(HAVE_TRUSTEDBSD_MAC) /* We can run as non-root if the mac_ntpd policy is enabled. */ && mac_is_present("ntpd") != 1 # endif ) { msyslog_term = TRUE; msyslog(LOG_ERR, "must be run as root, not uid %ld", (long)uid); exit(1); } # endif /* * Enable the Multi-Media Timer for Windows? */ # ifdef SYS_WINNT if (HAVE_OPT( MODIFYMMTIMER )) set_mm_timer(MM_TIMER_HIRES); # endif #ifdef HAVE_DNSREGISTRATION /* * Enable mDNS registrations? */ if (HAVE_OPT( MDNS )) { mdnsreg = TRUE; } #endif /* HAVE_DNSREGISTRATION */ if (HAVE_OPT( NOVIRTUALIPS )) listen_to_virtual_ips = 0; /* * --interface, listen on specified interfaces */ if (HAVE_OPT( INTERFACE )) { int ifacect = STACKCT_OPT( INTERFACE ); const char** ifaces = STACKLST_OPT( INTERFACE ); sockaddr_u addr; while (ifacect-- > 0) { add_nic_rule( is_ip_address(*ifaces, AF_UNSPEC, &addr) ? MATCH_IFADDR : MATCH_IFNAME, *ifaces, -1, ACTION_LISTEN); ifaces++; } } if (HAVE_OPT( NICE )) priority_done = 0; # ifdef HAVE_SCHED_SETSCHEDULER if (HAVE_OPT( PRIORITY )) { config_priority = OPT_VALUE_PRIORITY; config_priority_override = 1; priority_done = 0; } # endif # ifdef HAVE_WORKING_FORK /* make sure the FDs are initialised */ pipe_fds[0] = -1; pipe_fds[1] = -1; do { /* 'loop' once */ if (!HAVE_OPT( WAIT_SYNC )) break; wait_sync = OPT_VALUE_WAIT_SYNC; if (wait_sync <= 0) { wait_sync = 0; break; } /* -w requires a fork() even with debug > 0 */ nofork = FALSE; if (pipe(pipe_fds)) { exit_code = (errno) ? 
errno : -1; msyslog(LOG_ERR, "Pipe creation failed for --wait-sync: %m"); exit(exit_code); } waitsync_fd_to_close = pipe_fds[1]; } while (0); /* 'loop' once */ # endif /* HAVE_WORKING_FORK */ init_lib(); # ifdef SYS_WINNT /* * Make sure the service is initialized before we do anything else */ ntservice_init(); /* * Start interpolation thread, must occur before first * get_systime() */ init_winnt_time(); # endif /* * Initialize random generator and public key pair */ get_systime(&now); ntp_srandom((int)(now.l_i * now.l_uf)); /* * Detach us from the terminal. May need an #ifndef GIZMO. */ if (!nofork) { # ifdef HAVE_WORKING_FORK detach_from_terminal(pipe_fds, wait_sync, logfilename); # endif /* HAVE_WORKING_FORK */ } # ifdef SCO5_CLOCK /* * SCO OpenServer's system clock offers much more precise timekeeping * on the base CPU than the other CPUs (for multiprocessor systems), * so we must lock to the base CPU. */ fd = open("/dev/at1", O_RDONLY); if (fd >= 0) { zero = 0; if (ioctl(fd, ACPU_LOCK, &zero) < 0) msyslog(LOG_ERR, "cannot lock to base CPU: %m"); close(fd); } # endif /* Setup stack size in preparation for locking pages in memory. */ # if defined(HAVE_MLOCKALL) # ifdef HAVE_SETRLIMIT ntp_rlimit(RLIMIT_STACK, DFLT_RLIMIT_STACK * 4096, 4096, "4k"); -# ifdef RLIMIT_MEMLOCK +# if defined(RLIMIT_MEMLOCK) && defined(DFLT_RLIMIT_MEMLOCK) && DFLT_RLIMIT_MEMLOCK != -1 /* * The default RLIMIT_MEMLOCK is very low on Linux systems. * Unless we increase this limit malloc calls are likely to * fail if we drop root privilege. To be useful the value * has to be larger than the largest ntpd resident set size. */ ntp_rlimit(RLIMIT_MEMLOCK, DFLT_RLIMIT_MEMLOCK * 1024 * 1024, 1024 * 1024, "MB"); # endif /* RLIMIT_MEMLOCK */ # endif /* HAVE_SETRLIMIT */ # else /* !HAVE_MLOCKALL follows */ # ifdef HAVE_PLOCK # ifdef PROCLOCK # ifdef _AIX /* * set the stack limit for AIX for plock(). * see get_aix_stack() for more info. */ if (ulimit(SET_STACKLIM, (get_aix_stack() - 8 * 4096)) < 0) msyslog(LOG_ERR, "Cannot adjust stack limit for plock: %m"); # endif /* _AIX */ # endif /* PROCLOCK */ # endif /* HAVE_PLOCK */ # endif /* !HAVE_MLOCKALL */ /* * Set up signals we pay attention to locally. */ # ifdef SIGDIE1 signal_no_reset(SIGDIE1, finish); signal_no_reset(SIGDIE2, finish); signal_no_reset(SIGDIE3, finish); signal_no_reset(SIGDIE4, finish); # endif # ifdef SIGBUS signal_no_reset(SIGBUS, finish); # endif # if !defined(SYS_WINNT) && !defined(VMS) # ifdef DEBUG (void) signal_no_reset(MOREDEBUGSIG, moredebug); (void) signal_no_reset(LESSDEBUGSIG, lessdebug); # else (void) signal_no_reset(MOREDEBUGSIG, no_debug); (void) signal_no_reset(LESSDEBUGSIG, no_debug); # endif /* DEBUG */ # endif /* !SYS_WINNT && !VMS */ /* * Set up signals we should never pay attention to. */ # ifdef SIGPIPE signal_no_reset(SIGPIPE, SIG_IGN); # endif /* * Call the init_ routines to initialize the data structures. * * Exactly what command-line options are we expecting here? */ INIT_SSL(); init_auth(); init_util(); init_restrict(); init_mon(); init_timer(); init_request(); init_control(); init_peer(); # ifdef REFCLOCK init_refclock(); # endif set_process_priority(); init_proto(); /* Call at high priority */ init_io(); init_loopfilter(); mon_start(MON_ON); /* monitor on by default now */ /* turn off in config if unwanted */ /* * Get the configuration. This is done in a separate module * since this will definitely be different for the gizmo board. 
*/ getconfig(argc, argv); if (-1 == cur_memlock) { # if defined(HAVE_MLOCKALL) /* * lock the process into memory */ if ( !HAVE_OPT(SAVECONFIGQUIT) # ifdef RLIMIT_MEMLOCK && -1 != DFLT_RLIMIT_MEMLOCK # endif && 0 != mlockall(MCL_CURRENT|MCL_FUTURE)) msyslog(LOG_ERR, "mlockall(): %m"); # else /* !HAVE_MLOCKALL follows */ # ifdef HAVE_PLOCK # ifdef PROCLOCK /* * lock the process into memory */ if (!HAVE_OPT(SAVECONFIGQUIT) && 0 != plock(PROCLOCK)) msyslog(LOG_ERR, "plock(PROCLOCK): %m"); # else /* !PROCLOCK follows */ # ifdef TXTLOCK /* * Lock text into ram */ if (!HAVE_OPT(SAVECONFIGQUIT) && 0 != plock(TXTLOCK)) msyslog(LOG_ERR, "plock(TXTLOCK) error: %m"); # else /* !TXTLOCK follows */ msyslog(LOG_ERR, "plock() - don't know what to lock!"); # endif /* !TXTLOCK */ # endif /* !PROCLOCK */ # endif /* HAVE_PLOCK */ # endif /* !HAVE_MLOCKALL */ } loop_config(LOOP_DRIFTINIT, 0); report_event(EVNT_SYSRESTART, NULL, NULL); initializing = FALSE; # ifdef HAVE_DROPROOT if (droproot) { #ifdef NEED_EARLY_FORK fork_nonchroot_worker(); #endif /* Drop super-user privileges and chroot now if the OS supports this */ # ifdef HAVE_LINUX_CAPABILITIES /* set flag: keep privileges across setuid() call (we only really need cap_sys_time): */ if (prctl( PR_SET_KEEPCAPS, 1L, 0L, 0L, 0L ) == -1) { msyslog( LOG_ERR, "prctl( PR_SET_KEEPCAPS, 1L ) failed: %m" ); exit(-1); } # elif HAVE_SOLARIS_PRIVS /* Nothing to do here */ # else /* we need a user to switch to */ if (user == NULL) { msyslog(LOG_ERR, "Need user name to drop root privileges (see -u flag!)" ); exit(-1); } # endif /* HAVE_LINUX_CAPABILITIES || HAVE_SOLARIS_PRIVS */ if (user != NULL) { if (0 == map_user()) exit (-1); } if (group != NULL) { if (0 == map_group()) exit (-1); } if (chrootdir ) { /* make sure cwd is inside the jail: */ if (chdir(chrootdir)) { msyslog(LOG_ERR, "Cannot chdir() to `%s': %m", chrootdir); exit (-1); } if (chroot(chrootdir)) { msyslog(LOG_ERR, "Cannot chroot() to `%s': %m", chrootdir); exit (-1); } if (chdir("/")) { msyslog(LOG_ERR, "Cannot chdir() to root after chroot(): %m"); exit (-1); } } # ifdef HAVE_SOLARIS_PRIVS if ((lowprivs = priv_str_to_set(LOWPRIVS, ",", NULL)) == NULL) { msyslog(LOG_ERR, "priv_str_to_set() failed:%m"); exit(-1); } if ((highprivs = priv_allocset()) == NULL) { msyslog(LOG_ERR, "priv_allocset() failed:%m"); exit(-1); } (void) getppriv(PRIV_PERMITTED, highprivs); (void) priv_intersect(highprivs, lowprivs); if (setppriv(PRIV_SET, PRIV_PERMITTED, lowprivs) == -1) { msyslog(LOG_ERR, "setppriv() failed:%m"); exit(-1); } # endif /* HAVE_SOLARIS_PRIVS */ if (0 == set_user_group_ids()) exit(-1); # if defined(HAVE_TRUSTEDBSD_MAC) /* * To manipulate system time and (re-)bind to NTP_PORT as needed * following interface changes, we must either run as uid 0 or * the mac_ntpd policy module must be enabled.
*/ if (sw_uid != 0 && mac_is_present("ntpd") != 1) { msyslog(LOG_ERR, "Need MAC 'ntpd' policy enabled to drop root privileges"); exit (-1); } # elif !defined(HAVE_LINUX_CAPABILITIES) && !defined(HAVE_SOLARIS_PRIVS) /* * for now assume that the privilege to bind to privileged ports * is associated with running with uid 0 - should be refined on * ports that allow binding to NTP_PORT with uid != 0 */ disable_dynamic_updates |= (sw_uid != 0); /* also notifies routing message listener */ # endif /* !HAVE_LINUX_CAPABILITIES && !HAVE_SOLARIS_PRIVS */ if (disable_dynamic_updates && interface_interval) { interface_interval = 0; msyslog(LOG_INFO, "running as non-root disables dynamic interface tracking"); } # ifdef HAVE_LINUX_CAPABILITIES { /* * We may be running under non-root uid now, but we still hold full root privileges! * We drop all of them, except for the crucial one or two: cap_sys_time and * cap_net_bind_service if doing dynamic interface tracking. */ cap_t caps; char *captext; captext = (0 != interface_interval) ? "cap_sys_time,cap_net_bind_service=pe" : "cap_sys_time=pe"; caps = cap_from_text(captext); if (!caps) { msyslog(LOG_ERR, "cap_from_text(%s) failed: %m", captext); exit(-1); } if (-1 == cap_set_proc(caps)) { msyslog(LOG_ERR, "cap_set_proc() failed to drop root privs: %m"); exit(-1); } cap_free(caps); } # endif /* HAVE_LINUX_CAPABILITIES */ # ifdef HAVE_SOLARIS_PRIVS if (priv_delset(lowprivs, "proc_setid") == -1) { msyslog(LOG_ERR, "priv_delset() failed:%m"); exit(-1); } if (setppriv(PRIV_SET, PRIV_PERMITTED, lowprivs) == -1) { msyslog(LOG_ERR, "setppriv() failed:%m"); exit(-1); } priv_freeset(lowprivs); priv_freeset(highprivs); # endif /* HAVE_SOLARIS_PRIVS */ root_dropped = TRUE; fork_deferred_worker(); } /* if (droproot) */ # endif /* HAVE_DROPROOT */ /* libseccomp sandboxing */ #if defined (LIBSECCOMP) && (KERN_SECCOMP) scmp_filter_ctx ctx; if ((ctx = seccomp_init(SCMP_ACT_KILL)) < 0) msyslog(LOG_ERR, "%s: seccomp_init(SCMP_ACT_KILL) failed: %m", __func__); else { msyslog(LOG_DEBUG, "%s: seccomp_init(SCMP_ACT_KILL) succeeded", __func__); } #ifdef __x86_64__ int scmp_sc[] = { SCMP_SYS(adjtimex), SCMP_SYS(bind), SCMP_SYS(brk), SCMP_SYS(chdir), SCMP_SYS(clock_gettime), SCMP_SYS(clock_settime), SCMP_SYS(close), SCMP_SYS(connect), SCMP_SYS(exit_group), SCMP_SYS(fstat), SCMP_SYS(fsync), SCMP_SYS(futex), SCMP_SYS(getitimer), SCMP_SYS(getsockname), SCMP_SYS(ioctl), SCMP_SYS(lseek), SCMP_SYS(madvise), SCMP_SYS(mmap), SCMP_SYS(munmap), SCMP_SYS(open), SCMP_SYS(poll), SCMP_SYS(read), SCMP_SYS(recvmsg), SCMP_SYS(rename), SCMP_SYS(rt_sigaction), SCMP_SYS(rt_sigprocmask), SCMP_SYS(rt_sigreturn), SCMP_SYS(select), SCMP_SYS(sendto), SCMP_SYS(setitimer), SCMP_SYS(setsid), SCMP_SYS(socket), SCMP_SYS(stat), SCMP_SYS(time), SCMP_SYS(write), }; #endif #ifdef __i386__ int scmp_sc[] = { SCMP_SYS(_newselect), SCMP_SYS(adjtimex), SCMP_SYS(brk), SCMP_SYS(chdir), SCMP_SYS(clock_gettime), SCMP_SYS(clock_settime), SCMP_SYS(close), SCMP_SYS(exit_group), SCMP_SYS(fsync), SCMP_SYS(futex), SCMP_SYS(getitimer), SCMP_SYS(madvise), SCMP_SYS(mmap), SCMP_SYS(mmap2), SCMP_SYS(munmap), SCMP_SYS(open), SCMP_SYS(poll), SCMP_SYS(read), SCMP_SYS(rename), SCMP_SYS(rt_sigaction), SCMP_SYS(rt_sigprocmask), SCMP_SYS(select), SCMP_SYS(setitimer), SCMP_SYS(setsid), SCMP_SYS(sigprocmask), SCMP_SYS(sigreturn), SCMP_SYS(socketcall), SCMP_SYS(stat64), SCMP_SYS(time), SCMP_SYS(write), }; #endif { int i; for (i = 0; i < COUNTOF(scmp_sc); i++) { if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, scmp_sc[i], 0) < 0) { msyslog(LOG_ERR, "%s:
seccomp_rule_add() failed: %m", __func__); } } } if (seccomp_load(ctx) < 0) msyslog(LOG_ERR, "%s: seccomp_load() failed: %m", __func__); else { msyslog(LOG_DEBUG, "%s: seccomp_load() succeeded", __func__); } #endif /* LIBSECCOMP and KERN_SECCOMP */ #ifdef SYS_WINNT ntservice_isup(); #endif # ifdef HAVE_IO_COMPLETION_PORT for (;;) { #if !defined(SIM) && defined(SIGDIE1) if (signalled) finish_safe(signo); #endif GetReceivedBuffers(); # else /* normal I/O */ BLOCK_IO_AND_ALARM(); was_alarmed = FALSE; for (;;) { #if !defined(SIM) && defined(SIGDIE1) if (signalled) finish_safe(signo); #endif if (alarm_flag) { /* alarmed? */ was_alarmed = TRUE; alarm_flag = FALSE; } /* collect async name/addr results */ if (!was_alarmed) harvest_blocking_responses(); if (!was_alarmed && !has_full_recv_buffer()) { /* * Nothing to do. Wait for something. */ io_handler(); } if (alarm_flag) { /* alarmed? */ was_alarmed = TRUE; alarm_flag = FALSE; } if (was_alarmed) { UNBLOCK_IO_AND_ALARM(); /* * Out here, signals are unblocked. Call timer routine * to process expiry. */ timer(); was_alarmed = FALSE; BLOCK_IO_AND_ALARM(); } # endif /* !HAVE_IO_COMPLETION_PORT */ # ifdef DEBUG_TIMING { l_fp pts; l_fp tsa, tsb; int bufcount = 0; get_systime(&pts); tsa = pts; # endif rbuf = get_full_recv_buffer(); while (rbuf != NULL) { if (alarm_flag) { was_alarmed = TRUE; alarm_flag = FALSE; } UNBLOCK_IO_AND_ALARM(); if (was_alarmed) { /* avoid timer starvation during lengthy I/O handling */ timer(); was_alarmed = FALSE; } /* * Call the data procedure to handle each received * packet. */ if (rbuf->receiver != NULL) { # ifdef DEBUG_TIMING l_fp dts = pts; L_SUB(&dts, &rbuf->recv_time); DPRINTF(2, ("processing timestamp delta %s (with prec. fuzz)\n", lfptoa(&dts, 9))); collect_timing(rbuf, "buffer processing delay", 1, &dts); bufcount++; # endif (*rbuf->receiver)(rbuf); } else { msyslog(LOG_ERR, "fatal: receive buffer callback NULL"); abort(); } BLOCK_IO_AND_ALARM(); freerecvbuf(rbuf); rbuf = get_full_recv_buffer(); } # ifdef DEBUG_TIMING get_systime(&tsb); L_SUB(&tsb, &tsa); if (bufcount) { collect_timing(NULL, "processing", bufcount, &tsb); DPRINTF(2, ("processing time for %d buffers %s\n", bufcount, lfptoa(&tsb, 9))); } } # endif /* * Go around again */ # ifdef HAVE_DNSREGISTRATION if (mdnsreg && (current_time - mdnsreg ) > 60 && mdnstries && sys_leap != LEAP_NOTINSYNC) { mdnsreg = current_time; msyslog(LOG_INFO, "Attempting to register mDNS"); if ( DNSServiceRegister (&mdns, 0, 0, NULL, "_ntp._udp", NULL, NULL, htons(NTP_PORT), 0, NULL, NULL, NULL) != kDNSServiceErr_NoError ) { if (!--mdnstries) { msyslog(LOG_ERR, "Unable to register mDNS, giving up."); } else { msyslog(LOG_INFO, "Unable to register mDNS, will try later."); } } else { msyslog(LOG_INFO, "mDNS service registered."); mdnsreg = FALSE; } } # endif /* HAVE_DNSREGISTRATION */ } UNBLOCK_IO_AND_ALARM(); return 1; } #endif /* !SIM */ #if !defined(SIM) && defined(SIGDIE1) /* * finish - exit gracefully */ static void finish_safe( int sig ) { const char *sig_desc; sig_desc = NULL; #ifdef HAVE_STRSIGNAL sig_desc = strsignal(sig); #endif if (sig_desc == NULL) sig_desc = ""; msyslog(LOG_NOTICE, "%s exiting on signal %d (%s)", progname, sig, sig_desc); /* See Bug 2513 and Bug 2522 re the unlink of PIDFILE */ # ifdef HAVE_DNSREGISTRATION if (mdns != NULL) DNSServiceRefDeallocate(mdns); # endif peer_cleanup(); exit(0); } static RETSIGTYPE finish( int sig ) { signalled = 1; signo = sig; } #endif /* !SIM && SIGDIE1 */ #ifndef SIM /* * wait_child_sync_if - implements parent side of 
-w/--wait-sync */ # ifdef HAVE_WORKING_FORK static int wait_child_sync_if( int pipe_read_fd, long wait_sync ) { int rc; int exit_code; time_t wait_end_time; time_t cur_time; time_t wait_rem; fd_set readset; struct timeval wtimeout; if (0 == wait_sync) return 0; /* waitsync_fd_to_close used solely by child */ close(waitsync_fd_to_close); wait_end_time = time(NULL) + wait_sync; do { cur_time = time(NULL); wait_rem = (wait_end_time > cur_time) ? (wait_end_time - cur_time) : 0; wtimeout.tv_sec = wait_rem; wtimeout.tv_usec = 0; FD_ZERO(&readset); FD_SET(pipe_read_fd, &readset); rc = select(pipe_read_fd + 1, &readset, NULL, NULL, &wtimeout); if (-1 == rc) { if (EINTR == errno) continue; exit_code = (errno) ? errno : -1; msyslog(LOG_ERR, "--wait-sync select failed: %m"); return exit_code; } if (0 == rc) { /* * select() indicated a timeout, but in case * its timeouts are affected by a step of the * system clock, select() again with a zero * timeout to confirm. */ FD_ZERO(&readset); FD_SET(pipe_read_fd, &readset); wtimeout.tv_sec = 0; wtimeout.tv_usec = 0; rc = select(pipe_read_fd + 1, &readset, NULL, NULL, &wtimeout); if (0 == rc) /* select() timeout */ break; else /* readable */ return 0; } else /* readable */ return 0; } while (wait_rem > 0); fprintf(stderr, "%s: -w/--wait-sync %ld timed out.\n", progname, wait_sync); return ETIMEDOUT; } # endif /* HAVE_WORKING_FORK */ /* * assertion_failed - Redirect assertion failures to msyslog(). */ static void assertion_failed( const char *file, int line, isc_assertiontype_t type, const char *cond ) { isc_assertion_setcallback(NULL); /* Avoid recursion */ msyslog(LOG_ERR, "%s:%d: %s(%s) failed", file, line, isc_assertion_typetotext(type), cond); msyslog(LOG_ERR, "exiting (due to assertion failure)"); #if defined(DEBUG) && defined(SYS_WINNT) if (debug) DebugBreak(); #endif abort(); } /* * library_fatal_error - Handle fatal errors from our libraries. */ static void library_fatal_error( const char *file, int line, const char *format, va_list args ) { char errbuf[256]; isc_error_setfatal(NULL); /* Avoid recursion */ msyslog(LOG_ERR, "%s:%d: fatal error:", file, line); vsnprintf(errbuf, sizeof(errbuf), format, args); msyslog(LOG_ERR, "%s", errbuf); msyslog(LOG_ERR, "exiting (due to fatal error in library)"); #if defined(DEBUG) && defined(SYS_WINNT) if (debug) DebugBreak(); #endif abort(); } /* * library_unexpected_error - Handle non fatal errors from our libraries. */ # define MAX_UNEXPECTED_ERRORS 100 int unexpected_error_cnt = 0; static void library_unexpected_error( const char *file, int line, const char *format, va_list args ) { char errbuf[256]; if (unexpected_error_cnt >= MAX_UNEXPECTED_ERRORS) return; /* avoid clutter in log */ msyslog(LOG_ERR, "%s:%d: unexpected error:", file, line); vsnprintf(errbuf, sizeof(errbuf), format, args); msyslog(LOG_ERR, "%s", errbuf); if (++unexpected_error_cnt == MAX_UNEXPECTED_ERRORS) msyslog(LOG_ERR, "Too many errors. Shutting up."); } #endif /* !SIM */ #if !defined(SIM) && !defined(SYS_WINNT) # ifdef DEBUG /* * moredebug - increase debugging verbosity */ static RETSIGTYPE moredebug( int sig ) { int saved_errno = errno; if (debug < 255) { debug++; msyslog(LOG_DEBUG, "debug raised to %d", debug); } errno = saved_errno; } /* * lessdebug - decrease debugging verbosity */ static RETSIGTYPE lessdebug( int sig ) { int saved_errno = errno; if (debug > 0) { debug--; msyslog(LOG_DEBUG, "debug lowered to %d", debug); } errno = saved_errno; } # else /* !DEBUG follows */ /* * no_debug - We don't do the debug here. 
*/ static RETSIGTYPE no_debug( int sig ) { int saved_errno = errno; msyslog(LOG_DEBUG, "ntpd not compiled for debugging (signal %d)", sig); errno = saved_errno; } # endif /* !DEBUG */ #endif /* !SIM && !SYS_WINNT */ Index: projects/clang900-import/contrib/ntp =================================================================== --- projects/clang900-import/contrib/ntp (revision 352586) +++ projects/clang900-import/contrib/ntp (revision 352587) Property changes on: projects/clang900-import/contrib/ntp ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/ntp:r351317-352586 Index: projects/clang900-import/share/man/man7/ascii.7 =================================================================== --- projects/clang900-import/share/man/man7/ascii.7 (revision 352586) +++ projects/clang900-import/share/man/man7/ascii.7 (revision 352587) @@ -1,153 +1,160 @@ .\" Copyright (c) 1989, 1990, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)ascii.7 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd October 30, 2017 +.Dd September 21, 2019 .Dt ASCII 7 .Os .Sh NAME .Nm ascii .Nd octal, hexadecimal, decimal and binary .Tn ASCII character sets .Sh DESCRIPTION The .Nm octal set: .Bd -literal -offset left 000 NUL 001 SOH 002 STX 003 ETX 004 EOT 005 ENQ 006 ACK 007 BEL 010 BS 011 HT 012 LF 013 VT 014 FF 015 CR 016 SO 017 SI 020 DLE 021 DC1 022 DC2 023 DC3 024 DC4 025 NAK 026 SYN 027 ETB 030 CAN 031 EM 032 SUB 033 ESC 034 FS 035 GS 036 RS 037 US 040 SP 041 ! 042 " 043 # 044 $ 045 % 046 & 047 ' 050 ( 051 ) 052 * 053 + 054 , 055 - 056 . 057 / 060 0 061 1 062 2 063 3 064 4 065 5 066 6 067 7 070 8 071 9 072 : 073 ; 074 < 075 = 076 > 077 ? 
100 @ 101 A 102 B 103 C 104 D 105 E 106 F 107 G 110 H 111 I 112 J 113 K 114 L 115 M 116 N 117 O 120 P 121 Q 122 R 123 S 124 T 125 U 126 V 127 W 130 X 131 Y 132 Z 133 [ 134 \e\ 135 ] 136 ^ 137 _ 140 ` 141 a 142 b 143 c 144 d 145 e 146 f 147 g 150 h 151 i 152 j 153 k 154 l 155 m 156 n 157 o 160 p 161 q 162 r 163 s 164 t 165 u 166 v 167 w 170 x 171 y 172 z 173 { 174 | 175 } 176 ~ 177 DEL .Ed .Pp The .Nm hexadecimal set: .Bd -literal -offset left 00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL 08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI 10 DLE 11 DC1 12 DC2 13 DC3 14 DC4 15 NAK 16 SYN 17 ETB 18 CAN 19 EM 1a SUB 1b ESC 1c FS 1d GS 1e RS 1f US 20 SP 21 ! 22 " 23 # 24 $ 25 % 26 & 27 ' 28 ( 29 ) 2a * 2b + 2c , 2d - 2e . 2f / 30 0 31 1 32 2 33 3 34 4 35 5 36 6 37 7 38 8 39 9 3a : 3b ; 3c < 3d = 3e > 3f ? 40 @ 41 A 42 B 43 C 44 D 45 E 46 F 47 G 48 H 49 I 4a J 4b K 4c L 4d M 4e N 4f O 50 P 51 Q 52 R 53 S 54 T 55 U 56 V 57 W 58 X 59 Y 5a Z 5b [ 5c \e\ 5d ] 5e ^ 5f _ 60 \` 61 a 62 b 63 c 64 d 65 e 66 f 67 g 68 h 69 i 6a j 6b k 6c l 6d m 6e n 6f o 70 p 71 q 72 r 73 s 74 t 75 u 76 v 77 w 78 x 79 y 7a z 7b { 7c | 7d } 7e ~ 7f DEL .Ed .Pp The .Nm decimal set: .Bd -literal -offset left 0 NUL 1 SOH 2 STX 3 ETX 4 EOT 5 ENQ 6 ACK 7 BEL 8 BS 9 HT 10 LF 11 VT 12 FF 13 CR 14 SO 15 SI 16 DLE 17 DC1 18 DC2 19 DC3 20 DC4 21 NAK 22 SYN 23 ETB 24 CAN 25 EM 26 SUB 27 ESC 28 FS 29 GS 30 RS 31 US 32 SP 33 ! 34 " 35 # 36 $ 37 % 38 & 39 ' 40 ( 41 ) 42 * 43 + 44 , 45 - 46 . 47 / 48 0 49 1 50 2 51 3 52 4 53 5 54 6 55 7 56 8 57 9 58 : 59 ; 60 < 61 = 62 > 63 ? 64 @ 65 A 66 B 67 C 68 D 69 E 70 F 71 G 72 H 73 I 74 J 75 K 76 L 77 M 78 N 79 O 80 P 81 Q 82 R 83 S 84 T 85 U 86 V 87 W 88 X 89 Y 90 Z 91 [ 92 \e\ 93 ] 94 ^ 95 _ 96 ` 97 a 98 b 99 c 100 d 101 e 102 f 103 g 104 h 105 i 106 j 107 k 108 l 109 m 110 n 111 o 112 p 113 q 114 r 115 s 116 t 117 u 118 v 119 w 120 x 121 y 122 z 123 { 124 | 125 } 126 ~ 127 DEL .Ed .Pp The .Nm binary set: .Bd -literal -offset left 00 01 10 11 NUL SP @ ` 00000 SOH ! A a 00001 STX " B b 00010 ETX # C c 00011 EOT $ D d 00100 ENQ % E e 00101 ACK & F f 00110 BEL ' G g 00111 BS ( H h 01000 HT ) I i 01001 LF * J j 01010 VT + K k 01011 FF , L l 01100 CR - M m 01101 SO . N n 01110 SI / O o 01111 DLE 0 P p 10000 DC1 1 Q q 10001 DC2 2 R r 10010 DC3 3 S s 10011 DC4 4 T t 10100 NAK 5 U u 10101 SYN 6 V v 10110 ETB 7 W w 10111 CAN 8 X x 11000 EM 9 Y y 11001 SUB : Z z 11010 ESC ; [ { 11011 FS < \e\ | 11100 GS = ] } 11101 RS > ^ - 11110 US ? _ DEL 11111 .Ed .Sh FILES .Bl -tag -width /usr/share/misc/ascii -compact .It Pa /usr/share/misc/ascii .El +.Sh STANDARDS +.Rs +.%T Information Systems - Coded Character Sets - 7-Bit American National\ + Standard Code for Information Interchange (7-Bit ASCII) +.%R INCITS 4-1986[R2017] +.%Q InterNational Committee for Information Technology Standards +.Re .Sh HISTORY An .Nm manual page appeared in -.At v7 . +.At v2 . Index: projects/clang900-import/stand/forth/color.4th =================================================================== --- projects/clang900-import/stand/forth/color.4th (revision 352586) +++ projects/clang900-import/stand/forth/color.4th (revision 352587) @@ -1,49 +1,55 @@ \ Copyright (c) 2011-2013 Devin Teske \ All rights reserved. \ \ Redistribution and use in source and binary forms, with or without \ modification, are permitted provided that the following conditions \ are met: \ 1. Redistributions of source code must retain the above copyright \ notice, this list of conditions and the following disclaimer. \ 2. 
Redistributions in binary form must reproduce the above copyright \ notice, this list of conditions and the following disclaimer in the \ documentation and/or other materials provided with the distribution. \ \ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND \ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE \ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE \ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE \ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL \ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS \ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) \ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT \ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY \ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF \ SUCH DAMAGE. \ \ $FreeBSD$ marker task-color.4th \ This function returns FALSE if the `loader_color' environment variable is set -\ to NO, no, or 0. Otherwise, TRUE is returned (unless booting serial). +\ to NO, no, or 0. It returns TRUE if `loader_color' is set to any other value. +\ If `loader_color' is unset, TRUE is returned (unless booting serial). \ -: loader_color? ( -- N ) +: loader_color? ( -- t ) s" loader_color" getenv dup -1 <> if - + \ `loader_color' is set. + \ Check if it is explicitly disabled. 2dup s" NO" compare-insensitive 0= if 2drop FALSE exit then 2dup s" 0" compare 0= if 2drop FALSE exit then drop + \ It is enabled. + TRUE + else + \ `loader_color' is unset. + \ Default to using color unless serial boot is active. + drop + boot_serial? 0= then - drop - - boot_serial? if FALSE else TRUE then ; Index: projects/clang900-import/stand/libsa/zalloc.c =================================================================== --- projects/clang900-import/stand/libsa/zalloc.c (revision 352586) +++ projects/clang900-import/stand/libsa/zalloc.c (revision 352587) @@ -1,338 +1,339 @@ /* * This module derived from code donated to the FreeBSD Project by * Matthew Dillon * * Copyright (c) 1998 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include /* * LIB/MEMORY/ZALLOC.C - self contained low-overhead memory pool/allocation * subsystem * * This subsystem implements memory pools and memory allocation * routines. * * Pools are managed via a linked list of 'free' areas. Allocating * memory creates holes in the freelist, freeing memory fills them. * Since the freelist consists only of free memory areas, it is possible * to allocate the entire pool without incurring any structural overhead. * * The system works best when allocating similarly-sized chunks of * memory. Care must be taken to avoid fragmentation when * allocating/deallocating dissimilar chunks. * * When a memory pool is first allocated, the entire pool is marked as * allocated. This is done mainly because we do not want to modify any * portion of a pool's data area until we are given permission. The * caller must explicitly deallocate portions of the pool to make them * available. * * z[n]xalloc() works like z[n]alloc() but the allocation is made from * within the specified address range. If the segment could not be * allocated, NULL is returned. WARNING! The address range will be * aligned to an 8 or 16 byte boundary depending on the cpu so if you * give an unaligned address range, unexpected results may occur. * * If a standard allocation fails, the reclaim function will be called * to recover some space. This usually causes other portions of the * same pool to be released. Memory allocations at this low level * should not block but you can do that too in your reclaim function * if you want. Reclaim does not function when z[n]xalloc() is used, * only for z[n]alloc(). * * Allocation and frees of 0 bytes are valid operations. */ #include "zalloc_defs.h" /* * Objects in the pool must be aligned to at least the size of struct MemNode. * They must also be aligned to MALLOCALIGN, which should normally be larger * than the struct, so assert that to be so at compile time. */ typedef char assert_align[(sizeof(struct MemNode) <= MALLOCALIGN) ? 1 : -1]; #define MEMNODE_SIZE_MASK MALLOCALIGN_MASK /* * znalloc() - allocate memory (without zeroing) from pool. Call reclaim * and retry if appropriate, return NULL if unable to allocate * memory. */ void * znalloc(MemPool *mp, uintptr_t bytes, size_t align) { MemNode **pmn; MemNode *mn; /* * align according to pool object size (can be 0). This is * inclusive of the MEMNODE_SIZE_MASK minimum alignment. * */ bytes = (bytes + MEMNODE_SIZE_MASK) & ~MEMNODE_SIZE_MASK; if (bytes == 0) return ((void *)-1); /* * locate freelist entry big enough to hold the object. If all objects * are the same size, this is a constant-time function. */ if (bytes > mp->mp_Size - mp->mp_Used) return (NULL); for (pmn = &mp->mp_First; (mn = *pmn) != NULL; pmn = &mn->mr_Next) { char *ptr = (char *)mn; uintptr_t dptr; char *aligned; size_t extra; dptr = (uintptr_t)(ptr + MALLOCALIGN); /* pointer to data */ aligned = (char *)(roundup2(dptr, align) - MALLOCALIGN); extra = aligned - ptr; if (bytes + extra > mn->mr_Bytes) continue; /* - * Cut extra from head and create new memory node from reminder. + * Cut extra from head and create new memory node from + * remainder. */ if (extra != 0) { MemNode *new; new = (MemNode *)aligned; new->mr_Next = mn->mr_Next; new->mr_Bytes = mn->mr_Bytes - extra; /* And update current memory node */ mn->mr_Bytes = extra; mn->mr_Next = new; /* In next iteration, we will get our aligned address */ continue; } /* * Cut a chunk of memory out of the beginning of this * block and fixup the link appropriately.
*/ if (mn->mr_Bytes == bytes) { *pmn = mn->mr_Next; } else { mn = (MemNode *)((char *)mn + bytes); mn->mr_Next = ((MemNode *)ptr)->mr_Next; mn->mr_Bytes = ((MemNode *)ptr)->mr_Bytes - bytes; *pmn = mn; } mp->mp_Used += bytes; return(ptr); } /* * Memory pool is full, return NULL. */ return (NULL); } /* * zfree() - free previously allocated memory */ void zfree(MemPool *mp, void *ptr, uintptr_t bytes) { MemNode **pmn; MemNode *mn; /* * align according to pool object size (can be 0). This is * inclusive of the MEMNODE_SIZE_MASK minimum alignment. */ bytes = (bytes + MEMNODE_SIZE_MASK) & ~MEMNODE_SIZE_MASK; if (bytes == 0) return; /* * panic if illegal pointer */ if ((char *)ptr < (char *)mp->mp_Base || (char *)ptr + bytes > (char *)mp->mp_End || ((uintptr_t)ptr & MEMNODE_SIZE_MASK) != 0) panic("zfree(%p,%ju): wild pointer", ptr, (uintmax_t)bytes); /* * free the segment */ mp->mp_Used -= bytes; for (pmn = &mp->mp_First; (mn = *pmn) != NULL; pmn = &mn->mr_Next) { /* * If area between last node and current node * - check range * - check merge with next area * - check merge with previous area */ if ((char *)ptr <= (char *)mn) { /* * range check */ if ((char *)ptr + bytes > (char *)mn) { panic("zfree(%p,%ju): corrupt memlist1", ptr, (uintmax_t)bytes); } /* * merge against next area or create independent area */ if ((char *)ptr + bytes == (char *)mn) { ((MemNode *)ptr)->mr_Next = mn->mr_Next; ((MemNode *)ptr)->mr_Bytes = bytes + mn->mr_Bytes; } else { ((MemNode *)ptr)->mr_Next = mn; ((MemNode *)ptr)->mr_Bytes = bytes; } *pmn = mn = (MemNode *)ptr; /* * merge against previous area (if there is a previous * area). */ if (pmn != &mp->mp_First) { if ((char *)pmn + ((MemNode*)pmn)->mr_Bytes == (char *)ptr) { ((MemNode *)pmn)->mr_Next = mn->mr_Next; ((MemNode *)pmn)->mr_Bytes += mn->mr_Bytes; mn = (MemNode *)pmn; } } return; } if ((char *)ptr < (char *)mn + mn->mr_Bytes) { panic("zfree(%p,%ju): corrupt memlist2", ptr, (uintmax_t)bytes); } } /* * We are beyond the last MemNode, append new MemNode. Merge against * previous area if possible. */ if (pmn == &mp->mp_First || (char *)pmn + ((MemNode *)pmn)->mr_Bytes != (char *)ptr) { ((MemNode *)ptr)->mr_Next = NULL; ((MemNode *)ptr)->mr_Bytes = bytes; *pmn = (MemNode *)ptr; mn = (MemNode *)ptr; } else { ((MemNode *)pmn)->mr_Bytes += bytes; mn = (MemNode *)pmn; } } /* * zextendPool() - extend memory pool to cover additional space. * * Note: the added memory starts out as allocated, you * must free it to make it available to the memory subsystem. * * Note: mp_Size may not reflect (mp_End - mp_Base) range * due to other parts of the system doing their own sbrk() * calls.
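 *
 * Illustrative usage sketch (not part of this file; `pool', `base',
 * `size' and `p' are placeholder names): a caller typically seeds a
 * pool by extending it over a backing region and then freeing that
 * region so it becomes allocatable, roughly:
 *
 *	static MemPool pool;
 *
 *	zextendPool(&pool, base, size);	(added space starts out allocated)
 *	zfree(&pool, base, size);	(now available for allocation)
 *	p = znalloc(&pool, 128, 16);	(16-byte aligned, NULL if exhausted)
 *	...
 *	zfree(&pool, p, 128);		(caller tracks the size it freed)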
*/ void zextendPool(MemPool *mp, void *base, uintptr_t bytes) { if (mp->mp_Size == 0) { mp->mp_Base = base; mp->mp_Used = bytes; mp->mp_End = (char *)base + bytes; mp->mp_Size = bytes; } else { void *pend = (char *)mp->mp_Base + mp->mp_Size; if (base < mp->mp_Base) { mp->mp_Size += (char *)mp->mp_Base - (char *)base; mp->mp_Used += (char *)mp->mp_Base - (char *)base; mp->mp_Base = base; } base = (char *)base + bytes; if (base > pend) { mp->mp_Size += (char *)base - (char *)pend; mp->mp_Used += (char *)base - (char *)pend; mp->mp_End = (char *)base; } } } #ifdef ZALLOCDEBUG void zallocstats(MemPool *mp) { int abytes = 0; int hbytes = 0; int fcount = 0; MemNode *mn; printf("%d bytes reserved", (int)mp->mp_Size); mn = mp->mp_First; if ((void *)mn != (void *)mp->mp_Base) { abytes += (char *)mn - (char *)mp->mp_Base; } while (mn != NULL) { if ((char *)mn + mn->mr_Bytes != mp->mp_End) { hbytes += mn->mr_Bytes; ++fcount; } if (mn->mr_Next != NULL) { abytes += (char *)mn->mr_Next - ((char *)mn + mn->mr_Bytes); } mn = mn->mr_Next; } printf(" %d bytes allocated\n%d fragments (%d bytes fragmented)\n", abytes, fcount, hbytes); } #endif Index: projects/clang900-import/stand/lua/color.lua =================================================================== --- projects/clang900-import/stand/lua/color.lua (revision 352586) +++ projects/clang900-import/stand/lua/color.lua (revision 352587) @@ -1,118 +1,116 @@ -- -- SPDX-License-Identifier: BSD-2-Clause-FreeBSD -- -- Copyright (c) 2015 Pedro Souza -- All rights reserved. -- -- Redistribution and use in source and binary forms, with or without -- modification, are permitted provided that the following conditions -- are met: -- 1. Redistributions of source code must retain the above copyright -- notice, this list of conditions and the following disclaimer. -- 2. Redistributions in binary form must reproduce the above copyright -- notice, this list of conditions and the following disclaimer in the -- documentation and/or other materials provided with the distribution. -- -- THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -- ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -- SUCH DAMAGE. -- -- $FreeBSD$ -- local core = require("core") local color = {} -- Module exports color.BLACK = 0 color.RED = 1 color.GREEN = 2 color.YELLOW = 3 color.BLUE = 4 color.MAGENTA = 5 color.CYAN = 6 color.WHITE = 7 color.DEFAULT = 0 color.BRIGHT = 1 color.DIM = 2 function color.isEnabled() local c = loader.getenv("loader_color") if c ~= nil then - if c:lower() == "no" or c == "0" then - return false - end + return c:lower() ~= "no" and c ~= "0" end return not core.isSerialBoot() end color.disabled = not color.isEnabled() function color.escapefg(color_value) if color.disabled then return color_value end return core.KEYSTR_CSI .. "3" .. color_value .. 
"m" end function color.resetfg() if color.disabled then return '' end return color.escapefg(color.WHITE) end function color.escapebg(color_value) if color.disabled then return color_value end return core.KEYSTR_CSI .. "4" .. color_value .. "m" end function color.resetbg() if color.disabled then return '' end return color.escapebg(color.BLACK) end function color.escape(fg_color, bg_color, attribute) if color.disabled then return "" end if attribute == nil then attribute = "" else attribute = attribute .. ";" end return core.KEYSTR_CSI .. attribute .. "3" .. fg_color .. ";4" .. bg_color .. "m" end function color.default() if color.disabled then return "" end return color.escape(color.WHITE, color.BLACK, color.DEFAULT) end function color.highlight(str) if color.disabled then return str end -- We need to reset attributes as well as color scheme here, just in -- case the terminal defaults don't match what we're expecting. return core.KEYSTR_CSI .. "1m" .. str .. core.KEYSTR_CSI .. "22m" end return color Index: projects/clang900-import/stand/powerpc/uboot/Makefile =================================================================== --- projects/clang900-import/stand/powerpc/uboot/Makefile (revision 352586) +++ projects/clang900-import/stand/powerpc/uboot/Makefile (revision 352587) @@ -1,34 +1,35 @@ # $FreeBSD$ LOADER_UFS_SUPPORT?= yes LOADER_CD9660_SUPPORT?= no LOADER_EXT2FS_SUPPORT?= no LOADER_NET_SUPPORT?= yes LOADER_NFS_SUPPORT?= yes LOADER_TFTP_SUPPORT?= no LOADER_GZIP_SUPPORT?= no LOADER_BZIP2_SUPPORT?= no .include BINDIR= /boot/uboot PROG= ubldr +STRIP= NEWVERSWHAT= "U-Boot loader" ${MACHINE_ARCH} INSTALLFLAGS= -b # Architecture-specific loader code SRCS= start.S conf.c vers.c ppc64_elf_freebsd.c SRCS+= ucmpdi2.c # Always add MI sources .include "${BOOTSRC}/loader.mk" .PATH: ${SYSDIR}/libkern LDFLAGS= -nostdlib -static -T ${.CURDIR}/ldscript.powerpc .include "${BOOTSRC}/uboot.mk" DPADD= ${LDR_INTERP} ${LIBUBOOT} ${LIBFDT} ${LIBUBOOT_FDT} ${LIBSA} LDADD= ${LDR_INTERP} ${LIBUBOOT} ${LIBFDT} ${LIBUBOOT_FDT} ${LIBSA} .include Index: projects/clang900-import/sys/amd64/amd64/pmap.c =================================================================== --- projects/clang900-import/sys/amd64/amd64/pmap.c (revision 352586) +++ projects/clang900-import/sys/amd64/amd64/pmap.c (revision 352587) @@ -1,10306 +1,10306 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * Copyright (c) 2014-2019 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #define AMD64_NPT_AWARE #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. 
* * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_ddb.h" #include "opt_pmap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include static __inline boolean_t pmap_type_guest(pmap_t pmap) { return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); } static __inline boolean_t pmap_emulate_ad_bits(pmap_t pmap) { return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); } static __inline pt_entry_t pmap_valid_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_V; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_V; else mask = EPT_PG_READ; break; default: panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_rw_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_RW; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_RW; else mask = EPT_PG_WRITE; break; default: panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static pt_entry_t pg_g; static __inline pt_entry_t pmap_global_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: mask = pg_g; break; case PT_RVI: case PT_EPT: mask = 0; break; default: panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_accessed_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_A; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_READ; else mask = EPT_PG_A; break; default: panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_modified_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_M; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_WRITE; else mask = EPT_PG_M; break; default: panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_pku_mask_bit(pmap_t pmap) { return (pmap->pm_type == PT_X86 ? 
X86_PG_PKU_MASK : 0); } #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int nkpt; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); static int ndmpdp; vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. 
*/ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx __exclusive_cache_line pv_chunks_mutex; static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; static u_long pv_invl_gen[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = NULL; caddr_t CADDR1 = 0; static vm_offset_t qframe = 0; static struct mtx qframe_mtx; static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ static vmem_t *large_vmem; static u_int lm_ents; #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); int __read_frequently pti = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pti, 0, "Page Table Isolation enabled"); static vm_object_t pti_obj; static pml4_entry_t *pti_pml4; static vm_pindex_t pti_pg_idx; static bool pti_finalized; struct pmap_pkru_range { struct rs_el pkru_rs_el; u_int pkru_keyidx; int pkru_flags; }; static uma_zone_t pmap_pkru_ranges_zone; static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void *pkru_dup_range(void *ctx, void *data); static void pkru_free_range(void *ctx, void *node); static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void pmap_pkru_deassign_all(pmap_t pmap); static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { int i; uint64_t res; res = 0; CPU_FOREACH(i) { res += cpuid_to_pcpu[i]->pc_pm_save_cnt; } return (sysctl_handle_64(oidp, &res, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", "Count of saved TLB context on switch"); static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); static struct mtx invl_gen_mtx; /* Fake lock object to satisfy turnstiles interface. */ static struct lock_object invl_gen_ts = { .lo_name = "invlts", }; static struct pmap_invl_gen pmap_invl_gen_head = { .gen = 1, .next = NULL, }; static u_long pmap_invl_gen = 1; static int pmap_invl_waiters; static struct callout pmap_invl_callout; static bool pmap_invl_callout_inited; #define PMAP_ASSERT_NOT_IN_DI() \ KASSERT(pmap_not_in_di(), ("DI already started")) static bool pmap_di_locked(void) { int tun; if ((cpu_feature2 & CPUID2_CX16) == 0) return (true); tun = 0; TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); return (tun != 0); } static int sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) { int locked; locked = pmap_di_locked(); return (sysctl_handle_int(oidp, &locked, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", "Locked delayed invalidation"); static bool pmap_not_in_di_l(void); static bool pmap_not_in_di_u(void); DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) { return (pmap_di_locked() ? 
pmap_not_in_di_l : pmap_not_in_di_u); } static bool pmap_not_in_di_l(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (invl_gen->gen == 0); } static void pmap_thread_init_invl_gen_l(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; } static void pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) { struct turnstile *ts; ts = turnstile_trywait(&invl_gen_ts); if (*m_gen > atomic_load_long(invl_gen)) turnstile_wait(ts, NULL, TS_SHARED_QUEUE); else turnstile_cancel(ts); } static void pmap_delayed_invl_finish_unblock(u_long new_gen) { struct turnstile *ts; turnstile_chain_lock(&invl_gen_ts); ts = turnstile_lookup(&invl_gen_ts); if (new_gen != 0) pmap_invl_gen = new_gen; if (ts != NULL) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts); } turnstile_chain_unlock(&invl_gen_ts); } /* * Start a new Delayed Invalidation (DI) block of code, executed by * the current thread. Within a DI block, the current thread may * destroy both the page table and PV list entries for a mapping and * then release the corresponding PV list lock before ensuring that * the mapping is flushed from the TLBs of any processors with the * pmap active. */ static void pmap_delayed_invl_start_l(void) { struct pmap_invl_gen *invl_gen; u_long currgen; invl_gen = &curthread->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); mtx_lock(&invl_gen_mtx); if (LIST_EMPTY(&pmap_invl_gen_tracker)) currgen = pmap_invl_gen; else currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; invl_gen->gen = currgen + 1; LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); mtx_unlock(&invl_gen_mtx); } /* * Finish the DI block, previously started by the current thread. All * required TLB flushes for the pages marked by * pmap_delayed_invl_page() must be finished before this function is * called. * * This function works by bumping the global DI generation number to * the generation number of the current thread's DI, unless there is a * pending DI that started earlier. In the latter case, bumping the * global DI generation number would incorrectly signal that the * earlier DI had finished. Instead, this function bumps the earlier * DI's generation number to match the generation number of the * current thread's DI. 
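 *
 * A worked example (the numbers are illustrative only): suppose the
 * global DI generation is 4, thread A starts a DI block and is
 * assigned generation 5, then thread B starts one and is assigned
 * generation 6.  If B finishes first it must not publish 6 as the
 * global generation, since that would claim A's still-pending DI had
 * also completed; instead B bumps A's generation to 6, and the global
 * number advances to 6 only once A finishes.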
*/ static void pmap_delayed_invl_finish_l(void) { struct pmap_invl_gen *invl_gen, *next; invl_gen = &curthread->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start")); mtx_lock(&invl_gen_mtx); next = LIST_NEXT(invl_gen, link); if (next == NULL) pmap_delayed_invl_finish_unblock(invl_gen->gen); else next->gen = invl_gen->gen; LIST_REMOVE(invl_gen, link); mtx_unlock(&invl_gen_mtx); invl_gen->gen = 0; } static bool pmap_not_in_di_u(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); } static void pmap_thread_init_invl_gen_u(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; } static bool pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) { uint64_t new_high, new_low, old_high, old_low; char res; old_low = new_low = 0; old_high = new_high = (uintptr_t)0; __asm volatile("lock;cmpxchg16b\t%1;sete\t%0" : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); if (res == 0) { if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) return (false); out->gen = old_low; out->next = (void *)old_high; } else { out->gen = new_low; out->next = (void *)new_high; } return (true); } static bool pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, struct pmap_invl_gen *new_val) { uint64_t new_high, new_low, old_high, old_low; char res; new_low = new_val->gen; new_high = (uintptr_t)new_val->next; old_low = old_val->gen; old_high = (uintptr_t)old_val->next; __asm volatile("lock;cmpxchg16b\t%1;sete\t%0" : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); return (res); } #ifdef PV_STATS static long invl_start_restart; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD, &invl_start_restart, 0, ""); static long invl_finish_restart; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, &invl_finish_restart, 0, ""); static int invl_max_qlen; SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, &invl_max_qlen, 0, ""); #endif static struct lock_delay_config __read_frequently di_delay; LOCK_DELAY_SYSINIT_DEFAULT(di_delay); static void pmap_delayed_invl_start_u(void) { struct pmap_invl_gen *invl_gen, *p, prev, new_prev; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; u_char pri; #ifdef PV_STATS int i, ii; #endif td = curthread; invl_gen = &td->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); lock_delay_arg_init(&lda, &di_delay); invl_gen->saved_pri = 0; pri = td->td_base_pri; if (pri > PVM) { thread_lock(td); pri = td->td_base_pri; if (pri > PVM) { invl_gen->saved_pri = pri; sched_prio(td, PVM); } thread_unlock(td); } again: PV_STAT(i = 0); for (p = &pmap_invl_gen_head;; p = prev.next) { PV_STAT(i++); prevl = atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } if (prevl == 0) break; prev.next = (void *)prevl; } #ifdef PV_STATS if ((ii = invl_max_qlen) < i) atomic_cmpset_int(&invl_max_qlen, ii, i); #endif if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } new_prev.gen = prev.gen; new_prev.next = invl_gen; invl_gen->gen = prev.gen + 1; /* Formal fence between store to invl->gen and updating *p. 
*/ atomic_thread_fence_rel(); /* * After inserting an invl_gen element with invalid bit set, * this thread blocks any other thread trying to enter the * delayed invalidation block. Do not allow to remove us from * the CPU, because it causes starvation for other threads. */ critical_enter(); /* * ABA for *p is not possible there, since p->gen can only * increase. So if the *p thread finished its di, then * started a new one and got inserted into the list at the * same place, its gen will appear greater than the previously * read gen. */ if (!pmap_di_store_invl(p, &prev, &new_prev)) { critical_exit(); PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } /* * There we clear PMAP_INVL_GEN_NEXT_INVALID in * invl_gen->next, allowing other threads to iterate past us. * pmap_di_store_invl() provides fence between the generation * write and the update of next. */ invl_gen->next = NULL; critical_exit(); } static bool pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, struct pmap_invl_gen *p) { struct pmap_invl_gen prev, new_prev; u_long mygen; /* * Load invl_gen->gen after setting invl_gen->next * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger * generations to propagate to our invl_gen->gen. Lock prefix * in atomic_set_ptr() worked as seq_cst fence. */ mygen = atomic_load_long(&invl_gen->gen); if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) return (false); KASSERT(prev.gen < mygen, ("invalid di gen sequence %lu %lu", prev.gen, mygen)); new_prev.gen = mygen; new_prev.next = (void *)((uintptr_t)invl_gen->next & ~PMAP_INVL_GEN_NEXT_INVALID); /* Formal fence between load of prev and storing update to it. */ atomic_thread_fence_rel(); return (pmap_di_store_invl(p, &prev, &new_prev)); } static void pmap_delayed_invl_finish_u(void) { struct pmap_invl_gen *invl_gen, *p; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; td = curthread; invl_gen = &td->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, ("missed invl_start: INVALID")); lock_delay_arg_init(&lda, &di_delay); again: for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { prevl = atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } if ((void *)prevl == invl_gen) break; } /* * It is legitimate to not find ourself on the list if a * thread before us finished its DI and started it again. */ if (__predict_false(p == NULL)) { PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_enter(); atomic_set_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { atomic_clear_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); critical_exit(); PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_exit(); if (atomic_load_int(&pmap_invl_waiters) > 0) pmap_delayed_invl_finish_unblock(0); if (invl_gen->saved_pri != 0) { thread_lock(td); sched_prio(td, invl_gen->saved_pri); thread_unlock(td); } } #ifdef DDB DB_SHOW_COMMAND(di_queue, pmap_di_queue) { struct pmap_invl_gen *p, *pn; struct thread *td; uintptr_t nextl; bool first; for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, first = false) { nextl = atomic_load_ptr(&p->next); pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); td = first ? 
NULL : __containerof(p, struct thread, td_md.md_invl_gen); db_printf("gen %lu inv %d td %p tid %d\n", p->gen, (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, td != NULL ? td->td_tid : -1); } } #endif #ifdef PV_STATS static long invl_wait; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0, "Number of times DI invalidation blocked pmap_remove_all/write"); static long invl_wait_slow; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0, "Number of slow invalidation waits for lockless DI"); #endif static u_long * pmap_delayed_invl_genp(vm_page_t m) { return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); } static void pmap_delayed_invl_callout_func(void *arg __unused) { if (atomic_load_int(&pmap_invl_waiters) == 0) return; pmap_delayed_invl_finish_unblock(0); } static void pmap_delayed_invl_callout_init(void *arg __unused) { if (pmap_di_locked()) return; callout_init(&pmap_invl_callout, 1); pmap_invl_callout_inited = true; } SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_delayed_invl_callout_init, NULL); /* * Ensure that all currently executing DI blocks, that need to flush * TLB for the given page m, actually flushed the TLB at the time the * function returned. If the page m has an empty PV list and we call * pmap_delayed_invl_wait(), upon its return we know that no CPU has a * valid mapping for the page m in either its page table or TLB. * * This function works by blocking until the global DI generation * number catches up with the generation number associated with the * given page m and its PV list. Since this function's callers * typically own an object lock and sometimes own a page lock, it * cannot sleep. Instead, it blocks on a turnstile to relinquish the * processor. */ static void pmap_delayed_invl_wait_l(vm_page_t m) { u_long *m_gen; #ifdef PV_STATS bool accounted = false; #endif m_gen = pmap_delayed_invl_genp(m); while (*m_gen > pmap_invl_gen) { #ifdef PV_STATS if (!accounted) { atomic_add_long(&invl_wait, 1); accounted = true; } #endif pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); } } static void pmap_delayed_invl_wait_u(vm_page_t m) { u_long *m_gen; struct lock_delay_arg lda; bool fast; fast = true; m_gen = pmap_delayed_invl_genp(m); lock_delay_arg_init(&lda, &di_delay); while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { if (fast || !pmap_invl_callout_inited) { PV_STAT(atomic_add_long(&invl_wait, 1)); lock_delay(&lda); fast = false; } else { /* * The page's invalidation generation number * is still below the current thread's number. * Prepare to block so that we do not waste * CPU cycles or worse, suffer livelock. * * Since it is impossible to block without * racing with pmap_delayed_invl_finish_u(), * prepare for the race by incrementing * pmap_invl_waiters and arming a 1-tick * callout which will unblock us if we lose * the race. */ atomic_add_int(&pmap_invl_waiters, 1); /* * Re-check the current thread's invalidation * generation after incrementing * pmap_invl_waiters, so that there is no race * with pmap_delayed_invl_finish_u() setting * the page generation and checking * pmap_invl_waiters. The only race allowed * is for a missed unblock, which is handled * by the callout. 
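 *
 * The resulting slow-path order is: bump pmap_invl_waiters,
 * re-check the generation, arm the one-tick callout, and only
 * then block on the turnstile; either the finishing DI thread
 * or the callout performs the wakeup.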
*/ if (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { callout_reset(&pmap_invl_callout, 1, pmap_delayed_invl_callout_func, NULL); PV_STAT(atomic_add_long(&invl_wait_slow, 1)); pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen_head.gen); } atomic_add_int(&pmap_invl_waiters, -1); } } } DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) { return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_start_l : pmap_delayed_invl_start_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_finish_l : pmap_delayed_invl_finish_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) { return (pmap_di_locked() ? pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u); } /* * Mark the page m's PV list as participating in the current thread's * DI block. Any threads concurrently using m's PV list to remove or * restrict all mappings to m will wait for the current thread's DI * block to complete before proceeding. * * The function works by setting the DI generation number for m's PV * list to at least the DI generation number of the current thread. * This forces a caller of pmap_delayed_invl_wait() to block until * current thread calls pmap_delayed_invl_finish(). */ static void pmap_delayed_invl_page(vm_page_t m) { u_long gen, *m_gen; rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; m_gen = pmap_delayed_invl_genp(m); if (*m_gen < gen) *m_gen = gen; } /* * Crashdump maps. */ static caddr_t crashdumpmap; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ /* * Internal flags for pmap_mapdev_internal() and * pmap_change_attr_locked(). */ #define MAPDEV_FLUSHCACHE 0x0000001 /* Flush cache after mapping. */ #define MAPDEV_SETATTR 0x0000002 /* Modify existing attrs. 
*/ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static int popcnt_pc_map_pq(uint64_t *map); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, int flags); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_large_map_getptp_unlocked(void); static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec); static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); static pd_entry_t *pmap_pti_pde(vm_offset_t va); static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); 
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if ((*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); return (PTmap + ((va >> PAGE_SHIFT) & mask)); } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= 
VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); return (PDmap + ((va >> PDRSHIFT) & mask)); } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } CTASSERT(powerof2(NDMPML4E)); /* number of kernel PDP slots */ #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) static void nkpt_init(vm_paddr_t addr) { int pt_pages; #ifdef NKPT pt_pages = NKPT; #else pt_pages = howmany(addr, 1 << PDRSHIFT); pt_pages += NKPDPE(pt_pages); /* * Add some slop beyond the bare minimum required for bootstrapping * the kernel. * * This is quite important when allocating KVA for kernel modules. * The modules are required to be linked in the negative 2GB of * the address space. If we run out of KVA in this region then * pmap_growkernel() will need to allocate page table pages to map * the entire 512GB of KVA space which is an unnecessary tax on * physical memory. * * Secondly, device memory mapped as part of setting up the low- * level console(s) is taken from KVA, starting at virtual_avail. * This is because cninit() is called after pmap_bootstrap() but * before vm_init() and pmap_init(). 20MB for a frame buffer is * not uncommon. */ pt_pages += 32; /* 64MB additional slop. */ #endif nkpt = pt_pages; } /* * Returns the proper write/execute permission for a physical page that is * part of the initial boot allocations. * * If the page has kernel text, it is marked as read-only. If the page has * kernel read-only data, it is marked as read-only/not-executable. If the * page has only read-write data, it is marked as read-write/not-executable. * If the page is below/above the kernel range, it is marked as read-write. * * This function operates on 2M pages, since we map the kernel space that * way. * * Note that this doesn't currently provide any protection for modules. */ static inline pt_entry_t bootaddr_rwx(vm_paddr_t pa) { /* * Everything in the same 2M page as the start of the kernel * should be static. On the other hand, things in the same 2M * page as the end of the kernel could be read-write/executable, * as the kernel image is not guaranteed to end on a 2M boundary. */ if (pa < trunc_2mpage(btext - KERNBASE) || pa >= trunc_2mpage(_end - KERNBASE)) return (X86_PG_RW); /* * The linker should ensure that the read-only and read-write * portions don't share the same 2M page, so this shouldn't * impact read-only data. However, in any case, any page with * read-write data needs to be read-write. */ if (pa >= trunc_2mpage(brwsection - KERNBASE)) return (X86_PG_RW | pg_nx); /* * Mark any 2M page containing kernel text as read-only. Mark * other pages with read-only data as read-only and not executable. * (It is likely a small portion of the read-only data section will * be marked as read-only, but executable. This should be acceptable * since the read-only protection will keep the data from changing.) * Note that fixups to the .text section will still work until we * set CR0.WP. 
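 *
 * Summary of the bits OR'ed into the bootstrap 2M PDEs: pages
 * outside the kernel image get X86_PG_RW; read-write data gets
 * X86_PG_RW | pg_nx; text gets 0 (read-only and executable); the
 * remaining read-only data gets pg_nx.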
*/ if (pa < round_2mpage(etext - KERNBASE)) return (0); return (pg_nx); } static void create_pagetables(vm_paddr_t *firstaddr) { int i, j, ndm1g, nkpdpe, nkdmpde; pd_entry_t *pd_p; pdp_entry_t *pdp_p; pml4_entry_t *p4_p; uint64_t DMPDkernphys; /* Allocate page table pages for the direct map */ ndmpdp = howmany(ptoa(Maxmem), NBPDP); if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); if (ndmpdpphys > NDMPML4E) { /* * Each NDMPML4E allows 512 GB, so limit to that, * and then readjust ndmpdp and ndmpdpphys. */ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); Maxmem = atop(NDMPML4E * NBPML4); ndmpdpphys = NDMPML4E; ndmpdp = NDMPML4E * NPDEPG; } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { /* * Calculate the number of 1G pages that will fully fit in * Maxmem. */ ndm1g = ptoa(Maxmem) >> PDPSHIFT; /* * Allocate 2M pages for the kernel. These will be used in * place of the first one or more 1G pages from ndm1g. */ nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP); DMPDkernphys = allocpages(firstaddr, nkdmpde); } if (ndm1g < ndmpdp) DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages */ KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); /* * Allocate the initial number of kernel page table pages required to * bootstrap. We defer this until after all memory-size dependent * allocations are done (e.g. direct map), so that we don't have to * build in too much slop in our estimate. * * Note that when NKPML4E > 1, we have an empty page underneath * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) * pages. (pmap_enter requires a PD page to exist for each KPML4E.) */ nkpt_init(*firstaddr); nkpdpe = NKPDPE(nkpt); KPTphys = allocpages(firstaddr, nkpt); KPDphys = allocpages(firstaddr, nkpdpe); /* * Connect the zero-filled PT pages to their PD entries. This * implicitly maps the PT pages at their correct locations within * the PTmap. */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Map from physical address zero to the end of loader preallocated * memory using 2MB pages. This replaces some of the PD entries * created above. */ for (i = 0; (i << PDRSHIFT) < KERNend; i++) /* Preset PG_M and PG_A because demotion expects it. */ pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT); /* * Because we map the physical blocks in 2M pages, adjust firstaddr * to record the physical blocks we've actually mapped into kernel * virtual address space. */ if (*firstaddr < round_2mpage(KERNend)) *firstaddr = round_2mpage(KERNend); /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If * the end of physical memory is not aligned to a 1GB page boundary, * then the residual physical memory is mapped with 2MB pages. Later, * if pmap_mapdev{_attr}() uses the direct map for non-write-back * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. 
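 *
 * Illustrative sizing for a hypothetical machine with Maxmem just
 * over 16 GB: ndmpdp = 17 and, when 1GB pages are supported,
 * ndm1g = 16, so the first 16 GB are mapped by 1GB PDPEs while the
 * 17th GB is mapped by the 512 2MB PDEs allocated at DMPDphys.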
*/ pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); pdp_p[i] |= X86_PG_RW | X86_PG_V; } /* * Instead of using a 1G page for the memory containing the kernel, * use 2M pages with appropriate permissions. (If using 1G pages, * this will partially overwrite the PDPEs above.) */ if (ndm1g) { pd_p = (pd_entry_t *)DMPDkernphys; for (i = 0; i < (NPDEPG * nkdmpde); i++) pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx | bootaddr_rwx(i << PDRSHIFT); for (i = 0; i < nkdmpde; i++) pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte, *pcpu_pte; uint64_t cr4, pcpu_phys; u_long res; int i; KERNend = *firstaddr; res = atop(KERNend - (vm_paddr_t)kernphys); if (!pti) pg_g = X86_PG_G; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); pcpu_phys = allocpages(firstaddr, MAXCPU); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Account for the virtual addresses mapped by create_pagetables(). */ virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Enable PG_G global pages, then switch to the kernel page * table from the bootstrap page table. After the switch, it * is possible to enable SMEP and SMAP since PG_U bits are * correct now. */ cr4 = rcr4(); cr4 |= CR4_PGE; load_cr4(cr4); load_cr3(KPML4phys); if (cpu_stdext_feature & CPUID_STDEXT_SMEP) cr4 |= CR4_SMEP; if (cpu_stdext_feature & CPUID_STDEXT_SMAP) cr4 |= CR4_SMAP; load_cr4(cr4); /* * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. 
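 *
 * Here "res", computed at the top of pmap_bootstrap() as
 * atop(KERNend - kernphys), covers the kernel image and the data
 * preallocated by the loader; it becomes the kernel pmap's initial
 * resident_count below.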
*/ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); kernel_pmap->pm_cr3 = KPML4phys; kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_stats.resident_count = res; kernel_pmap->pm_flags = pmap_flags; /* * Initialize the TLB invalidations generation number lock. */ mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Crashdump maps. The first page is reused as CMAP1 for the * memory test. */ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) CADDR1 = crashdumpmap; SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); virtual_avail = va; for (i = 0; i < MAXCPU; i++) { pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | pg_g | pg_nx | X86_PG_M | X86_PG_A; } STAILQ_INIT(&cpuhead); wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); amd64_bsp_pcpu_init1(&__pcpu[0]); amd64_bsp_ist_init(&__pcpu[0]); __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; /* * Initialize the PAT MSR. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. */ pmap_init_pat(); /* Initialize TLB Context Id. */ if (pmap_pcid_enabled) { for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; } /* * PMAP_PCID_KERN + 1 is used for initialization of * proc0 pmap. The pmap' pcid state might be used by * EFIRT entry before first context switch, so it * needs to be valid. */ PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); PCPU_SET(pcid_gen, 1); /* * pcpu area for APs is zeroed during AP startup. * pc_pcid_next and pc_pcid_gen are initialized by AP * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); } } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; u_long cr0, cr4; int i; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) panic("no PAT??"); /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = -1; pat_index[PAT_WRITE_BACK] = 0; pat_index[PAT_WRITE_THROUGH] = 1; pat_index[PAT_UNCACHEABLE] = 3; pat_index[PAT_WRITE_COMBINING] = 6; pat_index[PAT_WRITE_PROTECTED] = 5; pat_index[PAT_UNCACHED] = 2; /* * Initialize default PAT entries. * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * * Leave 4 and 7 as WB and UC. Note that a recursive page table * mapping for a 2M page uses a PAT value with the bit 3 set due * to its overload with PG_PS. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING) | PAT_VALUE(7, PAT_UNCACHEABLE); /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. 
*/ load_cr0(cr0); load_cr4(cr4); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t m, mpte; vm_size_t s; int error, i, pv_npg, ret, skz63; /* L1TF, reserve page @0 unconditionally */ vm_page_blacklist_add(0, bootverbose); /* Detect bare-metal Skylake Server and Skylake-X. */ if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { /* * Skylake-X errata SKZ63. Processor May Hang When * Executing Code In an HLE Transaction Region between * 40000000H and 403FFFFFH. * * Mark the pages in the range as preallocated. It * seems to be impossible to distinguish between * Skylake Server and Skylake X. */ skz63 = 1; TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); if (skz63 != 0) { if (bootverbose) printf("SKZ63: skipping 4M RAM starting " "at physical 1G\n"); for (i = 0; i < atop(0x400000); i++) { ret = vm_page_blacklist_add(0x40000000 + ptoa(i), FALSE); if (!ret && bootverbose) printf("page at %#lx already used\n", 0x40000000 + ptoa(i)); } } } /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); mpte->wire_count = 1; /* * Collect the page table pages that were replaced by a 2MB * page in create_pagetables(). They are zero filled. */ if (i << PDRSHIFT < KERNend && pmap_insert_pt_page(kernel_pmap, mpte, false)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); vm_wire_add(nkpt); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); /* * Allocate memory for the pv head table for superpages. 
*/ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); pmap_initialized = 1; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; /* Make the direct map consistent */ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } if (!bootverbose) continue; printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, ppim->pa, ppim->va, ppim->sz, ppim->mode); } mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); lm_ents = 8; TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); if (lm_ents > LMEPML4I - LMSPML4I + 1) lm_ents = LMEPML4I - LMSPML4I + 1; if (bootverbose) printf("pmap: large map %u PML4 slots (%lu Gb)\n", lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); if (lm_ents != 0) { large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); if (large_vmem == NULL) { printf("pmap: cannot create large map\n"); lm_ents = 0; } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); } } } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, "1GB page mapping counters"); static u_long pmap_pdpe_demotions; SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pdpe_demotions, 0, "1GB page demotions"); /*************************************************** * Low level helper routines..... ***************************************************/ static pt_entry_t pmap_swap_pat(pmap_t pmap, pt_entry_t entry) { int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* Verify that both PAT bits are not set at the same time */ KASSERT((entry & x86_pat_bits) != x86_pat_bits, ("Invalid PAT bits in entry %#lx", entry)); /* Swap the PAT bits if one of them is set */ if ((entry & x86_pat_bits) != 0) entry ^= x86_pat_bits; break; case PT_EPT: /* * Nothing to do - the memory attributes are represented * the same way for regular pages and superpages. */ break; default: panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); } return (entry); } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < PAT_INDEX_SIZE && pat_index[(int)mode] >= 0); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. 
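 *
 * Illustrative example: PAT_WRITE_COMBINING is programmed at PAT
 * index 6 above, so for native x86 page tables pmap_cache_bits()
 * returns PG_NC_PCD together with the PAT bit (X86_PG_PTE_PAT for
 * a 4KB PTE, X86_PG_PDE_PAT for a 2MB PDE) and leaves PG_NC_PWT
 * clear.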
*/ int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (!pmap_is_valid_memattr(pmap, mode)) panic("Unknown caching mode %d\n", mode); switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; break; case PT_EPT: cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); break; default: panic("unsupported pmap type %d", pmap->pm_type); } return (cache_bits); } static int pmap_cache_mask(pmap_t pmap, boolean_t is_pde) { int mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; break; case PT_EPT: mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); break; default: panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); } return (mask); } static int pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) { int pat_flag, pat_idx; pat_idx = 0; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; if ((pte & pat_flag) != 0) pat_idx |= 0x4; if ((pte & PG_NC_PCD) != 0) pat_idx |= 0x2; if ((pte & PG_NC_PWT) != 0) pat_idx |= 0x1; break; case PT_EPT: if ((pte & EPT_PG_IGNORE_PAT) != 0) panic("EPT PTE %#lx has no PAT memory type", pte); pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; break; } /* See pmap_init_pat(). */ if (pat_idx == 4) pat_idx = 0; if (pat_idx == 7) pat_idx = 3; return (pat_idx); } bool pmap_ps_enabled(pmap_t pmap) { return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static void pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) { switch (pmap->pm_type) { case PT_X86: break; case PT_RVI: case PT_EPT: /* * XXX * This is a little bogus since the generation number is * supposed to be bumped up when a region of the address * space is invalidated in the page tables. * * In this case the old PDE entry is valid but yet we want * to make sure that any mappings using the old entry are * invalidated in the TLB. * * The reason this works as expected is because we rendezvous * "all" host cpus and force any vcpu context to exit as a * side-effect. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); break; default: panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); } pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { pt_entry_t PG_G; if (pmap_type_guest(pmap)) return; KASSERT(pmap->pm_type == PT_X86, ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); PG_G = pmap_global_bit(pmap); if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. 
*/ invltlb_glob(); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ /* * Interrupt the cpus that are executing in the guest context. * This will force the vcpu to exit and the cached EPT mappings * will be invalidated by the host before the next vmresume. */ static __inline void pmap_invalidate_ept(pmap_t pmap) { int ipinum; sched_pin(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("pmap_invalidate_ept: absurd pm_active")); /* * The TLB mappings associated with a vcpu context are not * flushed each time a different vcpu is chosen to execute. * * This is in contrast with a process's vtop mappings that * are flushed from the TLB on each context switch. * * Therefore we need to do more than just a TLB shootdown on * the active cpus in 'pmap->pm_active'. To do this we keep * track of the number of invalidations performed on this pmap. * * Each vcpu keeps a cache of this counter and compares it * just before a vmresume. If the counter is out-of-date an * invept will be done to flush stale mappings from the TLB. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); /* * Force the vcpu to exit and trap back into the hypervisor. */ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; ipi_selected(pmap->pm_active, ipinum); sched_unpin(); } static cpuset_t pmap_invalidate_cpu_mask(pmap_t pmap) { return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); } static inline void pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap->pm_ucr3 != PMAP_NO_CR3) { /* * Because pm_pcid is recalculated on a * context switch, we must disable switching. * Otherwise, we might use a stale value * below. */ critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* * The fence is between stores to pm_gen and the read of the * pm_active mask. We need to ensure that it is impossible * for us to miss the bit update in pm_active and * simultaneously observe a non-zero pm_gen in * pmap_activate_sw(), otherwise TLB update is missed. * Without the fence, IA32 allows such an outcome. Note that * pm_active is updated by a locked operation, which provides * the reciprocal fence. 
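 *
 * In other words, this CPU stores pm_gen = 0 and then reads
 * pm_active (via the masked shootdown in the caller), while a CPU
 * activating the pmap sets its pm_active bit and then consults
 * pm_gen; the seq_cst fence here together with the locked update
 * of pm_active ensures at least one side observes the other's
 * store, so a stale PCID cannot survive.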
*/ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid(pmap, va, true); } static void pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid(pmap, va, false); } static void pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va) { } DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_page_pcid_invpcid : pmap_invalidate_page_pcid_noinvpcid); return (pmap_invalidate_page_nopcid); } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { invlpg(va); } else { if (pmap == PCPU_GET(curpmap)) invlpg(va); pmap_invalidate_page_mode(pmap, va); } smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) static void pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid(pmap, sva, eva, true); } static void pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid(pmap, sva, eva, false); } static void pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { } DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid : pmap_invalidate_range_pcid_noinvpcid); return (pmap_invalidate_range_nopcid); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else { if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } pmap_invalidate_range_mode(pmap, sva, eva); } smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap); sched_unpin(); } static inline void pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; if (pmap == kernel_pmap) { if (invpcid_works1) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); if (pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } } else { kcr3 = pmap->pm_cr3 | pcid; ucr3 = pmap->pm_ucr3; if (ucr3 != PMAP_NO_CR3) { ucr3 |= pcid | PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else { load_cr3(kcr3); } } critical_exit(); } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_all_pcid_invpcid(pmap_t pmap) { pmap_invalidate_all_pcid(pmap, true); } static void pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap) { pmap_invalidate_all_pcid(pmap, false); } static void pmap_invalidate_all_nopcid(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (pmap == PCPU_GET(curpmap)) invltlb(); } DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_all_pcid_invpcid : pmap_invalidate_all_pcid_noinvpcid); return (pmap_invalidate_all_nopcid); } void pmap_invalidate_all(pmap_t pmap) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); sched_pin(); pmap_invalidate_all_mode(pmap); smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_action(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. 
This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap || pmap_type_guest(pmap)) active = all_cpus; else { active = pmap->pm_active; } if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { invlpg(va); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); pcid = pmap->pm_pcids[0].pm_pcid; if (invpcid_works) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else if (pmap_pcid_enabled) pmap->pm_pcids[0].pm_gen = 0; } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct invpcid_descr d; vm_offset_t addr; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. pm_pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. 
pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); if (pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; if (pmap->pm_ucr3 != PMAP_NO_CR3) { ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 0].pm_pcid | PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else load_cr3(kcr3); } critical_exit(); } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) pmap_update_pde_invalidate(pmap, va, newpde); else pmap->pm_pcids[0].pm_gen = 0; } #endif /* !SMP */ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2MB page mapping was created * by a promotion that did not invalidate the 512 4KB page mappings * that might exist in the TLB. Consequently, at this point, the TLB * may hold both 4KB and 2MB page mappings for the address range [va, * va + NBPDR). Therefore, the entire range must be invalidated here. * In contrast, when PG_PROMOTED is clear, the TLB will not hold any * 4KB page mappings for the address range [va, va + NBPDR), and so a * single INVLPG suffices to invalidate the 2MB page mapping from the * TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + NBPDR - 1); else pmap_invalidate_page(pmap, va); } DEFINE_IFUNC(, void, pmap_invalidate_cache_range, (vm_offset_t sva, vm_offset_t eva)) { if ((cpu_feature & CPUID_SS) != 0) return (pmap_invalidate_cache_range_selfsnoop); if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_force_invalidate_cache_range); return (pmap_invalidate_cache_range_all); } #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) static void pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); } void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) { sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC range. The * local APIC is always uncached, so we don't need to flush * for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* * Do per-cache line flush. 
Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else { /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); pmap_invalidate_cache(); } /* * Remove the specified set of pages from the data and instruction caches. * * In contrast to pmap_invalidate_cache_range(), this function does not * rely on the CPU's self-snoop feature, because it is intended for use * when moving pages into a different cache domain. */ void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { vm_offset_t daddr, eva; int i; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) pmap_invalidate_cache(); else { if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (i = 0; i < count; i++) { daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); eva = daddr + PAGE_SIZE; for (; daddr < eva; daddr += cpu_clflush_line_size) { if (useclflushopt) clflushopt(daddr); else clflush(daddr); } } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } void pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { pmap_force_invalidate_cache_range(sva, eva); return; } /* See comment in pmap_force_invalidate_cache_range(). */ if (pmap_kextract(sva) == lapic_paddr) return; sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clwb(sva); sfence(); } void pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) { pt_entry_t *pte; vm_offset_t vaddr; int error, pte_bits; KASSERT((spa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: spa not page-aligned")); KASSERT((epa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: epa not page-aligned")); if (spa < dmaplimit) { pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( dmaplimit, epa))); if (dmaplimit >= epa) return; spa = dmaplimit; } pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | X86_PG_V; error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); pte = vtopte(vaddr); for (; spa < epa; spa += PAGE_SIZE) { sched_pin(); pte_store(pte, spa | pte_bits); invlpg(vaddr); /* XXXKIB sfences inside flush_cache_range are excessive */ pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); sched_unpin(); } vmem_free(kernel_arena, vaddr, PAGE_SIZE); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. 
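 *
 * The lookup handles all three leaf sizes: a 1GB PDPE yields
 * pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK), a 2MB PDE yields
 * (*pde & PG_PS_FRAME) | (va & PDRMASK), and a 4KB PTE yields
 * (*pte & PG_FRAME) | (va & PAGE_MASK).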
*/ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); else { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & PG_V) != 0) { if ((*pde & PG_PS) != 0) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pte = pmap_pde_to_pte(pde, va); pa = (*pte & PG_FRAME) | (va & PAGE_MASK); } } } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; vm_page_t m; m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdep = pmap_pde(pmap, va); if (pdep != NULL && (pde = *pdep)) { if (pde & PG_PS) { if ((pde & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0) m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); } else { pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) != 0 && ((pte & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0)) m = PHYS_TO_VM_PAGE(pte & PG_FRAME); } if (m != NULL && !vm_page_wire_mapped(m)) m = NULL; } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { pa = pmap_large_map_kextract(va); } else { pde = *vtopde(va); if (pde & PG_PS) { pa = (pde & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = *pmap_pde_to_pte(&pde, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; int cache_bits; pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. 
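 * On amd64 the direct map covers all of physical memory, so the
 * implementation below simply returns the direct-map address of 'start'
 * and leaves '*virt' untouched.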
*/ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V); } pte++; } if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. 
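 * Releases cascade upward: _pmap_unwire_ptp() clears the parent table
 * entry and then drops the wire count of the page that mapped it, so
 * freeing a PT page may unwire its PD page, and freeing a PD page may
 * unwire its PDP page.  The freed pages are queued on "free" and are
 * only returned to the system after the TLB shootdown completes.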
*/ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; *pml4 = 0; } } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { struct proc *p; struct thread *td; int i; PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); pmap->pm_pml4u = NULL; pmap->pm_cr3 = KPML4phys; /* hack to keep pmap_pti_pcid_invalidate() alive */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; pmap->pm_pcids[i].pm_gen = 1; } pmap_activate_boot(pmap); td = curthread; if (pti) { p = td->td_proc; PROC_LOCK(p); p->p_md.md_flags |= P_MD_KPTI; PROC_UNLOCK(p); } pmap_thread_init_invl_gen(td); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } } void pmap_pinit_pml4(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); /* Wire in kernel global address entries. 
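 * Every pmap initialized here shares the kernel's structures: the KVA
 * and direct-map PML4 entries point at the global kernel PDP pages, a
 * self-referential slot is installed so that the page tables can be
 * reached through the recursive mapping, and the configured large-map
 * slots are copied from kernel_pmap.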
*/ for (i = 0; i < NKPML4E; i++) { pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } for (i = 0; i < ndmpdpphys; i++) { pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } /* install self-referential address mapping entry(s) */ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; /* install large map entries if configured */ for (i = 0; i < lm_ents; i++) pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; } static void pmap_pinit_pml4_pti(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); for (i = 0; i < NPML4EPG; i++) pm_pml4[i] = pti_pml4[i]; } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pml4pg, pml4pgu; vm_paddr_t pml4phys; int i; /* * allocate the page directory page */ pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); pml4phys = VM_PAGE_TO_PHYS(pml4pg); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_pml4u = NULL; pmap->pm_type = pm_type; if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); /* * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. * Install minimal kernel mappings in PTI case. */ if (pm_type == PT_X86) { pmap->pm_cr3 = pml4phys; pmap_pinit_pml4(pml4pg); if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(pml4pgu)); pmap_pinit_pml4_pti(pml4pgu); pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); } if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_init(&pmap->pm_pkru, pkru_dup_range, pkru_free_range, pmap, M_NOWAIT); } } pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = flags; pmap->pm_eptgen = 0; return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); PMAP_ASSERT_NOT_IN_DI(); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. 
While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4, *pml4u; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { /* * PTI: Make all user-space mappings in the * kernel-mode page table no-execute so that * we detect any programming errors that leave * the kernel-mode page table active on return * to user space. */ if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml4 |= pg_nx; pml4u = &pmap->pm_pml4u[pml4index]; *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); pdppg->wire_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. */ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->wire_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; pdp_entry_t *pdpe, PG_V; vm_page_t pdpg; PG_V = pmap_valid_bit(pmap); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->wire_count++; } else { /* Allocate a pd page. 
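 * Page table pages are identified by pindex range: values below NUPDE
 * name PT pages, values in [NUPDE, NUPDE + NUPDPE) name PD pages, and
 * values above that name PDP pages.  Adding NUPDE to pdpindex below
 * therefore asks _pmap_allocpte() for the page directory page covering
 * this address.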
*/ ptepindex = pmap_pde_pindex(va); pdpindex = ptepindex >> NPDPEPGSHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); if (pdpg == NULL && lockp != NULL) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; vm_page_t m; PG_V = pmap_valid_bit(pmap); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pml4[KPML4BASE + i] = 0; for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ pmap->pm_pml4[DMPML4I + i] = 0; pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ for (i = 0; i < lm_ents; i++) /* Large Map */ pmap->pm_pml4[LMSPML4I + i] = 0; vm_page_unwire_noq(m); vm_page_free_zero(m); if (pmap->pm_pml4u != NULL) { m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); vm_page_unwire_noq(m); vm_page_free(m); } if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_fini(&pmap->pm_pkru); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * Allocate physical memory for the vm_page array and map it into KVA, * attempting to back the vm_pages with domain-local memory. 
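 * The array is mapped with 2MB pages; each 2MB chunk is backed by memory
 * allocated from the NUMA domain whose vm_page structures it will hold,
 * so a page's metadata is resident in the same domain as the page
 * itself.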
*/ void pmap_page_array_startup(long pages) { pdp_entry_t *pdpe; pd_entry_t *pde, newpdir; vm_offset_t va, start, end; vm_paddr_t pa; long pfn; int domain, i; vm_page_array_size = pages; start = va = VM_MIN_KERNEL_ADDRESS; end = va + pages * sizeof(struct vm_page); while (va < end) { pfn = first_page + (va - start) / sizeof(struct vm_page); domain = _vm_phys_domain(ptoa(pfn)); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) { pa = vm_phys_early_alloc(domain, PAGE_SIZE); dump_add_page(pa); pagezero((void *)PHYS_TO_DMAP(pa)); *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); } pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) != 0) panic("Unexpected pde"); pa = vm_phys_early_alloc(domain, NBPDR); for (i = 0; i < NPDEPG; i++) dump_add_page(pa + i * PAGE_SIZE); newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS | pg_g | pg_nx); pde_store(pde, newpdir); va += NBPDR; } vm_page_array = (vm_page_t)start; } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t *pdpe; mtx_assert(&kernel_map->system_mtx, MA_OWNED); /* * Return if "addr" is within the range of kernel page table pages * that were preallocated during pmap bootstrap. Moreover, leave * "kernel_vm_end" and the kernel page table as they were. * * The correctness of this action is based on the following * argument: vm_map_insert() allocates contiguous ranges of the * kernel virtual address space. It calls this function if a range * ends after "kernel_vm_end". If the kernel is mapped between * "kernel_vm_end" and "addr", then the range cannot begin at * "kernel_vm_end". In fact, its beginning address cannot be less * than the kernel. Thus, there is no immediate need to allocate * any new kernel page table pages between "kernel_vm_end" and * "KERNBASE". */ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) return; addr = roundup2(addr, NBPDR); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if ((*pde & X86_PG_V) != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. 
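 * (A pv chunk below is a single page holding _NPCPV (168) pv entries
 * plus a three-word free bitmap; 168 = 64 + 64 + 40, which is why
 * PC_FREE2 has only its low 40 bits set.)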
***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif static void reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) { if (pmap == NULL) return; pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (start_di) pmap_delayed_invl_finish(); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; bool start_di; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; PG_G = PG_A = PG_M = PG_RW = 0; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; /* * A delayed invalidation block should already be active if * pmap_advise() or pmap_remove() called this function by way * of pmap_demote_pde_locked(). 
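 * start_di therefore records whether this call must open its own
 * delayed-invalidation block for each pmap that is scanned, and close it
 * again when that pmap is left.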
*/ start_di = pmap_not_in_di(); mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pc_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); if (start_di) pmap_delayed_invl_start(); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { if (start_di) pmap_delayed_invl_start(); mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } else if (start_di) pmap_delayed_invl_start(); PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfq(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_W) != 0) continue; tpte = pte_load_clear(pte); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_delayed_invl_page(m); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde, &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. 
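 * Freeing even a single entry lets the caller's pending allocation
 * succeed on retry, so the scan stops once something has been reclaimed
 * from locked_pmap.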
*/ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. 
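 * Free entries are located by scanning the chunk's three 64-bit bitmaps
 * with bsfq; a set bit 'bit' in pc_map[field] names entry
 * field * 64 + bit, for at most _NPCPV (168) entries per chunk.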
*/ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Returns the number of one bits within the given PV chunk map. * * The erratas for Intel processors state that "POPCNT Instruction May * Take Longer to Execute Than Expected". It is believed that the * issue is the spurious dependency on the destination register. * Provide a hint to the register rename logic that the destination * value is overwritten, by clearing it, as suggested in the * optimization manual. It should be cheap for unaffected processors * as well. * * Reference numbers for erratas are * 4th Gen Core: HSD146 * 5th Gen Core: BDM85 * 6th Gen Core: SKL029 */ static int popcnt_pc_map_pq(uint64_t *map) { u_long result, tmp; __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" : "=&r" (result), "=&r" (tmp) : "m" (map[0]), "m" (map[1]), "m" (map[2])); return (result); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. 
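 * The number of available entries per chunk is counted with POPCNT (or
 * bit_count() when the instruction is absent), and each chunk allocated
 * here contributes _NPCPV spare entries toward "needed".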
*/ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { #ifndef __POPCNT__ if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); else #endif free = popcnt_pc_map_pq(pc->pc_map); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. 
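 * A demotion consumes NPTEPG - 1 (511) pv entries, which is why
 * pmap_demote_pde_locked() reserves them before the PDE is changed; the
 * loop below pulls them straight out of the pmap's chunk free bitmaps.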
*/ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. 
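 * PMAP_ENTER_NORECLAIM is honored by passing a NULL lock pointer to
 * get_pv_entry(), which disables reclamation of pv chunks from other
 * pmaps.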
*/ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = pde & PG_PS_FRAME; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static void pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) { #ifdef INVARIANTS #ifdef DIAGNOSTIC pt_entry_t *xpte, *ypte; for (xpte = firstpte; xpte < firstpte + NPTEPG; xpte++, newpte += PAGE_SIZE) { if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { printf("pmap_demote_pde: xpte %zd and newpte map " "different pages: found %#lx, expected %#lx\n", xpte - firstpte, *xpte, newpte); printf("page table dump\n"); for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) printf("%zd %#lx\n", ypte - firstpte, *ypte); panic("firstpte"); } } #else KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); #endif #endif } static void pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t oldpde, struct rwlock **lockp) { struct spglist free; vm_offset_t sva; SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_pde(pmap, pde, sva, &free, lockp); if ((oldpde & pmap_global_bit(pmap)) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", va, pmap); } static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; int PG_PTE_CACHE; bool in_kernel; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); in_kernel = va >= VM_MAXUSER_ADDRESS; oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. 
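 * (Demoting a mapping that was never accessed would allocate and fill a
 * page table page to little benefit, so the mapping is removed instead.)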
*/ if ((oldpde & PG_A) == 0) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: a wired mapping is missing PG_A")); pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. */ KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS), ("pmap_demote_pde: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ mpte = vm_page_alloc(NULL, pmap_pde_pindex(va), (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (mpte == NULL) { pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } if (!in_kernel) { mpte->wire_count = NPTEPG; pmap_resident_count_inc(pmap, 1); } } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; newpte = pmap_swap_pat(pmap, newpte); /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); pmap_demote_pde_check(firstpte, newpte); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_pde() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ if (in_kernel) pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
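 * The kernel PDE is not simply cleared: it is replaced with a mapping of
 * the page table page (zeroed first if it still holds stale entries), so
 * the paging structures for the kernel range stay intact even though the
 * individual 4KB mappings are gone.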
*/ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (mpte->valid != 0) pagezero((void *)PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 2mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_page(m); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; vm_page_t m; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if 
(TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_delayed_invl_page(m); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; lock = NULL; pmap_remove_pte(pmap, pte, va, *pde, free, &lock); if (lock != NULL) rw_wunlock(lock); pmap_invalidate_page(pmap, va); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) { pt_entry_t PG_G, *pte; vm_offset_t va; bool anyvalid; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_G = pmap_global_bit(pmap); anyvalid = false; va = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, sva += PAGE_SIZE) { if (*pte == 0) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } if ((*pte & PG_G) == 0) anyvalid = true; else if (va == eva) va = sva; if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { sva += PAGE_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); return (anyvalid); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t PG_G, PG_V; struct spglist free; int anyvalid; PG_G = pmap_global_bit(pmap); PG_V = pmap_valid_bit(pmap); /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); pmap_delayed_invl_start(); PMAP_LOCK(pmap); pmap_pkru_on_remove(pmap, sva, eva); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, pde, sva, &free, &lock); continue; } else if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { /* The large page mapping was destroyed. 
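 * A failed demotion removes the 2MB mapping outright, via
 * pmap_demote_pde_abort(), so there is nothing left at this PDE and the
 * loop moves on to the next range.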
*/ continue; } else ptpaddr = *pde; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) anyvalid = 1; } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, va); (void)pmap_demote_pde_locked(pmap, pde, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. 
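 * The page is marked dirty only when the mapping was both writeable
 * (PG_RW) and modified (PG_M); the accessed bit was already folded into
 * PGA_REFERENCED just above.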
*/ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); pmap_delayed_invl_wait(m); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_pde: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_page_t m, mt; boolean_t anychanged; pt_entry_t PG_G, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((prot & VM_PROT_WRITE) == 0) { if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } newpde &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; /* * Although this function delays and batches the invalidation * of stale TLB entries, it does not need to call * pmap_delayed_invl_start() and * pmap_delayed_invl_finish(), because it does not * ordinarily destroy mappings. Stale TLB entries from * protection-only changes need only be invalidated before the * pmap lock is released, because protection-only changes do * not destroy PV entries. Even operations that iterate over * a physical page's PV list of mappings, like * pmap_remove_write(), acquire the pmap lock for each * mapping. Consequently, for protection-only changes, the * pmap lock suffices to synchronize both page table and TLB * updates. * * This function only destroys a mapping if pmap_demote_pde() * fails. In that case, stale TLB entries are immediately * invalidated. 
*/ PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, pde, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; vm_page_t mpte; int PG_PTE_CACHE; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. 
*/ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); /* * Propagate the PAT index to its proper position. */ newpde = pmap_swap_pat(pmap, newpde); /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); atomic_add_long(&pmap_pde_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. * * When destroying both a page table and PV entry, this function * performs the TLB invalidation before releasing the PV list * lock, so we do not need pmap_delayed_invl_page() calls here. 
*/ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; boolean_t nosleep; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va); if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || pmap_demote_pde_locked(pmap, pde, va, &lock))) { pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); mpte->wire_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = *pte; pv = NULL; if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) newpte |= pmap_pkru_get(pmap, va); /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. 
*/ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = pte_load_clear(pte); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); origpte = 0; } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. 
Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_V = pmap_valid_bit(pmap); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t oldpde, *pde; pt_entry_t PG_G, PG_RW, PG_V; vm_page_t mt, pdpg; KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, ("pmap_enter_pde: cannot create wired user mapping")); PG_G = pmap_global_bit(pmap); PG_RW = pmap_rw_bit(pmap); KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } /* * If pkru is not same for the whole pde range, return failure * and let vm_fault() cope. Check after pde allocation, since * it could sleep. */ if (!pmap_pkru_same(pmap, va, va + NBPDR)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } return (KERN_FAILURE); } if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { newpde &= ~X86_PG_PKU_MASK; newpde |= pmap_pkru_get(pmap, va); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; oldpde = *pde; if ((oldpde & PG_V) != 0) { KASSERT(pdpg->wire_count > 1, ("pmap_enter_pde: pdpg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { pdpg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldpde & PG_PS) != 0) { /* * The reference to the PD page that was acquired by * pmap_allocpde() ensures that it won't be freed. * However, if the PDE resulted from a promotion, then * a reserved PT page could be freed. 
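			 * Any page freed this way is collected on the "free"
			 * list initialized above and released once the old
			 * mapping has been torn down.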
*/ (void)pmap_remove_pde(pmap, pde, va, &free, lockp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { pmap_delayed_invl_start(); if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, lockp)) pmap_invalidate_all(pmap); pmap_delayed_invl_finish(); } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_pde() and pmap_remove_ptes() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); } else KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); atomic_add_long(&pmap_pde_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... 
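 * If a mapping already exists at the given address, or if a PV entry
 * cannot be allocated without reclaiming another, the quick path simply
 * gives up and returns without establishing the mapping.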
*/ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pt_entry_t newpte, *pte, PG_V; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_allocpte(pmap, ptepindex, NULL); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = vtopte(va); } if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); newpte = VM_PAGE_TO_PHYS(m) | PG_V | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U | pmap_pkru_get(pmap, va); pte_store(pte, newpte); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. 
*/ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; int pat_mode; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!pmap_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2MB pages. Since "ptepa" is 2M aligned and * "size" is a multiple of 2M, adding the PAT setting to "pa" * will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pdpg = pmap_allocpde(pmap, addr, NULL); if (pdpg == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += NBPDR; continue; } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(addr)]; if ((*pde & PG_V) == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->wire_count--; KASSERT(pdpg->wire_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_pde() for the wired entry must never fail, * pmap_delayed_invl_start()/finish() calls around the * function are not needed. 
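 * A PDE or PTE in the range that is valid but lacks PG_W triggers a
 * panic, since it indicates the caller's wiring accounting is wrong.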
*/ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { atomic_clear_long(pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, PG_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde, srcptepaddr; pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_pdpg, dstmpte, srcmpte; if (dst_addr != src_addr) return; if (dst_pmap->pm_type != src_pmap->pm_type) return; /* * EPT page table entries that require emulation of A/D bits are * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit * (aka EPT_PG_EXECUTE) could still be set. Since some EPT * implementations flag an EPT misconfiguration for exec-only * mappings we skip this function entirely for emulated pmaps. 
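 * Skipping the copy is harmless: this routine is only an optimization,
 * and the destination pmap will simply fault its mappings in on first
 * access instead.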
*/ if (pmap_emulate_ad_bits(dst_pmap)) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } PG_A = pmap_accessed_bit(dst_pmap); PG_M = pmap_modified_bit(dst_pmap); PG_V = pmap_valid_bit(dst_pmap); for (addr = src_addr; addr < end_addr; addr = va_next) { KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pml4e = pmap_pml4e(src_pmap, addr); if ((*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; if (va_next < addr) va_next = end_addr; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) { va_next = (addr + NBPDP) & ~PDPMASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL); if (dst_pdpg == NULL) break; pde = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); pde = &pde[pmap_pde_index(addr)]; if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { *pde = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else dst_pdpg->wire_count--; continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = *src_pte; /* * We only virtual copy managed pages. */ if ((ptetemp & PG_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_pde_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->wire_count++; } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not mapped, * paging-structure caches could * nonetheless have entries that refer * to the freed page table pages. * Invalidate those entries. */ pmap_invalidate_page(dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->wire_count >= srcmpte->wire_count) break; } } out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) { int error; if (dst_pmap->pm_type != src_pmap->pm_type || dst_pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (0); for (;;) { if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } error = pmap_pkru_copy(dst_pmap, src_pmap); /* Clean up partial copy on failure due to no memory. 
		 */
		if (error == ENOMEM)
			pmap_pkru_deassign_all(dst_pmap);
		PMAP_UNLOCK(src_pmap);
		PMAP_UNLOCK(dst_pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

/*
 * Zero the specified hardware page.
 */
void
pmap_zero_page(vm_page_t m)
{
	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

	pagezero((void *)va);
}

/*
 * Zero an area within a single hardware page.  off and size must not
 * cover an area beyond a single hardware page.
 */
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

	if (off == 0 && size == PAGE_SIZE)
		pagezero((void *)va);
	else
		bzero((char *)va + off, size);
}

/*
 * Copy 1 specified hardware page to another.
 */
void
pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
{
	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));

	pagecopy((void *)src, (void *)dst);
}

int unmapped_buf_allowed = 1;

void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
    vm_offset_t b_offset, int xfersize)
{
	void *a_cp, *b_cp;
	vm_page_t pages[2];
	vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
	int cnt;
	boolean_t mapped;

	while (xfersize > 0) {
		a_pg_offset = a_offset & PAGE_MASK;
		pages[0] = ma[a_offset >> PAGE_SHIFT];
		b_pg_offset = b_offset & PAGE_MASK;
		pages[1] = mb[b_offset >> PAGE_SHIFT];
		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
		mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
		a_cp = (char *)vaddr[0] + a_pg_offset;
		b_cp = (char *)vaddr[1] + b_pg_offset;
		bcopy(a_cp, b_cp, cnt);
		if (__predict_false(mapped))
			pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
		a_offset += cnt;
		b_offset += cnt;
		xfersize -= cnt;
	}
}

/*
 * Returns true if the pmap's pv is one of the first
 * 16 pvs linked to from this page.  This count may
 * be changed upwards or downwards in the future; it
 * is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
 */
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pv_entry_t pv;
	int loops = 0;
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_page_exists_quick: page %p is not managed", m));
	rv = FALSE;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		if (PV_PMAP(pv) == pmap) {
			rv = TRUE;
			break;
		}
		loops++;
		if (loops >= 16)
			break;
	}
	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			if (PV_PMAP(pv) == pmap) {
				rv = TRUE;
				break;
			}
			loops++;
			if (loops >= 16)
				break;
		}
	}
	rw_runlock(lock);
	return (rv);
}

/*
 * pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.
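 *	Both 4KB mappings, found on the page's own pv list, and 2MB
 *	mappings, found on the pv list of the containing superpage, are
 *	included in the count.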
*/ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. * * Although this function destroys all of the pmap's managed, * non-wired mappings, it can delay and batch the invalidation of TLB * entries without calling pmap_delayed_invl_start() and * pmap_delayed_invl_finish(). Because the pmap is not active on * any other processor, none of these TLB entries will ever be used * before their eventual invalidation. Consequently, there is no need * for either pmap_remove_all() or pmap_remove_write() to wait for * that eventual TLB invalidation. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde; pt_entry_t *pte, tpte; pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. 
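	 * The INVARIANTS block below only verifies that no other CPU has
	 * this pmap in its pm_active set at this instant.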
*/ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); #ifdef INVARIANTS { cpuset_t other_cpus; other_cpus = all_cpus; critical_enter(); CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); critical_exit(); KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); } #endif lock = NULL; PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pdpe(pmap, pv->pv_va); ptepde = *pte; pte = pmap_pdpe_to_pde(pte, pv->pv_va); tpte = *pte; if ((tpte & (PG_PS | PG_V)) == PG_V) { superpage = FALSE; ptepde = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = *pte; } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on PG_PS being set. * * This is because PG_PS is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. 
*/ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); pmap_pkru_deassign_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. 
*/ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte, PG_V; boolean_t rv; PG_V = pmap_valid_bit(pmap); rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; vm_offset_t va; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde_locked(pmap, pde, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { if (!pmap_emulate_ad_bits(pmap)) return (TRUE); KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); /* * XWR = 010 or 110 will cause an unconditional EPT misconfiguration * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared * if the EPT_PG_WRITE bit is set. */ if ((pte & EPT_PG_WRITE) != 0) return (FALSE); /* * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. */ if ((pte & EPT_PG_EXECUTE) == 0 || ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) return (TRUE); else return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; boolean_t demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va); oldpde = *pde; if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((oldpde & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (oldpde & PG_W) == 0) { if (safe_to_clear_referenced(pmap, oldpde)) { atomic_clear_long(pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); demoted = FALSE; } else if (pmap_demote_pde_locked(pmap, pde, pv->pv_va, &lock)) { /* * Remove the mapping to a single page * so that a subsequent access may * repromote. Since the underlying * page table page is fully populated, * this removal never frees a page * table page. */ demoted = TRUE; va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); pmap_invalidate_page(pmap, va); } else demoted = TRUE; if (demoted) { /* * The superpage mapping was removed * entirely and therefore 'pv' is no * longer valid. */ if (pvf == pv) pvf = NULL; pv = NULL; } cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. 
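		 * Moving the just-examined entry to the tail spreads
		 * successive calls across the page's mappings instead of
		 * always revisiting the head of the list.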
*/ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { if (safe_to_clear_referenced(pmap, *pte)) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else if ((*pte & PG_W) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_pte(pmap, pte, pv->pv_va, *pde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; vm_offset_t va, va_next; vm_page_t m; bool anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; /* * A/D bit emulation requires an alternate code path when clearing * the modified and accessed bits below. Since this function is * advisory in nature we skip it entirely for pmaps that require * A/D bit emulation. 
*/ if (pmap_emulate_ad_bits(pmap)) return; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = false; pmap_delayed_invl_start(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldpde & PG_W) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); anychanged = true; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_long(pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == va_next) va = sva; } else anychanged = true; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_M, PG_RW; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
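/* A fictitious page is never part of a 2MB mapping, so it has no pv_table entry; use the shared, always-empty dummy list head instead. */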
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); oldpde = *pde; /* If oldpde has PG_RW set, then it also has PG_M set. */ if ((oldpde & PG_RW) != 0 && pmap_demote_pde_locked(pmap, pde, va, &lock) && (oldpde & PG_W) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); atomic_clear_long(pte, PG_M | PG_RW); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~mask; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~mask; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ static void * pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = trunc_page(pa); if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. 
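* A slot matches when it covers the same physical range and either its cache mode agrees or the caller did not ask for attributes to be set (MAPDEV_SETATTR clear).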
*/ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && (ppim->mode == mode || (flags & MAPDEV_SETATTR) == 0)) return ((void *)(ppim->va + offset)); } /* * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ if (pa < dmaplimit && pa + size <= dmaplimit) { va = PHYS_TO_DMAP(pa); if ((flags & MAPDEV_SETATTR) != 0) { PMAP_LOCK(kernel_pmap); i = pmap_change_attr_locked(va, size, mode, flags); PMAP_UNLOCK(kernel_pmap); } else i = 0; if (!i) return ((void *)(va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); if ((flags & MAPDEV_FLUSHCACHE) != 0) pmap_invalidate_cache_range(va, va + tmpsize); return ((void *)(va + offset)); } void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | MAPDEV_SETATTR)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, MAPDEV_SETATTR)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, MAPDEV_FLUSHCACHE)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) { pdp_entry_t newpdpe, oldpdpe; pd_entry_t *firstpde, newpde, *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pdpgpa; vm_page_t pdpg; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *pdpe; KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } pdpgpa = VM_PAGE_TO_PHYS(pdpg); firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = newpde; newpde += NBPDR; } /* * Demote the mapping. */ *pdpe = newpdpe; /* * Invalidate a stale recursive mapping of the page directory page. 
*/ pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); pmap_pdpe_demotions++; CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pat_mode)) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode, MAPDEV_FLUSHCACHE); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, int flags) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde, error; boolean_t changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (pdpe == NULL || *pdpe == 0) return (EINVAL); if (*pdpe & PG_PS) { /* * If the current 1GB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. 
*/ if ((tmpva & PDPMASK) == 0 && tmpva + PDPMASK < base + size) { tmpva += NBPDP; continue; } if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) return (ENOMEM); } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde == 0) return (EINVAL); if (*pde & PG_PS) { /* * If the current 2MB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) return (ENOMEM); } pte = pmap_pde_to_pte(pde, tmpva); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pdpe, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } else if (pa_end == (*pdpe & PG_PS_FRAME)) pa_end += NBPDP; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } } tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } else if (pa_end == (*pde & PG_PS_FRAME)) pa_end += NBPDR; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flags); if (error != 0) break; /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte, X86_PG_PTE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (*pte & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, flags); if (error != 0) break; /* Start physical address run. 
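* Physically contiguous pages are coalesced into runs so that the direct map alias of each run is updated by a single recursive call rather than page by page.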
*/ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode, flags); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); if ((flags & MAPDEV_FLUSHCACHE) != 0) pmap_invalidate_cache_range(base, tmpva); } return (error); } /* * Demotes any mapping within the direct map region that covers more than the * specified range of physical addresses. This range's size must be a power * of two and its starting address must be a multiple of its size. Since the * demotion does not change any attributes of the mapping, a TLB invalidation * is not mandatory. The caller may, however, request a TLB invalidation. */ void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_offset_t va; boolean_t changed; if (len == 0) return; KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); KASSERT((base & (len - 1)) == 0, ("pmap_demote_DMAP: base is not a multiple of len")); if (len < NBPDP && base < dmaplimit) { va = PHYS_TO_DMAP(base); changed = FALSE; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDPE"); if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) panic("pmap_demote_DMAP: PDPE failed"); changed = TRUE; } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDE"); if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) panic("pmap_demote_DMAP: PDE failed"); changed = TRUE; } } if (changed && invalidate) pmap_invalidate_page(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; int val; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (pdep != NULL && (*pdep & PG_V)) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { pte = *pmap_pde_to_pte(pdep, addr); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } static uint64_t pmap_pcid_alloc(pmap_t pmap, u_int cpuid) { uint32_t gen, new_gen, pcid_next; CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) return (pti ? 
0 : CR3_PCID_SAVE); if (pmap->pm_pcids[cpuid].pm_gen == gen) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), ("cpu %d pcid_next %#x", cpuid, pcid_next)); if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { new_gen = gen + 1; if (new_gen == 0) new_gen = 1; PCPU_SET(pcid_gen, new_gen); pcid_next = PMAP_PCID_KERN + 1; } else { new_gen = gen; } pmap->pm_pcids[cpuid].pm_pcid = pcid_next; pmap->pm_pcids[cpuid].pm_gen = new_gen; PCPU_SET(pcid_next, pcid_next + 1); return (0); } static uint64_t pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) { uint64_t cached; cached = pmap_pcid_alloc(pmap, cpuid); KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, ("pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, ("non-kernel pmap pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); return (cached); } static void pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) { PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? PCPU_GET(pti_rsp0) : (uintptr_t)td->td_pcb; } static void inline pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1) { struct invpcid_descr d; uint64_t cached, cr3, kcr3, ucr3; cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); PCPU_SET(curpmap, pmap); kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | PMAP_PCID_USER_PT; if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) { /* * Explicitly invalidate translations cached from the * user page table. They are not automatically * flushed by reload of cr3 with the kernel page table * pointer above. * * Note that the if() condition is resolved statically * by using the function argument instead of * runtime-evaluated invpcid_works value. */ if (invpcid_works1) { d.pcid = PMAP_PCID_USER_PT | pmap->pm_pcids[cpuid].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { pmap_pti_pcid_invalidate(ucr3, kcr3); } } PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); if (cached) PCPU_INC(pm_save_cnt); } static void pmap_activate_sw_pcid_invpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { pmap_activate_sw_pcid_pti(pmap, cpuid, true); pmap_activate_sw_pti_post(td, pmap); } static void pmap_activate_sw_pcid_noinvpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { register_t rflags; /* * If the INVPCID instruction is not available, * invltlb_pcid_handler() is used to handle an invalidate_all * IPI, which checks for curpmap == smp_tlb_pmap. The below * sequence of operations has a window where %CR3 is loaded * with the new pmap's PML4 address, but the curpmap value has * not yet been updated. This causes the invltlb IPI handler, * which is called between the updates, to execute as a NOP, * which leaves stale TLB entries. * * Note that the most typical use of pmap_activate_sw(), from * the context switch, is immune to this race, because * interrupts are disabled (while the thread lock is owned), * and the IPI happens after curpmap is updated. Protect * other callers in a similar way, by disabling interrupts * around the %cr3 register reload and curpmap assignment. 
*/ rflags = intr_disable(); pmap_activate_sw_pcid_pti(pmap, cpuid, false); intr_restore(rflags); pmap_activate_sw_pti_post(td, pmap); } static void pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { uint64_t cached, cr3; cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | cached); PCPU_SET(curpmap, pmap); if (cached) PCPU_INC(pm_save_cnt); } static void pmap_activate_sw_pcid_noinvpcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { register_t rflags; rflags = intr_disable(); pmap_activate_sw_pcid_nopti(td, pmap, cpuid); intr_restore(rflags); } static void pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid __unused) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); } static void pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, u_int cpuid __unused) { pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); PCPU_SET(kcr3, pmap->pm_cr3); PCPU_SET(ucr3, pmap->pm_ucr3); pmap_activate_sw_pti_post(td, pmap); } DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, u_int)) { if (pmap_pcid_enabled && pti && invpcid_works) return (pmap_activate_sw_pcid_invpcid_pti); else if (pmap_pcid_enabled && pti && !invpcid_works) return (pmap_activate_sw_pcid_noinvpcid_pti); else if (pmap_pcid_enabled && !pti && invpcid_works) return (pmap_activate_sw_pcid_nopti); else if (pmap_pcid_enabled && !pti && !invpcid_works) return (pmap_activate_sw_pcid_noinvpcid_nopti); else if (!pmap_pcid_enabled && pti) return (pmap_activate_sw_nopcid_pti); else /* if (!pmap_pcid_enabled && !pti) */ return (pmap_activate_sw_nopcid_nopti); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; u_int cpuid; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) return; cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif pmap_activate_sw_mode(td, pmap, cpuid); #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); #endif } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_activate_boot(pmap_t pmap) { uint64_t kcr3; u_int cpuid; /* * kernel_pmap must be never deactivated, and we ensure that * by never activating it at all. */ MPASS(pmap != kernel_pmap); cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); if (pti) { kcr3 = pmap->pm_cr3; if (pmap_pcid_enabled) kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; } else { kcr3 = PMAP_NO_CR3; } PCPU_SET(kcr3, kcr3); PCPU_SET(ucr3, PMAP_NO_CR3); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. 
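* The address is changed only when the mapping is at least 2MB long and a fully aligned 2MB page would still fit; it is moved so that its offset within a 2MB page equals the superpage offset of the backing object (including its color).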
*/ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef INVARIANTS static unsigned long num_dirty_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, &num_dirty_emulations, 0, NULL); static unsigned long num_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, &num_accessed_emulations, 0, NULL); static unsigned long num_superpage_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL); static unsigned long ad_emulation_superpage_promotions; SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL); #endif /* INVARIANTS */ int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; struct rwlock *lock; #if VM_NRESERVLEVEL > 0 vm_page_t m, mpte; #endif pd_entry_t *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); if (!pmap_emulate_ad_bits(pmap)) return (-1); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); rv = -1; lock = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) goto done; if ((*pde & PG_PS) != 0) { if (ftype == VM_PROT_READ) { #ifdef INVARIANTS atomic_add_long(&num_superpage_accessed_emulations, 1); #endif *pde |= PG_A; rv = 0; } goto done; } pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) goto done; if (ftype == VM_PROT_WRITE) { if ((*pte & PG_RW) == 0) goto done; /* * Set the modified and accessed bits simultaneously. * * Intel EPT PTEs that do software emulation of A/D bits map * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. * An EPT misconfiguration is triggered if the PTE is writable * but not readable (WR=10). This is avoided by setting PG_A * and PG_M simultaneously. 
*/ *pte |= PG_M | PG_A; } else { *pte |= PG_A; } #if VM_NRESERVLEVEL > 0 /* try to promote the mapping */ if (va < VM_MAXUSER_ADDRESS) mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); else mpte = NULL; m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_pde(pmap, pde, va, &lock); #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif } #endif #ifdef INVARIANTS if (ftype == VM_PROT_WRITE) atomic_add_long(&num_dirty_emulations, 1); else atomic_add_long(&num_accessed_emulations, 1); #endif rv = 0; /* success */ done: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) { pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; int idx; idx = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pml4 = pmap_pml4e(pmap, va); ptr[idx++] = *pml4; if ((*pml4 & PG_V) == 0) goto done; pdp = pmap_pml4e_to_pdpe(pml4, va); ptr[idx++] = *pdp; if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) goto done; pde = pmap_pdpe_to_pde(pdp, va); ptr[idx++] = *pde; if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) goto done; pte = pmap_pde_to_pte(pde, va); ptr[idx++] = *pte; done: PMAP_UNLOCK(pmap); *num = idx; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; pt_entry_t *pte; int cache_bits, error __unused, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= dmaplimit)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); /* * NB: The sequence of updating a page table followed by accesses * to the corresponding pages used in the !DMAP case is subject to * the situation described in the "AMD64 Architecture Programmer's * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special * Coherency Considerations". Therefore, issuing the INVLPG right * after modifying the PTE bits is crucial. */ if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) { /* * Slow path, since we can get page faults * while mappings are active don't pin the * thread to the CPU and instead add a global * mapping visible to all CPUs. 
*/ pmap_qenter(vaddr[i], &page[i], 1); } else { pte = vtopte(vaddr[i]); cache_bits = pmap_cache_bits(kernel_pmap, page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); invlpg(vaddr[i]); } } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) pmap_qremove(vaddr[i], 1); vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); } } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; paddr = VM_PAGE_TO_PHYS(m); if (paddr < dmaplimit) return (PHYS_TO_DMAP(paddr)); mtx_lock_spin(&qframe_mtx); KASSERT(*vtopte(qframe) == 0, ("qframe busy")); pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); return (qframe); } void pmap_quick_remove_page(vm_offset_t addr) { if (addr != qframe) return; pte_store(vtopte(qframe), 0); invlpg(qframe); mtx_unlock_spin(&qframe_mtx); } /* * Pdp pages from the large map are managed differently from either * kernel or user page table pages. They are permanently allocated at * initialization time, and their wire count is permanently set to * zero. The pml4 entries pointing to those pages are copied into * each allocated pmap. * * In contrast, pd and pt pages are managed like user page table * pages. They are dynamically allocated, and their wire count * represents the number of valid entries within the page. */ static vm_page_t pmap_large_map_getptp_unlocked(void) { vm_page_t m; m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_ZERO); if (m != NULL && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } static vm_page_t pmap_large_map_getptp(void) { vm_page_t m; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); m = pmap_large_map_getptp_unlocked(); if (m == NULL) { PMAP_UNLOCK(kernel_pmap); vm_wait(NULL); PMAP_LOCK(kernel_pmap); /* Callers retry. 
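* The kernel pmap lock was dropped around vm_wait(), so pmap_large_map_pde() and pmap_large_map_pte() re-read the page table entry before retrying the allocation.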
*/ } return (m); } static pdp_entry_t * pmap_large_map_pdpe(vm_offset_t va) { vm_pindex_t pml4_idx; vm_paddr_t mphys; pml4_idx = pmap_pml4e_index(va); KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " "%#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0, ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " "LMSPML4I %#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME; return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); } static pd_entry_t * pmap_large_map_pde(vm_offset_t va) { pdp_entry_t *pdpe; vm_page_t m; vm_paddr_t mphys; retry: pdpe = pmap_large_map_pdpe(va); if (*pdpe == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & PG_FRAME; } return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); } static pt_entry_t * pmap_large_map_pte(vm_offset_t va) { pd_entry_t *pde; vm_page_t m; vm_paddr_t mphys; retry: pde = pmap_large_map_pde(va); if (*pde == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++; } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & PG_FRAME; } return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); } static vm_paddr_t pmap_large_map_kextract(vm_offset_t va) { pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte, pt; KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), ("not largemap range %#lx", (u_long)va)); pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) return ((pd & PG_PS_FRAME) | (va & PDRMASK)); pte = pmap_pde_to_pte(pde, va); pt = *pte; KASSERT((pt & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); return ((pt & PG_FRAME) | (va & PAGE_MASK)); } static int pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, vmem_addr_t *vmem_res) { /* * Large mappings are all but static. Consequently, there * is no point in waiting for an earlier allocation to be * freed. */ return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); } int pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, vm_memattr_t mattr) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; vm_offset_t va, inc; vmem_addr_t vmem_res; vm_paddr_t pa; int error; if (len == 0 || spa + len < spa) return (EINVAL); /* See if DMAP can serve. */ if (spa + len <= dmaplimit) { va = PHYS_TO_DMAP(spa); *addr = (void *)va; return (pmap_change_attr(va, len, mattr)); } /* * No, allocate KVA. Fit the address with best possible * alignment for superpages. Fall back to worse align if * failed. 
*/ error = ENOMEM; if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, NBPDP) >= roundup2(spa, NBPDP) + NBPDP) error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, &vmem_res); if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, NBPDR) + NBPDR) error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, &vmem_res); if (error != 0) error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); if (error != 0) return (error); /* * Fill pagetable. PG_M is not pre-set, we scan modified bits * in the pagetable to minimize flushing. No need to * invalidate TLB, since we only update invalid entries. */ PMAP_LOCK(kernel_pmap); for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, len -= inc) { if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { pdpe = pmap_large_map_pdpe(va); MPASS(*pdpe == 0); *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); inc = NBPDP; } else if (len >= NBPDR && (pa & PDRMASK) == 0 && (va & PDRMASK) == 0) { pde = pmap_large_map_pde(va); MPASS(*pde == 0); *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> wire_count++; inc = NBPDR; } else { pte = pmap_large_map_pte(va); MPASS(*pte == 0); *pte = pa | pg_g | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, FALSE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> wire_count++; inc = PAGE_SIZE; } } PMAP_UNLOCK(kernel_pmap); MPASS(len == 0); *addr = (void *)vmem_res; return (0); } void pmap_large_unmap(void *svaa, vm_size_t len) { vm_offset_t sva, va; vm_size_t inc; pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte; vm_page_t m; struct spglist spgf; sva = (vm_offset_t)svaa; if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) return; SLIST_INIT(&spgf); KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); PMAP_LOCK(kernel_pmap); for (va = sva; va < sva + len; va += inc) { pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT((va & PDPMASK) == 0, ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT(va + NBPDP <= sva + len, ("unmap covers partial 1GB page, sva %#lx va %#lx " "pdpe %#lx pdp %#lx len %#lx", sva, va, (u_long)pdpe, pdp, len)); *pdpe = 0; inc = NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) { KASSERT((va & PDRMASK) == 0, ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); KASSERT(va + NBPDR <= sva + len, ("unmap covers partial 2MB page, sva %#lx va %#lx " "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, pd, len)); pde_store(pde, 0); inc = NBPDR; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->wire_count--; if (m->wire_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } continue; } pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); inc = PAGE_SIZE; m 
= PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); m->wire_count--; if (m->wire_count == 0) { *pde = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->wire_count--; if (m->wire_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } } } pmap_invalidate_range(kernel_pmap, sva, sva + len); PMAP_UNLOCK(kernel_pmap); vm_page_free_pages_toq(&spgf, false); vmem_free(large_vmem, sva, len); } static void pmap_large_map_wb_fence_mfence(void) { mfence(); } static void pmap_large_map_wb_fence_sfence(void) { sfence(); } static void pmap_large_map_wb_fence_nop(void) { } DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) { if (cpu_vendor_id != CPU_VENDOR_INTEL) return (pmap_large_map_wb_fence_mfence); else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | CPUID_STDEXT_CLFLUSHOPT)) == 0) return (pmap_large_map_wb_fence_sfence); else /* clflush is strongly enough ordered */ return (pmap_large_map_wb_fence_nop); } static void pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clwb(va); } static void pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflushopt(va); } static void pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflush(va); } static void pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) { } DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) { if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) return (pmap_large_map_flush_range_clwb); else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) return (pmap_large_map_flush_range_clflushopt); else if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_large_map_flush_range_clflush); else return (pmap_large_map_flush_range_nop); } static void pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) { volatile u_long *pe; u_long p; vm_offset_t va; vm_size_t inc; bool seen_other; for (va = sva; va < eva; va += inc) { inc = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { pe = (volatile u_long *)pmap_large_map_pdpe(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDP; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pde(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDR; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pte(va); p = *pe; inc = PAGE_SIZE; } seen_other = false; for (;;) { if ((p & X86_PG_AVAIL1) != 0) { /* * Spin-wait for the end of a parallel * write-back. */ cpu_spinwait(); p = *pe; /* * If we saw other write-back * occuring, we cannot rely on PG_M to * indicate state of the cache. The * PG_M bit is cleared before the * flush to avoid ignoring new writes, * and writes which are relevant for * us might happen after. */ seen_other = true; continue; } if ((p & X86_PG_M) != 0 || seen_other) { if (!atomic_fcmpset_long(pe, &p, (p & ~X86_PG_M) | X86_PG_AVAIL1)) /* * If we saw PG_M without * PG_AVAIL1, and then on the * next attempt we do not * observe either PG_M or * PG_AVAIL1, the other * write-back started after us * and finished before us. We * can rely on it doing our * work. */ continue; pmap_large_map_flush_range(va, inc); atomic_clear_long(pe, X86_PG_AVAIL1); } break; } maybe_yield(); } } /* * Write-back cache lines for the given address range. 
* * Must be called only on the range or sub-range returned from * pmap_large_map(). Must not be called on the coalesced ranges. * * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH * instructions support. */ void pmap_large_map_wb(void *svap, vm_size_t len) { vm_offset_t eva, sva; sva = (vm_offset_t)svap; eva = sva + len; pmap_large_map_wb_fence(); if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { pmap_large_map_flush_range(sva, len); } else { KASSERT(sva >= LARGEMAP_MIN_ADDRESS && eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); pmap_large_map_wb_large(sva, eva); } pmap_large_map_wb_fence(); } static vm_page_t pmap_pti_alloc_page(void) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); return (m); } static bool pmap_pti_free_page(vm_page_t m) { KASSERT(m->wire_count > 0, ("page %p not wired", m)); if (!vm_page_unwire_noq(m)) return (false); vm_page_free_zero(m); return (true); } static void pmap_pti_init(void) { vm_page_t pml4_pg; pdp_entry_t *pdpe; vm_offset_t va; int i; if (!pti) return; pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); VM_OBJECT_WLOCK(pti_obj); pml4_pg = pmap_pti_alloc_page(); pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { pdpe = pmap_pti_pdpe(va); pmap_pti_wire_pte(pdpe); } pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt + sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + sizeof(struct gate_descriptor) * NIDT, false); pmap_pti_add_kva_locked((vm_offset_t)common_tss, (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false); CPU_FOREACH(i) { /* Doublefault stack IST 1 */ va = common_tss[i].tss_ist1; pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* NMI stack IST 2 */ va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* MC# stack IST 3 */ va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* DB# stack IST 4 */ va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); } pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, (vm_offset_t)etext, true); pti_finalized = true; VM_OBJECT_WUNLOCK(pti_obj); } SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); static pdp_entry_t * pmap_pti_pdpe(vm_offset_t va) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; vm_page_t m; vm_pindex_t pml4_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pml4_idx = pmap_pml4e_index(va); pml4e = &pti_pml4[pml4_idx]; m = NULL; if (*pml4e == 0) { if (pti_finalized) panic("pml4 alloc after finalization\n"); m = pmap_pti_alloc_page(); if (*pml4e != 0) { pmap_pti_free_page(m); mphys = *pml4e & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pml4e = mphys | X86_PG_RW | X86_PG_V; } } else { mphys = *pml4e & ~PAGE_MASK; } pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); return (pdpe); } static void pmap_pti_wire_pte(void *pte) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); m->wire_count++; } static void 
pmap_pti_unwire_pde(void *pde, bool only_ref) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); MPASS(m->wire_count > 0); MPASS(only_ref || m->wire_count > 1); pmap_pti_free_page(m); } static void pmap_pti_unwire_pte(void *pte, vm_offset_t va) { vm_page_t m; pd_entry_t *pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); MPASS(m->wire_count > 0); if (pmap_pti_free_page(m)) { pde = pmap_pti_pde(va); MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); *pde = 0; pmap_pti_unwire_pde(pde, false); } } static pd_entry_t * pmap_pti_pde(vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_page_t m; vm_pindex_t pd_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pdpe = pmap_pti_pdpe(va); if (*pdpe == 0) { m = pmap_pti_alloc_page(); if (*pdpe != 0) { pmap_pti_free_page(m); MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_RW | X86_PG_V; } } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); pd_idx = pmap_pde_index(va); pde += pd_idx; return (pde); } static pt_entry_t * pmap_pti_pte(vm_offset_t va, bool *unwire_pde) { pd_entry_t *pde; pt_entry_t *pte; vm_page_t m; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pde = pmap_pti_pde(va); if (unwire_pde != NULL) { *unwire_pde = true; pmap_pti_wire_pte(pde); } if (*pde == 0) { m = pmap_pti_alloc_page(); if (*pde != 0) { pmap_pti_free_page(m); MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } else { mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_RW | X86_PG_V; if (unwire_pde != NULL) *unwire_pde = false; } } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); pte += pmap_pte_index(va); return (pte); } static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte, ptev; bool unwire_pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); sva = trunc_page(sva); MPASS(sva > VM_MAXUSER_ADDRESS); eva = round_page(eva); MPASS(sva < eva); for (; sva < eva; sva += PAGE_SIZE) { pte = pmap_pti_pte(sva, &unwire_pde); pa = pmap_kextract(sva); ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | (exec ? 
0 : pg_nx) | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); if (*pte == 0) { pte_store(pte, ptev); pmap_pti_wire_pte(pte); } else { KASSERT(!pti_finalized, ("pti overlap after fin %#lx %#lx %#lx", sva, *pte, ptev)); KASSERT(*pte == ptev, ("pti non-identical pte after fin %#lx %#lx %#lx", sva, *pte, ptev)); } if (unwire_pde) { pde = pmap_pti_pde(sva); pmap_pti_unwire_pde(pde, true); } } } void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) { if (!pti) return; VM_OBJECT_WLOCK(pti_obj); pmap_pti_add_kva_locked(sva, eva, exec); VM_OBJECT_WUNLOCK(pti_obj); } void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) { pt_entry_t *pte; vm_offset_t va; if (!pti) return; sva = rounddown2(sva, PAGE_SIZE); MPASS(sva > VM_MAXUSER_ADDRESS); eva = roundup2(eva, PAGE_SIZE); MPASS(sva < eva); VM_OBJECT_WLOCK(pti_obj); for (va = sva; va < eva; va += PAGE_SIZE) { pte = pmap_pti_pte(va, NULL); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); pmap_pti_unwire_pte(pte, va); } pmap_invalidate_range(kernel_pmap, sva, eva); VM_OBJECT_WUNLOCK(pti_obj); } static void * pkru_dup_range(void *ctx __unused, void *data) { struct pmap_pkru_range *node, *new_node; new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (new_node == NULL) return (NULL); node = data; memcpy(new_node, node, sizeof(*node)); return (new_node); } static void pkru_free_range(void *ctx __unused, void *node) { uma_zfree(pmap_pkru_ranges_zone, node); } static int pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { struct pmap_pkru_range *ppr; int error; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if ((flags & AMD64_PKRU_EXCL) != 0 && !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) return (EBUSY); ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (ppr == NULL) return (ENOMEM); ppr->pkru_keyidx = keyidx; ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); if (error != 0) uma_zfree(pmap_pkru_ranges_zone, ppr); return (error); } static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); return (rangeset_remove(&pmap->pm_pkru, sva, eva)); } static void pmap_pkru_deassign_all(pmap_t pmap) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_remove_all(&pmap->pm_pkru); } static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_pkru_range *ppr, *prev_ppr; vm_offset_t va; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || sva >= VM_MAXUSER_ADDRESS) return (true); MPASS(eva <= VM_MAXUSER_ADDRESS); for (va = sva, prev_ppr = NULL; va < eva;) { ppr = rangeset_lookup(&pmap->pm_pkru, va); if ((ppr == NULL) ^ (prev_ppr == NULL)) return (false); if (ppr == NULL) { va += PAGE_SIZE; continue; } if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) return (false); va = ppr->pkru_rs_el.re_end; } return (true); } static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va) { struct pmap_pkru_range *ppr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || va >= VM_MAXUSER_ADDRESS) return (0); ppr = rangeset_lookup(&pmap->pm_pkru, va); if (ppr != NULL) 
return (X86_PG_PKU(ppr->pkru_keyidx)); return (0); } static bool pred_pkru_on_remove(void *ctx __unused, void *r) { struct pmap_pkru_range *ppr; ppr = r; return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); } static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_remove_pred(&pmap->pm_pkru, sva, eva, pred_pkru_on_remove); } } static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) { PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); MPASS(dst_pmap->pm_type == PT_X86); MPASS(src_pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if (src_pmap->pm_pkru.rs_data_ctx == NULL) return (0); return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); } static void pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t newpde, ptpaddr, *pde; pt_entry_t newpte, *ptep, pte; vm_offset_t va, va_next; bool changed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS(keyidx <= PMAP_MAX_PKRU_IDX); for (changed = false, va = sva; va < eva; va = va_next) { pml4e = pmap_pml4e(pmap, va); if ((*pml4e & X86_PG_V) == 0) { va_next = (va + NBPML4) & ~PML4MASK; if (va_next < va) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, va); if ((*pdpe & X86_PG_V) == 0) { va_next = (va + NBPDP) & ~PDPMASK; if (va_next < va) va_next = eva; continue; } va_next = (va + NBPDR) & ~PDRMASK; if (va_next < va) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, va); ptpaddr = *pde; if (ptpaddr == 0) continue; MPASS((ptpaddr & X86_PG_V) != 0); if ((ptpaddr & PG_PS) != 0) { if (va + NBPDR == va_next && eva >= va_next) { newpde = (ptpaddr & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpde != ptpaddr) { *pde = newpde; changed = true; } continue; } else if (!pmap_demote_pde(pmap, pde, va)) { continue; } } if (va_next > eva) va_next = eva; for (ptep = pmap_pde_to_pte(pde, va); va != va_next; ptep++, va += PAGE_SIZE) { pte = *ptep; if ((pte & X86_PG_V) == 0) continue; newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpte != pte) { *ptep = newpte; changed = true; } } } if (changed) pmap_invalidate_range(pmap, sva, eva); } static int pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) return (EINVAL); if (eva <= sva || eva > VM_MAXUSER_ADDRESS) return (EFAULT); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (ENOTSUP); return (0); } int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, keyidx); PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_deassign(pmap, sva, eva); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, 0); 
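/* If the rangeset update failed with ENOMEM, drop the pmap lock, wait for free pages, and retry; any other result ends the loop. */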
PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } /* * Track a range of the kernel's virtual address space that is contiguous * in various mapping attributes. */ struct pmap_kernel_map_range { vm_offset_t sva; pt_entry_t attrs; int ptes; int pdes; int pdpes; }; static void sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t eva) { const char *mode; int i, pat_idx; if (eva <= range->sva) return; pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); for (i = 0; i < PAT_INDEX_SIZE; i++) if (pat_index[i] == pat_idx) break; switch (i) { case PAT_WRITE_BACK: mode = "WB"; break; case PAT_WRITE_THROUGH: mode = "WT"; break; case PAT_UNCACHEABLE: mode = "UC"; break; case PAT_WRITE_PROTECTED: mode = "WP"; break; case PAT_WRITE_COMBINING: mode = "WC"; break; default: - printf("%s: unknown PAT mode %#x for range %#016lx-%#016lx\n", + printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n", __func__, i, range->sva, eva); mode = "??"; break; } - sbuf_printf(sb, "%#016lx-%#016lx r%c%c%c%c %s %d %d %d\n", + sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", range->sva, eva, (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', (range->attrs & pg_nx) != 0 ? '-' : 'x', (range->attrs & X86_PG_U) != 0 ? 'u' : 's', (range->attrs & X86_PG_G) != 0 ? 'g' : '-', mode, range->pdpes, range->pdes, range->ptes); /* Reset to sentinel value. */ range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); } /* * Determine whether the attributes specified by a page table entry match those * being tracked by the current range. This is not quite as simple as a direct * flag comparison since some PAT modes have multiple representations. */ static bool sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) { pt_entry_t diff, mask; mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; diff = (range->attrs ^ attrs) & mask; if (diff == 0) return (true); if ((diff & ~X86_PG_PDE_PAT) == 0 && pmap_pat_index(kernel_pmap, range->attrs, true) == pmap_pat_index(kernel_pmap, attrs, true)) return (true); return (false); } static void sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, pt_entry_t attrs) { memset(range, 0, sizeof(*range)); range->sva = va; range->attrs = attrs; } /* * Given a leaf PTE, derive the mapping's attributes. If they do not match * those of the current run, dump the address range and its attributes, and * begin a new run. */ static void sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, pt_entry_t pte) { pt_entry_t attrs; attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); attrs |= pdpe & pg_nx; attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); if ((pdpe & PG_PS) != 0) { attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); } else if (pde != 0) { attrs |= pde & pg_nx; attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); } if ((pde & PG_PS) != 0) { attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); } else if (pte != 0) { attrs |= pte & pg_nx; attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); /* Canonicalize by always using the PDE PAT bit. 
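 * A 4KB PTE encodes the PAT selector at bit 7 (X86_PG_PTE_PAT) while PDE and PDPE entries encode it at bit 12 (X86_PG_PDE_PAT); the XOR below clears the PTE position and sets the PDE position so that sysctl_kmaps_match() can compare attributes gathered from different levels directly.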
*/ if ((attrs & X86_PG_PTE_PAT) != 0) attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; } if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { sysctl_kmaps_dump(sb, range, va); sysctl_kmaps_reinit(range, va, attrs); } } static int sysctl_kmaps(SYSCTL_HANDLER_ARGS) { struct pmap_kernel_map_range range; struct sbuf sbuf, *sb; pml4_entry_t pml4e; pdp_entry_t *pdp, pdpe; pd_entry_t *pd, pde; pt_entry_t *pt, pte; vm_offset_t sva; vm_paddr_t pa; int error, i, j, k, l; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = &sbuf; sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); /* Sentinel value. */ range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); /* * Iterate over the kernel page tables without holding the kernel pmap * lock. Outside of the large map, kernel page table pages are never * freed, so at worst we will observe inconsistencies in the output. * Within the large map, ensure that PDP and PD page addresses are * valid before descending. */ for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { switch (i) { case PML4PML4I: sbuf_printf(sb, "\nRecursive map:\n"); break; case DMPML4I: sbuf_printf(sb, "\nDirect map:\n"); break; case KPML4BASE: sbuf_printf(sb, "\nKernel map:\n"); break; case LMSPML4I: sbuf_printf(sb, "\nLarge map:\n"); break; } /* Convert to canonical form. */ if (sva == 1ul << 47) sva |= -1ul << 48; restart: pml4e = kernel_pmap->pm_pml4[i]; if ((pml4e & X86_PG_V) == 0) { sva = rounddown2(sva, NBPML4); sysctl_kmaps_dump(sb, &range, sva); sva += NBPML4; continue; } pa = pml4e & PG_FRAME; pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { pdpe = pdp[j]; if ((pdpe & X86_PG_V) == 0) { sva = rounddown2(sva, NBPDP); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDP; continue; } pa = pdpe & PG_FRAME; if (PMAP_ADDRESS_IN_LARGEMAP(sva) && vm_phys_paddr_to_vm_page(pa) == NULL) goto restart; if ((pdpe & PG_PS) != 0) { sva = rounddown2(sva, NBPDP); sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, 0, 0); range.pdpes++; sva += NBPDP; continue; } pd = (pd_entry_t *)PHYS_TO_DMAP(pa); for (k = pmap_pde_index(sva); k < NPDEPG; k++) { pde = pd[k]; if ((pde & X86_PG_V) == 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_dump(sb, &range, sva); sva += NBPDR; continue; } pa = pde & PG_FRAME; if (PMAP_ADDRESS_IN_LARGEMAP(sva) && vm_phys_paddr_to_vm_page(pa) == NULL) goto restart; if ((pde & PG_PS) != 0) { sva = rounddown2(sva, NBPDR); sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, pde, 0); range.pdes++; sva += NBPDR; continue; } pt = (pt_entry_t *)PHYS_TO_DMAP(pa); for (l = pmap_pte_index(sva); l < NPTEPG; l++, sva += PAGE_SIZE) { pte = pt[l]; if ((pte & X86_PG_V) == 0) { sysctl_kmaps_dump(sb, &range, sva); continue; } sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, pde, pte); range.ptes++; } } } } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kmaps, "A", "Dump kernel address layout"); #ifdef DDB DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_offset_t va; if (!have_addr) { db_printf("show pte addr\n"); return; } va = (vm_offset_t)addr; if (kdb_thread != NULL) pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); else pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); pml4 = pmap_pml4e(pmap, va); - db_printf("VA %#016lx pml4e %#016lx", va, *pml4); + db_printf("VA 0x%016lx pml4e 0x%016lx", va, *pml4); if ((*pml4 
& PG_V) == 0) { db_printf("\n"); return; } pdp = pmap_pml4e_to_pdpe(pml4, va); - db_printf(" pdpe %#016lx", *pdp); + db_printf(" pdpe 0x%016lx", *pdp); if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { db_printf("\n"); return; } pde = pmap_pdpe_to_pde(pdp, va); - db_printf(" pde %#016lx", *pde); + db_printf(" pde 0x%016lx", *pde); if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { db_printf("\n"); return; } pte = pmap_pde_to_pte(pde, va); - db_printf(" pte %#016lx\n", *pte); + db_printf(" pte 0x%016lx\n", *pte); } DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) { vm_paddr_t a; if (have_addr) { a = (vm_paddr_t)addr; db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); } else { db_printf("show phys2dmap addr\n"); } } #endif Index: projects/clang900-import/sys/arm64/arm64/pmap.c =================================================================== --- projects/clang900-import/sys/arm64/arm64/pmap.c (revision 352586) +++ projects/clang900-import/sys/arm64/arm64/pmap.c (revision 352587) @@ -1,5941 +1,5951 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * This software was developed by Andrew Turner under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. 
* All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif /* * These are configured by the mair_el1 register. 
This is set up in locore.S */ #define DEVICE_MEMORY 0 #define UNCACHED_MEMORY 1 #define CACHED_MEMORY 2 #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* * The presence of this flag indicates that the mapping is writeable. * If the ATTR_AP_RO bit is also set, then the mapping is clean, otherwise it is * dirty. This flag may only be set on managed mappings. * * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it * as a software managed bit. */ #define ATTR_SW_DBM ATTR_DBM struct pmap kernel_pmap_store; /* Used for mapping ACPI memory before VM is initialized */ #define PMAP_PREINIT_MAPPING_COUNT 32 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ static int vm_initialized = 0; /* No need to use pre-init maps when set */ /* * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. * Always map entire L2 block for simplicity. * VA of L2 block = preinit_map_va + i * L2_SIZE */ static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t size; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; /* * Data for the pv entry allocation mechanism. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) extern pt_entry_t pagetable_dmap[]; #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) static vm_paddr_t physmap[PHYSMAP_SIZE]; static u_int physmap_idx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, "Are large page mappings enabled?"); /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. 
*/ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. 
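 * pmap_load_clear() and pmap_load_store() use an atomic swap so the caller observes exactly the entry that was live at the instant of the update, including any bits a concurrent table walker may have set.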
*/ #define pmap_clear(table) atomic_store_64(table, 0) #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) #define pmap_load(table) (*table) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set_bits(table, bits) atomic_set_64(table, bits) #define pmap_store(table, entry) atomic_store_64(table, entry) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[pmap_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { pd_entry_t *l2; l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { pt_entry_t *l3; l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); return (&l3[pmap_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. 
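 * On success *level is 1 or 2 for a block (superpage) entry and 3 for a 4KB page entry, e.g. pte = pmap_pte(pmap, va, &lvl) followed by a check of lvl, as pmap_extract() does below.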
*/ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == L1_BLOCK) { *level = 1; return (l1); } if (desc != L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == L2_BLOCK) { *level = 2; return (l2); } if (desc != L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) return (NULL); return (l3); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled != 0); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l0p, *l1p, *l2p; if (pmap->pm_l0 == NULL) return (false); l0p = pmap_l0(pmap, va); *l0 = l0p; if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) return (false); l1p = pmap_l0_to_l1(l0p, va); *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { *l2 = NULL; *l3 = NULL; return (true); } if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) return (false); l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { *l3 = NULL; return (true); } if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) return (false); *l3 = pmap_l2_to_l3(l2p, va); return (true); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); } CTASSERT(L1_BLOCK == L2_BLOCK); /* * Checks if the PTE is dirty. */ static inline int pmap_pte_dirty(pt_entry_t pte) { KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); KASSERT((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != 0, ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); return ((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); } static vm_offset_t pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_offset_t freemempos) { pt_entry_t *l2; vm_offset_t va; vm_paddr_t l2_pa, pa; u_int l1_slot, l2_slot, prev_l1_slot; int i; dmap_phys_base = min_pa & ~L1_OFFSET; dmap_phys_max = 0; dmap_max_addr = 0; l2 = NULL; prev_l1_slot = -1; #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES); for (i = 0; i < (physmap_idx * 2); i += 2) { pa = physmap[i] & 
~L2_OFFSET; va = pa - dmap_phys_base + DMAP_MIN_ADDRESS; /* Create L2 mappings at the start of the region */ if ((pa & L1_OFFSET) != 0) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { /* * We are on a boundary, stop to * create a level 1 block */ if ((pa & L1_OFFSET) == 0) break; l2_slot = pmap_l2_index(va); KASSERT(l2_slot != 0, ("...")); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS), ("...")); } for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] && (physmap[i + 1] - pa) >= L1_SIZE; pa += L1_SIZE, va += L1_SIZE) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_store(&pagetable_dmap[l1_slot], (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L1_BLOCK); } /* Create L2 mappings at the end of the region */ if (pa < physmap[i + 1]) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { l2_slot = pmap_l2_index(va); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } } if (pa > dmap_phys_max) { dmap_phys_max = pa; dmap_max_addr = va; } } cpu_tlb_flushID(); return (freemempos); } static vm_offset_t pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) { vm_offset_t l2pt; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(va); l2pt = l2_start; for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); pa = pmap_early_vtophys(l1pt, l2pt); pmap_store(&l1[l1_slot], (pa & ~Ln_TABLE_MASK) | L1_TABLE); l2pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l2_start, 0, l2pt - l2_start); return l2pt; } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; vm_paddr_t pa; pd_entry_t *l2; u_int l2_slot; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pmap_store(&l2[l2_slot], (pa & ~Ln_TABLE_MASK) | L2_TABLE); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return l3pt; } /* * Bootstrap the system enough to run with virtual memory. 
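 * This builds the direct map, creates the kernel L2 tables up to VM_MAX_KERNEL_ADDRESS and the L3 tables for the early devmap, and carves the per-CPU area, the message buffer and the pre-init (early BIOS/ACPI) mapping region out of the memory just past the kernel.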
*/ void pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot; pt_entry_t *l2; vm_offset_t va, freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t start_pa, pa, min_pa; uint64_t kern_delta; int i; kern_delta = KERNBASE - kernstart; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; PMAP_LOCK_INIT(kernel_pmap); /* Assume the address we were loaded to is a valid physical address */ min_pa = KERNBASE - kern_delta; physmap_idx = arm_physmem_avail(physmap, nitems(physmap)); physmap_idx /= 2; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < (physmap_idx * 2); i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; } freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create a direct map region early so we can use it for pa -> va */ freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos); va = KERNBASE; start_pa = pa = KERNBASE - kern_delta; /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); /* Find how many pages we have mapped */ for (; l2_slot < Ln_ENTRIES; l2_slot++) { if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0) break; /* Check locore used L2 blocks */ KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK, ("Invalid bootstrap L2 table")); KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa, ("Incorrect PA in L2 table")); va += L2_SIZE; pa += L2_SIZE; } va = roundup2(va, L1_SIZE); /* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */ freemempos = pmap_bootstrap_l2(l1pt, va, freemempos); /* And the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; /* Reserve some VA space for early BIOS/ACPI mapping */ preinit_map_va = roundup2(freemempos, L2_SIZE); virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE; virtual_avail = roundup2(virtual_avail, L1_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE); kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); arm_physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Are large page mappings enabled? 
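 * When the vm.pmap.superpages_enabled tunable is non-zero, pagesizes[1] is set to L2_SIZE (2MB) so the rest of the VM system knows about the second supported page size.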
*/ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); if (superpages_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = L2_SIZE; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); vm_initialized = 1; } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Invalidate a single TLB entry. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vaae1is, %0 \n" "dsb ish \n" "isb \n" : : "r"(va >> PAGE_SHIFT)); sched_unpin(); } static __inline void pmap_invalidate_range_nopin(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; dsb(ishst); for (addr = sva; addr < eva; addr += PAGE_SIZE) { __asm __volatile( "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT)); } __asm __volatile( "dsb ish \n" "isb \n"); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { sched_pin(); pmap_invalidate_range_nopin(pmap, sva, eva); sched_unpin(); } static __inline void pmap_invalidate_all(pmap_t pmap) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n"); sched_unpin(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; pa = 0; PMAP_LOCK(pmap); /* * Find the block or page map for this virtual address. pmap_pte * will return either a valid block/page entry, or NULL. 
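 * The returned level tells us how many low-order bits of the virtual address to fold back into the physical address: L1 blocks map 1GB, L2 blocks 2MB and L3 pages 4KB.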
*/ pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_extract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_extract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_extract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *pte, tpte; vm_offset_t off; vm_page_t m; int lvl; m = NULL; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); KASSERT(lvl > 0 && lvl <= 3, ("pmap_extract_and_hold: Invalid level %d", lvl)); CTASSERT(L1_BLOCK == L2_BLOCK); KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, tpte & ATTR_DESCR_MASK)); if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) || ((prot & VM_PROT_WRITE) == 0)) { switch(lvl) { case 1: off = va & L1_OFFSET; break; case 2: off = va & L2_OFFSET; break; case 3: default: off = 0; } m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); if (!vm_page_wire_mapped(m)) m = NULL; } } PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pt_entry_t *pte, tpte; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return (DMAP_TO_PHYS(va)); pte = pmap_l1(kernel_pmap, va); if (pte == NULL) return (0); /* * A concurrent pmap_update_entry() will clear the entry's valid bit * but leave the rest of the entry unchanged. Therefore, we treat a * non-zero entry as being valid, and we ignore the valid bit when * determining whether the entry maps a block, page, or table. */ tpte = pmap_load(pte); if (tpte == 0) return (0); if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) return ((tpte & ~ATTR_MASK) | (va & L1_OFFSET)); pte = pmap_l1_to_l2(&tpte, va); tpte = pmap_load(pte); if (tpte == 0) return (0); if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) return ((tpte & ~ATTR_MASK) | (va & L2_OFFSET)); pte = pmap_l2_to_l3(&tpte, va); tpte = pmap_load(pte); if (tpte == 0) return (0); return ((tpte & ~ATTR_MASK) | (va & L3_OFFSET)); } /*************************************************** * Low level mapping routines..... 
***************************************************/ void pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) { pd_entry_t *pde; pt_entry_t *pte, attr; vm_offset_t va; int lvl; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter: Mapping is not page-sized")); attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE; if (mode == DEVICE_MEMORY) attr |= ATTR_XN; va = sva; while (size != 0) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pmap_kenter(sva, size, pa, DEVICE_MEMORY); } /* * Remove a page from the kernel pagetables. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; int lvl; pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); pmap_clear(pte); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); pmap_clear(pte); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. 
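 * (On arm64 the shootdown is performed with broadcast TLBI instructions in pmap_invalidate_range() rather than with an IPI.)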
*/ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pd_entry_t *pde; pt_entry_t *pte, pa; vm_offset_t va; vm_page_t m; int i, lvl; va = sva; for (i = 0; i < count; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_qenter: Invalid level %d", lvl)); m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | ATTR_IDX(m->md.pv_memattr) | L3_PAGE; if (m->md.pv_memattr == DEVICE_MEMORY) pa |= ATTR_XN; pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, pa); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); va = sva; while (count-- > 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); if (pte != NULL) { pmap_clear(pte); } va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ pd_entry_t *l0; l0 = pmap_l0(pmap, va); pmap_clear(l0); } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); } else { /* l3 page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { /* We just released an l3, unhold the matching l2 */ pd_entry_t *l1, tl1; vm_page_t l2pg; l1 = pmap_l1(pmap, va); tl1 = pmap_load(l1); l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ pd_entry_t *l0, tl0; vm_page_t l1pg; l0 = pmap_l0(pmap, va); tl0 = pmap_load(l0); l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. 
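 * Kernel virtual addresses are skipped entirely; kernel page table pages are never freed through this path.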
*/ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); return (pmap_unwire_l3(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l0 = kernel_pmap->pm_l0; pmap->pm_root.rt_root = 0; } int pmap_pinit(pmap_t pmap) { vm_paddr_t l0phys; vm_page_t l0pt; /* * allocate the l0 page */ while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) vm_wait(NULL); l0phys = VM_PAGE_TO_PHYS(l0pt); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys); if ((l0pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l0); pmap->pm_root.rt_root = 0; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, l1pg, l2pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Because of AArch64's weak memory consistency model, we must have a * barrier here to ensure that the stores for zeroing "m", whether by * pmap_zero_page() or an earlier function, are visible before adding * "m" to the page table. Otherwise, a page table walk by another * processor's MMU could see the mapping to "m" and a stale, non-zero * PTE within "m". */ dmb(ishst); /* * Map the pagetable page into the process address space, if * it isn't already there. 
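 * The pindex encodes the level: indices below NUL2E are L3 page table pages, [NUL2E, NUL2E + NUL1E) are L2 pages, and NUL2E + NUL1E and above are L1 pages, which is why the cases below may recurse to allocate the parent table first.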
*/ if (ptepindex >= (NUL2E + NUL1E)) { pd_entry_t *l0; vm_pindex_t l0index; l0index = ptepindex - (NUL2E + NUL1E); l0 = &pmap->pm_l0[l0index]; pmap_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE); } else if (ptepindex >= NUL2E) { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1; pd_entry_t tl0; l1index = ptepindex - NUL2E; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); l1pg->wire_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); l1 = &l1[ptepindex & Ln_ADDR_MASK]; pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); } else { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1, *l2; pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } tl0 = pmap_load(l0); l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; } else { l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); l2pg->wire_count++; } } l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { pd_entry_t *l1; vm_page_t l2pg; vm_pindex_t l2pindex; retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); l2pg->wire_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL && lockp != NULL) goto retry; } return (l2pg); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pde, tpde; #ifdef INVARIANTS pt_entry_t *pte; #endif vm_page_t m; int lvl; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment the hold count, * and activate it. If we get a level 2 pde it will point to a level 3 * table. */ switch (lvl) { case -1: break; case 0: #ifdef INVARIANTS pte = pmap_l0_to_l1(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l0 superpages")); #endif break; case 1: #ifdef INVARIANTS pte = pmap_l1_to_l2(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l1 superpages")); #endif break; case 2: tpde = pmap_load(pde); if (tpde != 0) { m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); m->wire_count++; return (m); } break; default: panic("pmap_alloc_l3: Invalid level %d", lvl); } /* * Here if the pte page isn't mapped, or if it has been deallocated. 
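 * If the allocation failed and sleeping was permitted (lockp != NULL), the page tables may have changed while the pmap lock was dropped, so retry the lookup from the top.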
*/ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0)); vm_page_unwire_noq(m); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l0, *l1, *l2; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l0 = pmap_l0(kernel_pmap, kernel_vm_end); KASSERT(pmap_load(l0) != 0, ("pmap_growkernel: No level 0 kernel entry")); l1 = pmap_l0_to_l1(l0, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); /* See the dmb() in _pmap_alloc_l3(). */ dmb(ishst); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l1, paddr | L1_TABLE); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if (pmap_load(l2) != 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); /* See the dmb() in _pmap_alloc_l3(). */ dmb(ishst); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l2, paddr | L2_TABLE); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. 
***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed, lvl; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pv_chunks_mutex is owned and the * corresponding pmap is locked. 
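 * To avoid a lock order reversal against locked_pmap, a pmap with a higher address is locked unconditionally while a lower-addressed one is only trylocked below.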
*/ if (pmap != next_pmap) { if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va, &lvl); if (lvl != 2) continue; pte = pmap_l2_to_l3(pde, va); tpte = pmap_load(pte); if ((tpte & ATTR_SW_WIRED) != 0) continue; tpte = pmap_load_clear(pte); pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, pmap_load(pde), &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. 
*/ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. 
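 *
 * A caller that is about to instantiate many 4KB PV entries at once
 * (for example, an L2 demotion, which needs Ln_ENTRIES - 1 of them)
 * reserves them here first; pmap_pv_demote_l2() relies on such a
 * reservation and asserts that every chunk it visits has a spare.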
*/ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_demote_l2: va is not 2mpage aligned")); KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_demote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. 
*/ PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = l2e & ~ATTR_MASK; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | L2_TABLE; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. 
The caller must have already invalidated the * mapping (i.e., the "break" in break-before-make). */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l2; vm_offset_t eva, va; vm_page_t m, ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); old_l2 = pmap_load_clear(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); if (old_l2 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if (old_l2 & ATTR_SW_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); pvh = pa_to_pvh(old_l2 & ~ATTR_MASK); pmap_pvh_free(pvh, pmap, sva); eva = sva + L2_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); va < eva; va += PAGE_SIZE, m++) { if (pmap_pte_dirty(old_l2)) vm_page_dirty(m); if (old_l2 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_l2: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l3; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & ATTR_SW_MANAGED) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(old_l3)) vm_page_dirty(m); if (old_l3 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the specified range of addresses from the L3 page table that is * identified by the given L2 entry. 
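 *
 * The range must lie within a single L3 page table; callers such as
 * pmap_remove() and pmap_enter_l2() clip the range to an L2 boundary
 * first, and the KASSERT below enforces this.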
*/ static void pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, vm_offset_t eva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; struct rwlock *new_lock; pt_entry_t *l3, old_l3; vm_offset_t va; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), ("pmap_remove_l3_range: range crosses an L3 page table boundary")); va = eva; for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { if (!pmap_l3_valid(pmap_load(l3))) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } old_l3 = pmap_load_clear(l3); if ((old_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; pmap_resident_count_dec(pmap, 1); if ((old_l3 & ATTR_SW_MANAGED) != 0) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(old_l3)) vm_page_dirty(m); if ((old_l3 & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); if (new_lock != *lockp) { if (*lockp != NULL) { /* * Pending TLB invalidations must be * performed before the PV list lock is * released. Otherwise, a concurrent * pmap_remove_all() on a physical page * could return while a stale TLB entry * still provides access to that page. */ if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } rw_wunlock(*lockp); } *lockp = new_lock; rw_wlock(*lockp); } pmap_pvh_free(&m->md, pmap, sva); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (va == eva) va = sva; if (pmap_unuse_pt(pmap, sva, l2e, free)) { sva += L3_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t l3_paddr; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; l3_paddr = pmap_load(l2); if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (pmap_demote_l2_locked(pmap, l2, sva, &lock) == NULL) continue; l3_paddr = pmap_load(l2); } /* * Weed out invalid mappings. */ if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
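 *
 * va_next was set above to the start of the next 2MB region, so
 * clamping it to eva below yields exactly the portion of the current
 * L3 page table that still lies within the removal range.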
*/ if (va_next > eva) va_next = eva; pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, &lock); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; struct spglist free; int lvl, pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pte = pmap_pte(pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_remove_all: no page table entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pte level %d", lvl)); pmap_demote_l2_locked(pmap, pte, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_remove_all: no page directory entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pde level %d", lvl)); tpde = pmap_load(pde); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load_clear(pte); pmap_invalidate_page(pmap, pv->pv_va); if (tpte & ATTR_SW_WIRED) pmap->pm_stats.wired_count--; if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pmap_pte_dirty(tpte)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_l2: do the things to protect a 2MB page in a pmap */ static void pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, pt_entry_t nbits) { pd_entry_t old_l2; vm_page_t m, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_protect_l2: sva is not 2mpage aligned")); old_l2 = pmap_load(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); /* * Return if the L2 entry already has the desired access restrictions * in place. */ retry: if ((old_l2 & mask) == nbits) return; /* * When a dirty read/write superpage mapping is write protected, * update the dirty field of each of the superpage's constituent 4KB * pages. 
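 *
 * The loop below visits all L2_SIZE / PAGE_SIZE vm_page structures
 * backing the superpage, since any one of them may have been written
 * through this mapping.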
*/ if ((old_l2 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(old_l2)) { m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } if (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) goto retry; /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3p, l3, mask, nbits; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } mask = nbits = 0; if ((prot & VM_PROT_WRITE) == 0) { mask |= ATTR_AP_RW_BIT | ATTR_SW_DBM; nbits |= ATTR_AP(ATTR_AP_RO); } if ((prot & VM_PROT_EXECUTE) == 0) { mask |= ATTR_XN; nbits |= ATTR_XN; } if (mask == 0) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_protect_l2(pmap, l2, sva, mask, nbits); continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) continue; } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_protect: Invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); retry: /* * Go to the next L3 entry if the current one is * invalid or already has the desired access * restrictions in place. (The latter case occurs * frequently. For example, in a "buildworld" * workload, almost 1 out of 4 L3 entries already * have the desired restrictions.) */ if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } /* * When a dirty read/write mapping is write protected, * update the page's dirty field. */ if ((l3 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(l3)) vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits)) goto retry; if (va == va_next) va = sva; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? 
VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Performs a break-before-make update of a pmap entry. This is needed when * either promoting or demoting pages to ensure the TLB doesn't get into an * inconsistent state. */ static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, vm_offset_t va, vm_size_t size) { register_t intr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Ensure we don't get switched out with the page table in an * inconsistent state. We also need to ensure no interrupts fire * as they may make use of an address we are about to invalidate. */ intr = intr_disable(); critical_enter(); /* * Clear the old mapping's valid bit, but leave the rest of the entry * unchanged, so that a lockless, concurrent pmap_kextract() can still * lookup the physical address. */ pmap_clear_bits(pte, ATTR_DESCR_VALID); pmap_invalidate_range_nopin(pmap, va, va + size); /* Create the new mapping */ pmap_store(pte, newpte); dsb(ishst); critical_exit(); intr_restore(intr); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_promote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = va & ~L2_OFFSET; pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single level 2 table entry to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. 
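 *
 * Both conditions are verified by comparing every other PTE against
 * the first one: the loop below walks from the last entry downward,
 * expecting identical attributes and physical addresses that decrease
 * by exactly PAGE_SIZE at each step.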
*/ static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3, newl2, oldl3, pa; vm_page_t mpte; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); sva = va & ~L2_OFFSET; firstl3 = pmap_l2_to_l3(l2, sva); newl2 = pmap_load(firstl3); setl2: if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l2, &newl2, newl2 & ~ATTR_SW_DBM)) goto setl2; newl2 &= ~ATTR_SW_DBM; } pa = newl2 + L2_SIZE - PAGE_SIZE; for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { oldl3 = pmap_load(l3); setl3: if ((oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & ~ATTR_SW_DBM)) goto setl3; oldl3 &= ~ATTR_SW_DBM; } if (oldl3 != pa) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the L2 * mapping the superpage is demoted by pmap_demote_l2() or * destroyed by pmap_remove_l3(). */ mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_l2: page table page is out of range")); KASSERT(mpte->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx in pmap %p", va, pmap); return; } if ((newl2 & ATTR_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); newl2 &= ~ATTR_DESCR_MASK; newl2 |= L2_BLOCK; pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. 
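 *
 * A psind of 1 requests a 2MB mapping: va must be 2MB aligned and
 * m->psind must be nonzero, and the request is handed directly to
 * pmap_enter_l2() below.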
*/ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l2, *l3; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t nosleep; int lvl, rv; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | L3_PAGE); if ((prot & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l3 |= ATTR_XN; if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= ATTR_SW_WIRED; if (va < VM_MAXUSER_ADDRESS) new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN; if ((m->oflags & VPO_UNMANAGED) == 0) { new_l3 |= ATTR_SW_MANAGED; if ((prot & VM_PROT_WRITE) != 0) { new_l3 |= ATTR_SW_DBM; if ((flags & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); } } CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va, &lvl); if (pde != NULL && lvl == 2) { l3 = pmap_l2_to_l3(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } else if (pde != NULL && lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { l3 = &l3[pmap_l3_index(va)]; if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE( pmap_load(l2) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } /* We need to allocate an L3 table. */ } if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; /* * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order * to handle the possibility that a superpage mapping for "va" * was created while we slept. */ mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: missing L3 table for kernel va %#lx", va); havel3: orig_l3 = pmap_load(l3); opa = orig_l3 & ~ATTR_MASK; pv = NULL; /* * Is the specified virtual address already mapped? */ if (pmap_l3_valid(orig_l3)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. 
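 *
 * Before revalidating, PGA_WRITEABLE is set below if this managed
 * mapping is being given write permission, since the existing PV
 * entry is reused as is.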
*/ if ((orig_l3 & ATTR_SW_MANAGED) != 0 && (new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. */ orig_l3 = pmap_load_clear(l3); KASSERT((orig_l3 & ~ATTR_MASK) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & ATTR_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if (pmap_pte_dirty(orig_l3)) vm_page_dirty(om); if ((orig_l3 & ATTR_AF) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); if ((m->oflags & VPO_UNMANAGED) != 0) free_pv_entry(pmap, pv); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: /* * Sync icache if exec permission and attribute VM_MEMATTR_WRITE_BACK * is set. Do it now, before the mapping is stored and made * valid for hardware table walk. If done later, then other can * access this page before caches are properly synced. * Don't do it for kernel memory which is mapped with exec * permission even if the memory isn't going to hold executable * code. The only time when icache sync is needed is after * kernel module is loaded and the relocation info is processed. * And it's done in elf_cpu_load_file(). */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && (opa != pa || (orig_l3 & ATTR_XN))) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); /* * Update the L3 entry */ if (pmap_l3_valid(orig_l3)) { KASSERT(opa == pa, ("pmap_enter: invalid update")); if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { /* same PA, different attributes */ /* XXXMJ need to reload orig_l3 for hardware DBM. */ pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); if ((orig_l3 & ATTR_SW_MANAGED) != 0 && pmap_pte_dirty(orig_l3)) vm_page_dirty(m); } else { /* * orig_l3 == new_l3 * This can happens if multiple threads simultaneously * access not yet mapped page. This bad for performance * since this can cause full demotion-NOP-promotion * cycle. * Another possible reasons are: * - VM and pmap memory layout are diverged * - tlb flush is missing somewhere and CPU doesn't see * actual mapping. */ CTR4(KTR_PMAP, "%s: already mapped page - " "pmap %p va 0x%#lx pte 0x%lx", __func__, pmap, va, new_l3); } } else { /* New mapping */ pmap_store(l3, new_l3); dsb(ishst); } #if VM_NRESERVLEVEL > 0 if ((mpte == NULL || mpte->wire_count == NL3PG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_l2(pmap, pde, va, &lock); } #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. 
Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; PMAP_LOCK_ASSERT(pmap, MA_OWNED); new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L2_BLOCK); if ((m->oflags & VPO_UNMANAGED) == 0) { new_l2 |= ATTR_SW_MANAGED; new_l2 &= ~ATTR_AF; } if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l2 |= ATTR_XN; if (va < VM_MAXUSER_ADDRESS) new_l2 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, old_l2; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((old_l2 = pmap_load(l2)) != 0) { KASSERT(l2pg->wire_count > 1, ("pmap_enter_l2: l2pg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { l2pg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, &free, lockp); vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_l2() and pmap_remove_l3_range() * will leave the kernel page table page zero filled. * Nonetheless, the TLB could have an intermediate * entry for the kernel page table page. */ mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); pmap_clear(l2); pmap_invalidate_page(pmap, va); } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } if ((new_l2 & ATTR_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, l2pg, &free)) { /* * Although "va" is not mapped, the TLB could * nonetheless have intermediate entries that * refer to the freed page table pages. * Invalidate those entries. * * XXX redundant invalidation (See * _pmap_unwire_l3().) 
*/ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & ATTR_SW_DBM) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Map the superpage. */ pmap_store(l2, new_l2); dsb(ishst); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pd_entry_t *pde; pt_entry_t *l2, *l3, l3_val; vm_paddr_t pa; int lvl; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->wire_count++; } else { /* * Get the l2 entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. 
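 *
 * If the lookup stopped at level 1 and the L2 entry is already a
 * block mapping, a superpage covers this address and the quick path
 * simply bails out rather than demoting it.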
*/ if (lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) return (NULL); } if (lvl == 2 && pmap_load(pde) != 0) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter_quick_locked: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } /* * Abort if a mapping already exists. */ if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m); l3_val = pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L3_PAGE; if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) l3_val |= ATTR_XN; else if (va < VM_MAXUSER_ADDRESS) l3_val |= ATTR_PXN; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) == 0) { l3_val |= ATTR_SW_MANAGED; l3_val &= ~ATTR_AF; } /* Sync icache before the mapping is stored to PTE */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); pmap_store(l3, l3_val); dsb(ishst); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. 
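 *
 * ATTR_SW_WIRED is a software-defined PTE bit that the MMU ignores,
 * so clearing it only requires updating the pmap's wired_count
 * statistics.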
*/ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_clear_bits(l2, ATTR_SW_WIRED); pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) panic("pmap_unwire: demotion failed"); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_unwire: Invalid l2 entry after demotion")); if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) continue; if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); /* * ATTR_SW_WIRED must be cleared atomically. Although * the pmap lock synchronizes access to ATTR_SW_WIRED, * the System MMU may write to the entry concurrently. */ pmap_clear_bits(l3, ATTR_SW_WIRED); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. * * Because the executable mappings created by this routine are copied, * it should not have to flush the instruction cache. 
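 *
 * Copying is only attempted when dst_addr equals src_addr.  A 2MB
 * block mapping is copied as a block when the destination L2 entry is
 * free and the range covers a full, aligned superpage; otherwise the
 * block is skipped rather than demoted.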
*/ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; pd_entry_t *l0, *l1, *l2, srcptepaddr; pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_l2pg, dstmpte, srcmpte; if (dst_addr != src_addr) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { l0 = pmap_l0(src_pmap, addr); if (pmap_load(l0) == 0) { va_next = (addr + L0_SIZE) & ~L0_OFFSET; if (va_next < addr) va_next = end_addr; continue; } l1 = pmap_l0_to_l1(l0, addr); if (pmap_load(l1) == 0) { va_next = (addr + L1_SIZE) & ~L1_OFFSET; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + L2_SIZE) & ~L2_OFFSET; if (va_next < addr) va_next = end_addr; l2 = pmap_l1_to_l2(l1, addr); srcptepaddr = pmap_load(l2); if (srcptepaddr == 0) continue; if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { if ((addr & L2_OFFSET) != 0 || addr + L2_SIZE > end_addr) continue; dst_l2pg = pmap_alloc_l2(dst_pmap, addr, NULL); if (dst_l2pg == NULL) break; l2 = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_l2pg)); l2 = &l2[pmap_l2_index(addr)]; if (pmap_load(l2) == 0 && ((srcptepaddr & ATTR_SW_MANAGED) == 0 || pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((srcptepaddr & ATTR_SW_DBM) != 0) nbits |= ATTR_AP_RW_BIT; pmap_store(l2, (srcptepaddr & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, L2_SIZE / PAGE_SIZE); atomic_add_long(&pmap_l2_mappings, 1); } else dst_l2pg->wire_count--; continue; } KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_copy: invalid L2 entry")); srcptepaddr &= ~ATTR_MASK; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_l3_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = pmap_load(src_pte); /* * We only virtual copy managed pages. */ if ((ptetemp & ATTR_SW_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->wire_count++; } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_l3_index(addr)]; if (pmap_load(dst_pte) == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((ptetemp & ATTR_SW_DBM) != 0) nbits |= ATTR_AP_RW_BIT; pmap_store(dst_pte, (ptetemp & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_l3(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not mapped, * the TLB could nonetheless have * intermediate entries that refer * to the freed page table pages. * Invalidate those entries. * * XXX redundant invalidation */ pmap_invalidate_page(dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } /* Have we copied all of the valid mappings? 
*/ if (dstmpte->wire_count >= srcmpte->wire_count) break; } } out: /* * XXX This barrier may not be needed because the destination pmap is * not active. */ dsb(ishst); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. 
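 *
 * Both the page's own 4KB PV list and, for non-fictitious pages, the
 * containing 2MB page's PV list are scanned, so wired superpage
 * mappings are counted as well.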
*/ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, lvl, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. 
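 *
 * Wired mappings are the one exception: they are left in place, and a
 * PV chunk that still describes a wired mapping is kept on the pmap's
 * chunk list instead of being freed.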
*/ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, ml3, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx, lvl; vm_paddr_t pa; lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("Attempting to remove an unmapped page")); switch(lvl) { case 1: pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("Attempting to remove an invalid " "block: %lx", tpte)); tpte = pmap_load(pte); break; case 2: pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("Attempting to remove an invalid " "page: %lx", tpte)); break; default: panic( "Invalid page directory level: %d", lvl); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & ATTR_SW_WIRED) { allfree = 0; continue; } pa = tpte & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); /* * Because this pmap is not active on other * processors, the dirty bit cannot have * changed state since we last loaded pte. */ pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. 
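 *
 * If the entry was dirty, a 2MB block dirties every constituent 4KB
 * page below, while a 4KB mapping dirties only its single page.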
*/ if (pmap_pte_dirty(tpte)) { switch (lvl) { case 1: for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); break; case 2: vm_page_dirty(m); break; } } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; switch (lvl) { case 1: pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); pvh = pa_to_pvh(tpte & ~ATTR_MASK); TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: l3 page not promoted")); pmap_resident_count_dec(pmap,1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_pages: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, &free, FALSE); } break; case 2: pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh( VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } break; } pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * This is used to check if a page has been accessed or modified. As we * don't have a bit to see if it has been modified we have to assume it * has been if the page is read/write. 
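 * Concretely, a mapping is reported as modified when its access permissions are ATTR_AP(ATTR_AP_RW), and as accessed when ATTR_AF is set on a valid L3 page (or L2 block) entry.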
*/ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask, value; pmap_t pmap; int lvl, md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 3, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 2, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L2_BLOCK; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; int lvl; rv = FALSE; PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL && pmap_load(pte) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pt_entry_t oldpte, *pte; vm_offset_t va; int lvl, md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. 
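 * As elsewhere in this file, each pmap lock is acquired under the PV list lock with PMAP_TRYLOCK(); if that fails, the PV list lock is dropped, the pmap lock is taken, and the pv_gen generation counters are rechecked before the scan restarts.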
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; pte = pmap_pte(pmap, pv->pv_va, &lvl); if ((pmap_load(pte) & ATTR_SW_DBM) != 0) (void)pmap_demote_l2_locked(pmap, pte, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); oldpte = pmap_load(pte); retry: if ((oldpte & ATTR_SW_DBM) != 0) { if (!atomic_fcmpset_long(pte, &oldpte, (oldpte | ATTR_AP_RW_BIT) & ~ATTR_SW_DBM)) goto retry; if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; vm_paddr_t pa; int cleared, lvl, md_gen, not_cleared, pvh_gen; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); KASSERT(lvl == 1, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, ("pmap_ts_referenced: found an invalid l1 table")); pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(tpte)) { /* * Although "tpte" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((tpte & ATTR_AF) != 0) { /* * Since this reference bit is shared by 512 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual superpage number, and the pmap * address to select one 4KB page out of the 512 on * which testing the reference bit will result in * clearing that reference bit. This function is * designed to avoid the selection of the same 4KB page * for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); KASSERT(lvl == 2, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_ts_referenced: found an invalid l2 table")); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) { if ((tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. 
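 * Moving the entry just examined to the tail means successive calls tend to start with a different mapping, spreading the clearing of reference bits across all of the page's mappings.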
*/ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; vm_offset_t va, va_next; vm_page_t m; pd_entry_t *l0, *l1, *l2, oldl2; pt_entry_t *l3, oldl3; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); oldl2 = pmap_load(l2); if (oldl2 == 0) continue; if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { if ((oldl2 & ATTR_SW_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The 2MB page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Choosing the last page * within the address range [sva, min(va_next, eva)) * generally results in more repromotions. Since the * underlying page table page is fully populated, this * removal never frees a page table page. */ if ((oldl2 & ATTR_SW_WIRED) == 0) { va = eva; if (va > va_next) va = va_next; va -= PAGE_SIZE; KASSERT(va >= sva, ("pmap_advise: no address gap")); l3 = pmap_l2_to_l3(l2, va); KASSERT(pmap_load(l3) != 0, ("pmap_advise: invalid PTE")); pmap_remove_l3(pmap, l3, va, pmap_load(l2), NULL, &lock); } if (lock != NULL) rw_wunlock(lock); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_advise: invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { oldl3 = pmap_load(l3); if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != (ATTR_SW_MANAGED | L3_PAGE)) goto maybe_invlrng; else if (pmap_pte_dirty(oldl3)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); vm_page_dirty(m); } while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_AF) | ATTR_AP(ATTR_AP_RO))) cpu_spinwait(); } else if ((oldl3 & ATTR_AF) != 0) pmap_clear_bits(l3, ATTR_AF); else goto maybe_invlrng; if (va == va_next) va = sva; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. 
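 * Mappings that are writable and tracked by ATTR_SW_DBM are downgraded to read-only, so a later write access faults and the page can be observed as dirty again; a 2MB mapping is first demoted so that only the 4KB page under test is write-protected.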
*/ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; pt_entry_t *l3, oldl3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have ATTR_SW_DBM * set. If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ if ((oldl2 & ATTR_SW_DBM) != 0 && pmap_demote_l2_locked(pmap, l2, va, &lock) && (oldl2 & ATTR_SW_WIRED) == 0) { /* * Write protect the mapping to a single page so that * a subsequent write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); l3 = pmap_l2_to_l3(l2, va); oldl3 = pmap_load(l3); while (!atomic_fcmpset_long(l3, &oldl3, (oldl3 & ~ATTR_SW_DBM) | ATTR_AP(ATTR_AP_RO))) cpu_spinwait(); vm_page_dirty(m); pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); l3 = pmap_l2_to_l3(l2, pv->pv_va); oldl3 = pmap_load(l3); if (pmap_l3_valid(oldl3) && (oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) { pmap_set_bits(l3, ATTR_AP(ATTR_AP_RO)); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, free_l2_count, start_idx; if (!vm_initialized) { /* * No L3 ptables so map entire L2 blocks where start VA is: * preinit_map_va + start_idx * L2_SIZE * There may be duplicate mappings (multiple VA -> same PA) but * ARM64 dcache is always PIPT so that's acceptable. 
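 * For example (illustrative numbers only): a request whose pa starts 1MB into a 2MB block with size 3MB gives l2_blocks = (roundup2(pa + size, L2_SIZE) - rounddown2(pa, L2_SIZE)) >> L2_SHIFT = 2, and the returned pointer is (va + (pa & L2_OFFSET)), i.e. 1MB into the first mapped block.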
*/ if (size == 0) return (NULL); /* Calculate how many L2 blocks are needed for the mapping */ l2_blocks = (roundup2(pa + size, L2_SIZE) - rounddown2(pa, L2_SIZE)) >> L2_SHIFT; offset = pa & L2_OFFSET; if (preinit_map_va == 0) return (NULL); /* Map 2MiB L2 blocks from reserved VA space */ free_l2_count = 0; start_idx = -1; /* Find enough free contiguous VA space */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (free_l2_count > 0 && ppim->pa != 0) { /* Not enough space here */ free_l2_count = 0; start_idx = -1; continue; } if (ppim->pa == 0) { /* Free L2 block */ if (start_idx == -1) start_idx = i; free_l2_count++; if (free_l2_count == l2_blocks) break; } } if (free_l2_count != l2_blocks) panic("%s: too many preinit mappings", __func__); va = preinit_map_va + (start_idx * L2_SIZE); for (i = start_idx; i < start_idx + l2_blocks; i++) { /* Mark entries as allocated */ ppim = pmap_preinit_mapping + i; ppim->pa = pa; ppim->va = va + offset; ppim->size = size; } /* Map L2 blocks */ pa = rounddown2(pa, L2_SIZE); for (i = 0; i < l2_blocks; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_mapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 1, ("pmap_mapbios: Invalid level %d", lvl)); /* Insert L2_BLOCK */ l2 = pmap_l1_to_l2(pde, va); pmap_load_store(l2, pa | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); va += L2_SIZE; pa += L2_SIZE; } pmap_invalidate_all(kernel_pmap); va = preinit_map_va + (start_idx * L2_SIZE); } else { /* kva_alloc may be used to map the pages */ offset = pa & PAGE_MASK; size = round_page(offset + size); va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); /* L3 table is linked */ va = trunc_page(va); pa = trunc_page(pa); pmap_kenter(va, size, pa, CACHED_MEMORY); } return ((void *)(va + offset)); } void pmap_unmapbios(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset, tmpsize, va_trunc; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, block; bool preinit_map; l2_blocks = (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); /* Remove preinit mapping */ preinit_map = false; block = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va) { KASSERT(ppim->size == size, ("pmap_unmapbios: size mismatch")); ppim->va = 0; ppim->pa = 0; ppim->size = 0; preinit_map = true; offset = block * L2_SIZE; va_trunc = rounddown2(va, L2_SIZE) + offset; /* Remove L2_BLOCK */ pde = pmap_pde(kernel_pmap, va_trunc, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va_trunc)); l2 = pmap_l1_to_l2(pde, va_trunc); pmap_clear(l2); if (block == (l2_blocks - 1)) break; block++; } } if (preinit_map) { pmap_invalidate_all(kernel_pmap); return; } /* Unmap the pages reserved with kva_alloc. */ if (vm_initialized) { offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); /* Unmap and invalidate the pages */ for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kremove(va + tmpsize); kva_free(va, size); } } /* * Sets the memory attribute for the specified page. 
*/ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pv_memattr) != 0) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pt_entry_t l3, *pte, *newpte; int lvl; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); if (!VIRT_IN_DMAP(base)) return (EINVAL); for (tmpva = base; tmpva < base + size; ) { pte = pmap_pte(kernel_pmap, tmpva, &lvl); if (pte == NULL) return (EINVAL); if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) { /* * We already have the correct attribute, * ignore this entry. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; break; case 2: tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; break; case 3: tmpva += PAGE_SIZE; break; } } else { /* * Split the entry to an level 3 table, then * set the new attribute. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: newpte = pmap_demote_l1(kernel_pmap, pte, tmpva & ~L1_OFFSET); if (newpte == NULL) return (EINVAL); pte = pmap_l1_to_l2(pte, tmpva); case 2: newpte = pmap_demote_l2(kernel_pmap, pte, tmpva); if (newpte == NULL) return (EINVAL); pte = pmap_l2_to_l3(pte, tmpva); case 3: /* Update the entry */ l3 = pmap_load(pte); l3 &= ~ATTR_IDX_MASK; l3 |= ATTR_IDX(mode); if (mode == DEVICE_MEMORY) l3 |= ATTR_XN; pmap_update_entry(kernel_pmap, pte, l3, tmpva, PAGE_SIZE); /* * If moving to a non-cacheable entry flush * the cache. */ if (mode == VM_MEMATTR_UNCACHEABLE) cpu_dcache_wbinv_range(tmpva, L3_SIZE); break; } tmpva += PAGE_SIZE; } } return (0); } /* * Create an L2 table to map all addresses within an L1 mapping. 
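 * Each of the Ln_ENTRIES (512) new L2 block entries inherits the old L1 entry's attributes and maps L2_SIZE of the original L1_SIZE range.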
*/ static pt_entry_t * pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) { pt_entry_t *l2, newl2, oldl1; vm_offset_t tmpl1; vm_paddr_t l2phys, phys; vm_page_t ml2; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl1 = pmap_load(l1); KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_demote_l1: Demoting a non-block entry")); KASSERT((va & L1_OFFSET) == 0, ("pmap_demote_l1: Invalid virtual address %#lx", va)); KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, ("pmap_demote_l1: Level 1 table shouldn't be managed")); tmpl1 = 0; if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { tmpl1 = kva_alloc(PAGE_SIZE); if (tmpl1 == 0) return (NULL); } if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" " in pmap %p", va, pmap); return (NULL); } l2phys = VM_PAGE_TO_PHYS(ml2); l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); /* Address the range points at */ phys = oldl1 & ~ATTR_MASK; /* The attributed from the old l1 table to be copied */ newl2 = oldl1 & ATTR_MASK; /* Create the new entries */ for (i = 0; i < Ln_ENTRIES; i++) { l2[i] = newl2 | phys; phys += L2_SIZE; } KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); if (tmpl1 != 0) { pmap_kenter(tmpl1, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY); l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); } pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); if (tmpl1 != 0) { pmap_kremove(tmpl1); kva_free(tmpl1, PAGE_SIZE); } return (l2); } static void pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) { pt_entry_t *l3; for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { *l3 = newl3; newl3 += L3_SIZE; } } static void pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, struct rwlock **lockp) { struct spglist free; SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); } /* * Create an L3 table to map all addresses within an L2 mapping. */ static pt_entry_t * pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *l3, newl3, oldl2; vm_offset_t tmpl2; vm_paddr_t l3phys; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); l3 = NULL; oldl2 = pmap_load(l2); KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_demote_l2: Demoting a non-block entry")); va &= ~L2_OFFSET; tmpl2 = 0; if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { tmpl2 = kva_alloc(PAGE_SIZE); if (tmpl2 == 0) return (NULL); } /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldl2 & ATTR_AF) == 0) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", va, pmap); goto fail; } if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. 
*/ KASSERT(va < VM_MAXUSER_ADDRESS || VIRT_IN_DMAP(va), ("pmap_demote_l2: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (ml3 == NULL) { pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if (va < VM_MAXUSER_ADDRESS) { ml3->wire_count = NL3PG; pmap_resident_count_inc(pmap, 1); } } l3phys = VM_PAGE_TO_PHYS(ml3); l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; KASSERT((oldl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM), ("pmap_demote_l2: L2 entry is writeable but not dirty")); /* * If the page table page is not leftover from an earlier promotion, * or the mapping attributes have changed, (re)initialize the L3 table. * * When pmap_update_entry() clears the old L2 mapping, it (indirectly) * performs a dsb(). That dsb() ensures that the stores for filling * "l3" are visible before "l3" is added to the page table. */ if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) pmap_fill_l3(l3, newl3); /* * Map the temporary page so we don't lose access to the l2 table. */ if (tmpl2 != 0) { pmap_kenter(tmpl2, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY); l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); } /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the L2 and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l2() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Pass PAGE_SIZE so that a single TLB invalidation is performed on * the 2MB page mapping. */ pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); /* * Demote the PV entry. 
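 * pmap_pv_demote_l2() converts the single PV entry for the 2MB mapping into Ln_ENTRIES 4KB PV entries, drawing on the entries reserved by reserve_pv_entries() above.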
*/ if ((oldl2 & ATTR_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" " in pmap %p %lx", va, pmap, l3[0]); fail: if (tmpl2 != 0) { pmap_kremove(tmpl2); kva_free(tmpl2, PAGE_SIZE); } return (l3); } static pt_entry_t * pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { struct rwlock *lock; pt_entry_t *l3; lock = NULL; l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (l3); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt_entry_t *pte, tpte; vm_paddr_t mask, pa; int lvl, val; bool managed; PMAP_LOCK(pmap); retry: val = 0; pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL) { tpte = pmap_load(pte); switch (lvl) { case 3: mask = L3_OFFSET; break; case 2: mask = L2_OFFSET; break; case 1: mask = L1_OFFSET; break; default: panic("pmap_mincore: invalid level %d", lvl); } managed = (tpte & ATTR_SW_MANAGED) != 0; val = MINCORE_INCORE; if (lvl != 3) val |= MINCORE_SUPER; if ((managed && pmap_pte_dirty(tpte)) || (!managed && (tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; pa = (tpte & ~ATTR_MASK) | (addr & mask); } else managed = false; if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0); __asm __volatile( "msr ttbr0_el1, %0 \n" "isb \n" : : "r"(td->td_proc->p_md.md_l0addr)); pmap_invalidate_all(pmap); critical_exit(); } struct pcb * pmap_switch(struct thread *old, struct thread *new) { pcpu_bp_harden bp_harden; struct pcb *pcb; /* Store the new curthread */ PCPU_SET(curthread, new); /* And the new pcb */ pcb = new->td_pcb; PCPU_SET(curpcb, pcb); /* * TODO: We may need to flush the cache here if switching * to a user process. */ if (old == NULL || old->td_proc->p_md.md_l0addr != new->td_proc->p_md.md_l0addr) { __asm __volatile( /* Switch to the new pmap */ "msr ttbr0_el1, %0 \n" "isb \n" /* Invalidate the TLB */ "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n" : : "r"(new->td_proc->p_md.md_l0addr)); /* * Stop userspace from training the branch predictor against * other processes. This will call into a CPU specific * function that clears the branch predictor state. 
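 * bp_harden is a per-CPU function pointer; it is left NULL on CPUs that do not need the workaround, in which case nothing is called.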
*/ bp_harden = PCPU_GET(bp_harden); if (bp_harden != NULL) bp_harden(); } return (pcb); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { if (va >= VM_MIN_KERNEL_ADDRESS) { cpu_icache_sync_range(va, sz); } else { u_int len, offset; vm_paddr_t pa; /* Find the length of data in this page to flush */ offset = va & PAGE_MASK; len = imin(PAGE_SIZE - offset, sz); while (sz != 0) { /* Extract the physical address & find it in the DMAP */ pa = pmap_extract(pmap, va); if (pa != 0) cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); /* Move to the next page */ sz -= len; va += len; /* Set the length for the next iteration */ len = imin(PAGE_SIZE, sz); } } } int pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) { pt_entry_t pte, *ptep; register_t intr; uint64_t ec, par; int lvl, rv; rv = KERN_FAILURE; ec = ESR_ELx_EXCEPTION(esr); switch (ec) { case EXCP_INSN_ABORT_L: case EXCP_INSN_ABORT: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: break; default: return (rv); } /* Data and insn aborts use same encoding for FSC field. */ switch (esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_AFF_L1: case ISS_DATA_DFSC_AFF_L2: case ISS_DATA_DFSC_AFF_L3: PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); if (ptep != NULL) { pmap_set_bits(ptep, ATTR_AF); rv = KERN_SUCCESS; /* * XXXMJ as an optimization we could mark the entry * dirty if this is a write fault. */ } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_PF_L1: case ISS_DATA_DFSC_PF_L2: case ISS_DATA_DFSC_PF_L3: if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || (esr & ISS_DATA_WnR) == 0) return (rv); PMAP_LOCK(pmap); ptep = pmap_pte(pmap, far, &lvl); if (ptep != NULL && ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) { if ((pte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RO)) { pmap_clear_bits(ptep, ATTR_AP_RW_BIT); pmap_invalidate_page(pmap, far); } rv = KERN_SUCCESS; } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: - PMAP_LOCK(pmap); - /* Ask the MMU to check the address */ - intr = intr_disable(); - if (pmap == kernel_pmap) - par = arm64_address_translate_s1e1r(far); - else - par = arm64_address_translate_s1e0r(far); - intr_restore(intr); - PMAP_UNLOCK(pmap); - /* - * If the translation was successful the address was invalid - * due to a break-before-make sequence. We can unlock and - * return success to the trap handler. + * Retry the translation. A break-before-make sequence can + * produce a transient fault. */ - if (PAR_SUCCESS(par)) - rv = KERN_SUCCESS; + if (pmap == kernel_pmap) { + /* + * The translation fault may have occurred within a + * critical section. Therefore, we must check the + * address without acquiring the kernel pmap's lock. + */ + if (pmap_kextract(far) != 0) + rv = KERN_SUCCESS; + } else { + PMAP_LOCK(pmap); + /* Ask the MMU to check the address. */ + intr = intr_disable(); + par = arm64_address_translate_s1e0r(far); + intr_restore(intr); + PMAP_UNLOCK(pmap); + + /* + * If the translation was successful, then we can + * return success to the trap handler. + */ + if (PAR_SUCCESS(par)) + rv = KERN_SUCCESS; + } break; } return (rv); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. 
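 * For example, if the object offset falls 1MB into a 2MB superpage and the mapping is at least 3MB long, the mapping still contains one fully aligned 2MB region, so *addr is adjusted to share that 1MB offset within its own superpage.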
*/ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(!PHYS_IN_DMAP(paddr))) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); } Index: projects/clang900-import/sys/dev/ioat/ioat.c =================================================================== --- projects/clang900-import/sys/dev/ioat/ioat.c (revision 352586) +++ projects/clang900-import/sys/dev/ioat/ioat.c (revision 352587) @@ -1,2211 +1,2218 @@ /*- * Copyright (C) 2012 Intel Corporation * All rights reserved. * Copyright (C) 2018 Alexander Motin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include "ioat.h" #include "ioat_hw.h" #include "ioat_internal.h" #ifndef BUS_SPACE_MAXADDR_40BIT #define BUS_SPACE_MAXADDR_40BIT 0xFFFFFFFFFFULL #endif static int ioat_probe(device_t device); static int ioat_attach(device_t device); static int ioat_detach(device_t device); static int ioat_setup_intr(struct ioat_softc *ioat); static int ioat_teardown_intr(struct ioat_softc *ioat); static int ioat3_attach(device_t device); static int ioat_start_channel(struct ioat_softc *ioat); static int ioat_map_pci_bar(struct ioat_softc *ioat); static void ioat_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error); static void ioat_interrupt_handler(void *arg); static boolean_t ioat_model_resets_msix(struct ioat_softc *ioat); static int chanerr_to_errno(uint32_t); static void ioat_process_events(struct ioat_softc *ioat, boolean_t intr); static inline uint32_t ioat_get_active(struct ioat_softc *ioat); static inline uint32_t ioat_get_ring_space(struct ioat_softc *ioat); static void ioat_free_ring(struct ioat_softc *, uint32_t size, struct ioat_descriptor *); static int ioat_reserve_space(struct ioat_softc *, uint32_t, int mflags); static union ioat_hw_descriptor *ioat_get_descriptor(struct ioat_softc *, uint32_t index); static struct ioat_descriptor *ioat_get_ring_entry(struct ioat_softc *, uint32_t index); static void ioat_halted_debug(struct ioat_softc *, uint32_t); static void ioat_poll_timer_callback(void *arg); static void dump_descriptor(void *hw_desc); static void ioat_submit_single(struct ioat_softc *ioat); static void ioat_comp_update_map(void *arg, bus_dma_segment_t *seg, int nseg, int error); static int ioat_reset_hw(struct ioat_softc *ioat); static void ioat_reset_hw_task(void *, int); static void ioat_setup_sysctl(device_t device); static int sysctl_handle_reset(SYSCTL_HANDLER_ARGS); static void ioat_get(struct ioat_softc *); static void ioat_put(struct ioat_softc *); static void ioat_drain_locked(struct ioat_softc *); #define ioat_log_message(v, ...) 
do { \ if ((v) <= g_ioat_debug_level) { \ device_printf(ioat->device, __VA_ARGS__); \ } \ } while (0) MALLOC_DEFINE(M_IOAT, "ioat", "ioat driver memory allocations"); SYSCTL_NODE(_hw, OID_AUTO, ioat, CTLFLAG_RD, 0, "ioat node"); static int g_force_legacy_interrupts; SYSCTL_INT(_hw_ioat, OID_AUTO, force_legacy_interrupts, CTLFLAG_RDTUN, &g_force_legacy_interrupts, 0, "Set to non-zero to force MSI-X disabled"); int g_ioat_debug_level = 0; SYSCTL_INT(_hw_ioat, OID_AUTO, debug_level, CTLFLAG_RWTUN, &g_ioat_debug_level, 0, "Set log level (0-3) for ioat(4). Higher is more verbose."); unsigned g_ioat_ring_order = 13; SYSCTL_UINT(_hw_ioat, OID_AUTO, ring_order, CTLFLAG_RDTUN, &g_ioat_ring_order, 0, "Set IOAT ring order. (1 << this) == ring size."); /* * OS <-> Driver interface structures */ static device_method_t ioat_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ioat_probe), DEVMETHOD(device_attach, ioat_attach), DEVMETHOD(device_detach, ioat_detach), DEVMETHOD_END }; static driver_t ioat_pci_driver = { "ioat", ioat_pci_methods, sizeof(struct ioat_softc), }; static devclass_t ioat_devclass; DRIVER_MODULE(ioat, pci, ioat_pci_driver, ioat_devclass, 0, 0); MODULE_VERSION(ioat, 1); /* * Private data structures */ static struct ioat_softc *ioat_channel[IOAT_MAX_CHANNELS]; static unsigned ioat_channel_index = 0; SYSCTL_UINT(_hw_ioat, OID_AUTO, channels, CTLFLAG_RD, &ioat_channel_index, 0, "Number of IOAT channels attached"); static struct mtx ioat_list_mtx; MTX_SYSINIT(ioat_list_mtx, &ioat_list_mtx, "ioat list mtx", MTX_DEF); static struct _pcsid { u_int32_t type; const char *desc; } pci_ids[] = { { 0x34308086, "TBG IOAT Ch0" }, { 0x34318086, "TBG IOAT Ch1" }, { 0x34328086, "TBG IOAT Ch2" }, { 0x34338086, "TBG IOAT Ch3" }, { 0x34298086, "TBG IOAT Ch4" }, { 0x342a8086, "TBG IOAT Ch5" }, { 0x342b8086, "TBG IOAT Ch6" }, { 0x342c8086, "TBG IOAT Ch7" }, { 0x37108086, "JSF IOAT Ch0" }, { 0x37118086, "JSF IOAT Ch1" }, { 0x37128086, "JSF IOAT Ch2" }, { 0x37138086, "JSF IOAT Ch3" }, { 0x37148086, "JSF IOAT Ch4" }, { 0x37158086, "JSF IOAT Ch5" }, { 0x37168086, "JSF IOAT Ch6" }, { 0x37178086, "JSF IOAT Ch7" }, { 0x37188086, "JSF IOAT Ch0 (RAID)" }, { 0x37198086, "JSF IOAT Ch1 (RAID)" }, { 0x3c208086, "SNB IOAT Ch0" }, { 0x3c218086, "SNB IOAT Ch1" }, { 0x3c228086, "SNB IOAT Ch2" }, { 0x3c238086, "SNB IOAT Ch3" }, { 0x3c248086, "SNB IOAT Ch4" }, { 0x3c258086, "SNB IOAT Ch5" }, { 0x3c268086, "SNB IOAT Ch6" }, { 0x3c278086, "SNB IOAT Ch7" }, { 0x3c2e8086, "SNB IOAT Ch0 (RAID)" }, { 0x3c2f8086, "SNB IOAT Ch1 (RAID)" }, { 0x0e208086, "IVB IOAT Ch0" }, { 0x0e218086, "IVB IOAT Ch1" }, { 0x0e228086, "IVB IOAT Ch2" }, { 0x0e238086, "IVB IOAT Ch3" }, { 0x0e248086, "IVB IOAT Ch4" }, { 0x0e258086, "IVB IOAT Ch5" }, { 0x0e268086, "IVB IOAT Ch6" }, { 0x0e278086, "IVB IOAT Ch7" }, { 0x0e2e8086, "IVB IOAT Ch0 (RAID)" }, { 0x0e2f8086, "IVB IOAT Ch1 (RAID)" }, { 0x2f208086, "HSW IOAT Ch0" }, { 0x2f218086, "HSW IOAT Ch1" }, { 0x2f228086, "HSW IOAT Ch2" }, { 0x2f238086, "HSW IOAT Ch3" }, { 0x2f248086, "HSW IOAT Ch4" }, { 0x2f258086, "HSW IOAT Ch5" }, { 0x2f268086, "HSW IOAT Ch6" }, { 0x2f278086, "HSW IOAT Ch7" }, { 0x2f2e8086, "HSW IOAT Ch0 (RAID)" }, { 0x2f2f8086, "HSW IOAT Ch1 (RAID)" }, { 0x0c508086, "BWD IOAT Ch0" }, { 0x0c518086, "BWD IOAT Ch1" }, { 0x0c528086, "BWD IOAT Ch2" }, { 0x0c538086, "BWD IOAT Ch3" }, { 0x6f508086, "BDXDE IOAT Ch0" }, { 0x6f518086, "BDXDE IOAT Ch1" }, { 0x6f528086, "BDXDE IOAT Ch2" }, { 0x6f538086, "BDXDE IOAT Ch3" }, { 0x6f208086, "BDX IOAT Ch0" }, { 0x6f218086, "BDX IOAT Ch1" }, { 
0x6f228086, "BDX IOAT Ch2" }, { 0x6f238086, "BDX IOAT Ch3" }, { 0x6f248086, "BDX IOAT Ch4" }, { 0x6f258086, "BDX IOAT Ch5" }, { 0x6f268086, "BDX IOAT Ch6" }, { 0x6f278086, "BDX IOAT Ch7" }, { 0x6f2e8086, "BDX IOAT Ch0 (RAID)" }, { 0x6f2f8086, "BDX IOAT Ch1 (RAID)" }, { 0x20218086, "SKX IOAT" }, }; MODULE_PNP_INFO("W32:vendor/device;D:#", pci, ioat, pci_ids, nitems(pci_ids)); /* * OS <-> Driver linkage functions */ static int ioat_probe(device_t device) { struct _pcsid *ep; u_int32_t type; type = pci_get_devid(device); for (ep = pci_ids; ep < &pci_ids[nitems(pci_ids)]; ep++) { if (ep->type == type) { device_set_desc(device, ep->desc); return (0); } } return (ENXIO); } static int ioat_attach(device_t device) { struct ioat_softc *ioat; int error, i; ioat = DEVICE2SOFTC(device); ioat->device = device; + if (bus_get_domain(device, &ioat->domain) != 0) + ioat->domain = 0; + ioat->cpu = CPU_FFS(&cpuset_domain[ioat->domain]) - 1; + if (ioat->cpu < 0) + ioat->cpu = CPU_FIRST(); error = ioat_map_pci_bar(ioat); if (error != 0) goto err; ioat->version = ioat_read_cbver(ioat); if (ioat->version < IOAT_VER_3_0) { error = ENODEV; goto err; } error = ioat3_attach(device); if (error != 0) goto err; error = pci_enable_busmaster(device); if (error != 0) goto err; error = ioat_setup_intr(ioat); if (error != 0) goto err; error = ioat_reset_hw(ioat); if (error != 0) goto err; ioat_process_events(ioat, FALSE); ioat_setup_sysctl(device); mtx_lock(&ioat_list_mtx); for (i = 0; i < IOAT_MAX_CHANNELS; i++) { if (ioat_channel[i] == NULL) break; } if (i >= IOAT_MAX_CHANNELS) { mtx_unlock(&ioat_list_mtx); device_printf(device, "Too many I/OAT devices in system\n"); error = ENXIO; goto err; } ioat->chan_idx = i; ioat_channel[i] = ioat; if (i >= ioat_channel_index) ioat_channel_index = i + 1; mtx_unlock(&ioat_list_mtx); ioat_test_attach(); err: if (error != 0) ioat_detach(device); return (error); } static inline int ioat_bus_dmamap_destroy(struct ioat_softc *ioat, const char *func, bus_dma_tag_t dmat, bus_dmamap_t map) { int error; error = bus_dmamap_destroy(dmat, map); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_destroy failed %d\n", func, error); } return (error); } static int ioat_detach(device_t device) { struct ioat_softc *ioat; int i, error; ioat = DEVICE2SOFTC(device); mtx_lock(&ioat_list_mtx); ioat_channel[ioat->chan_idx] = NULL; while (ioat_channel_index > 0 && ioat_channel[ioat_channel_index - 1] == NULL) ioat_channel_index--; mtx_unlock(&ioat_list_mtx); ioat_test_detach(); taskqueue_drain(taskqueue_thread, &ioat->reset_task); mtx_lock(&ioat->submit_lock); ioat->quiescing = TRUE; ioat->destroying = TRUE; wakeup(&ioat->quiescing); wakeup(&ioat->resetting); ioat_drain_locked(ioat); mtx_unlock(&ioat->submit_lock); mtx_lock(&ioat->cleanup_lock); while (ioat_get_active(ioat) > 0) msleep(&ioat->tail, &ioat->cleanup_lock, 0, "ioat_drain", 1); mtx_unlock(&ioat->cleanup_lock); ioat_teardown_intr(ioat); callout_drain(&ioat->poll_timer); pci_disable_busmaster(device); if (ioat->pci_resource != NULL) bus_release_resource(device, SYS_RES_MEMORY, ioat->pci_resource_id, ioat->pci_resource); if (ioat->data_tag != NULL) { for (i = 0; i < 1 << ioat->ring_size_order; i++) { error = ioat_bus_dmamap_destroy(ioat, __func__, ioat->data_tag, ioat->ring[i].src_dmamap); if (error != 0) return (error); } for (i = 0; i < 1 << ioat->ring_size_order; i++) { error = ioat_bus_dmamap_destroy(ioat, __func__, ioat->data_tag, ioat->ring[i].dst_dmamap); if (error != 0) return (error); } for (i = 0; i < 1 << ioat->ring_size_order; i++) { 
error = ioat_bus_dmamap_destroy(ioat, __func__, ioat->data_tag, ioat->ring[i].src2_dmamap); if (error != 0) return (error); } for (i = 0; i < 1 << ioat->ring_size_order; i++) { error = ioat_bus_dmamap_destroy(ioat, __func__, ioat->data_tag, ioat->ring[i].dst2_dmamap); if (error != 0) return (error); } bus_dma_tag_destroy(ioat->data_tag); } if (ioat->data_crc_tag != NULL) { for (i = 0; i < 1 << ioat->ring_size_order; i++) { error = ioat_bus_dmamap_destroy(ioat, __func__, ioat->data_crc_tag, ioat->ring[i].crc_dmamap); if (error != 0) return (error); } bus_dma_tag_destroy(ioat->data_crc_tag); } if (ioat->ring != NULL) ioat_free_ring(ioat, 1 << ioat->ring_size_order, ioat->ring); if (ioat->comp_update != NULL) { bus_dmamap_unload(ioat->comp_update_tag, ioat->comp_update_map); bus_dmamem_free(ioat->comp_update_tag, ioat->comp_update, ioat->comp_update_map); bus_dma_tag_destroy(ioat->comp_update_tag); } if (ioat->hw_desc_ring != NULL) { bus_dmamap_unload(ioat->hw_desc_tag, ioat->hw_desc_map); bus_dmamem_free(ioat->hw_desc_tag, ioat->hw_desc_ring, ioat->hw_desc_map); bus_dma_tag_destroy(ioat->hw_desc_tag); } return (0); } static int ioat_teardown_intr(struct ioat_softc *ioat) { if (ioat->tag != NULL) bus_teardown_intr(ioat->device, ioat->res, ioat->tag); if (ioat->res != NULL) bus_release_resource(ioat->device, SYS_RES_IRQ, rman_get_rid(ioat->res), ioat->res); pci_release_msi(ioat->device); return (0); } static int ioat_start_channel(struct ioat_softc *ioat) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct bus_dmadesc *dmadesc; uint64_t status; uint32_t chanerr; int i; ioat_acquire(&ioat->dmaengine); /* Submit 'NULL' operation manually to avoid quiescing flag */ desc = ioat_get_ring_entry(ioat, ioat->head); hw_desc = &ioat_get_descriptor(ioat, ioat->head)->dma; dmadesc = &desc->bus_dmadesc; dmadesc->callback_fn = NULL; dmadesc->callback_arg = NULL; hw_desc->u.control_raw = 0; hw_desc->u.control_generic.op = IOAT_OP_COPY; hw_desc->u.control_generic.completion_update = 1; hw_desc->size = 8; hw_desc->src_addr = 0; hw_desc->dest_addr = 0; hw_desc->u.control.null = 1; ioat_submit_single(ioat); ioat_release(&ioat->dmaengine); for (i = 0; i < 100; i++) { DELAY(1); status = ioat_get_chansts(ioat); if (is_ioat_idle(status)) return (0); } chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); ioat_log_message(0, "could not start channel: " "status = %#jx error = %b\n", (uintmax_t)status, (int)chanerr, IOAT_CHANERR_STR); return (ENXIO); } /* * Initialize Hardware */ static int ioat3_attach(device_t device) { struct ioat_softc *ioat; struct ioat_descriptor *ring; struct ioat_dma_hw_descriptor *dma_hw_desc; void *hw_desc; size_t ringsz; int i, num_descriptors; int error; uint8_t xfercap; error = 0; ioat = DEVICE2SOFTC(device); ioat->capabilities = ioat_read_dmacapability(ioat); ioat_log_message(0, "Capabilities: %b\n", (int)ioat->capabilities, IOAT_DMACAP_STR); xfercap = ioat_read_xfercap(ioat); ioat->max_xfer_size = 1 << xfercap; ioat->intrdelay_supported = (ioat_read_2(ioat, IOAT_INTRDELAY_OFFSET) & IOAT_INTRDELAY_SUPPORTED) != 0; if (ioat->intrdelay_supported) ioat->intrdelay_max = IOAT_INTRDELAY_US_MASK; /* TODO: need to check DCA here if we ever do XOR/PQ */ mtx_init(&ioat->submit_lock, "ioat_submit", NULL, MTX_DEF); mtx_init(&ioat->cleanup_lock, "ioat_cleanup", NULL, MTX_DEF); callout_init(&ioat->poll_timer, 1); TASK_INIT(&ioat->reset_task, 0, ioat_reset_hw_task, ioat); /* Establish lock order for Witness */ mtx_lock(&ioat->cleanup_lock); mtx_lock(&ioat->submit_lock); 
mtx_unlock(&ioat->submit_lock); mtx_unlock(&ioat->cleanup_lock); ioat->is_submitter_processing = FALSE; bus_dma_tag_create(bus_get_dma_tag(ioat->device), sizeof(uint64_t), 0x0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, sizeof(uint64_t), 1, sizeof(uint64_t), 0, NULL, NULL, &ioat->comp_update_tag); error = bus_dmamem_alloc(ioat->comp_update_tag, (void **)&ioat->comp_update, BUS_DMA_ZERO, &ioat->comp_update_map); if (ioat->comp_update == NULL) return (ENOMEM); error = bus_dmamap_load(ioat->comp_update_tag, ioat->comp_update_map, ioat->comp_update, sizeof(uint64_t), ioat_comp_update_map, ioat, 0); if (error != 0) return (error); ioat->ring_size_order = g_ioat_ring_order; num_descriptors = 1 << ioat->ring_size_order; ringsz = sizeof(struct ioat_dma_hw_descriptor) * num_descriptors; error = bus_dma_tag_create(bus_get_dma_tag(ioat->device), 2 * 1024 * 1024, 0x0, (bus_addr_t)BUS_SPACE_MAXADDR_40BIT, BUS_SPACE_MAXADDR, NULL, NULL, ringsz, 1, ringsz, 0, NULL, NULL, &ioat->hw_desc_tag); if (error != 0) return (error); error = bus_dmamem_alloc(ioat->hw_desc_tag, &hw_desc, BUS_DMA_ZERO | BUS_DMA_WAITOK, &ioat->hw_desc_map); if (error != 0) return (error); error = bus_dmamap_load(ioat->hw_desc_tag, ioat->hw_desc_map, hw_desc, ringsz, ioat_dmamap_cb, &ioat->hw_desc_bus_addr, BUS_DMA_WAITOK); if (error) return (error); ioat->hw_desc_ring = hw_desc; error = bus_dma_tag_create(bus_get_dma_tag(ioat->device), 1, 0, BUS_SPACE_MAXADDR_40BIT, BUS_SPACE_MAXADDR, NULL, NULL, ioat->max_xfer_size, 1, ioat->max_xfer_size, 0, NULL, NULL, &ioat->data_crc_tag); if (error != 0) { ioat_log_message(0, "%s: bus_dma_tag_create failed %d\n", __func__, error); return (error); } error = bus_dma_tag_create(bus_get_dma_tag(ioat->device), 1, 0, BUS_SPACE_MAXADDR_48BIT, BUS_SPACE_MAXADDR, NULL, NULL, ioat->max_xfer_size, 1, ioat->max_xfer_size, 0, NULL, NULL, &ioat->data_tag); if (error != 0) { ioat_log_message(0, "%s: bus_dma_tag_create failed %d\n", __func__, error); return (error); } - ioat->ring = malloc(num_descriptors * sizeof(*ring), M_IOAT, - M_ZERO | M_WAITOK); + ioat->ring = malloc_domainset(num_descriptors * sizeof(*ring), M_IOAT, + DOMAINSET_PREF(ioat->domain), M_ZERO | M_WAITOK); ring = ioat->ring; for (i = 0; i < num_descriptors; i++) { memset(&ring[i].bus_dmadesc, 0, sizeof(ring[i].bus_dmadesc)); ring[i].id = i; error = bus_dmamap_create(ioat->data_tag, 0, &ring[i].src_dmamap); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_create failed %d\n", __func__, error); return (error); } error = bus_dmamap_create(ioat->data_tag, 0, &ring[i].dst_dmamap); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_create failed %d\n", __func__, error); return (error); } error = bus_dmamap_create(ioat->data_tag, 0, &ring[i].src2_dmamap); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_create failed %d\n", __func__, error); return (error); } error = bus_dmamap_create(ioat->data_tag, 0, &ring[i].dst2_dmamap); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_create failed %d\n", __func__, error); return (error); } error = bus_dmamap_create(ioat->data_crc_tag, 0, &ring[i].crc_dmamap); if (error != 0) { ioat_log_message(0, "%s: bus_dmamap_create failed %d\n", __func__, error); return (error); } } for (i = 0; i < num_descriptors; i++) { dma_hw_desc = &ioat->hw_desc_ring[i].dma; dma_hw_desc->next = RING_PHYS_ADDR(ioat, i + 1); } ioat->head = 0; ioat->tail = 0; ioat->last_seen = 0; *ioat->comp_update = 0; return (0); } static int ioat_map_pci_bar(struct ioat_softc *ioat) { ioat->pci_resource_id = PCIR_BAR(0); 
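	/* The channel's MMIO registers are exposed through memory BAR 0. */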
ioat->pci_resource = bus_alloc_resource_any(ioat->device, SYS_RES_MEMORY, &ioat->pci_resource_id, RF_ACTIVE); if (ioat->pci_resource == NULL) { ioat_log_message(0, "unable to allocate pci resource\n"); return (ENODEV); } ioat->pci_bus_tag = rman_get_bustag(ioat->pci_resource); ioat->pci_bus_handle = rman_get_bushandle(ioat->pci_resource); return (0); } static void ioat_comp_update_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) { struct ioat_softc *ioat = arg; KASSERT(error == 0, ("%s: error:%d", __func__, error)); ioat->comp_update_bus_addr = seg[0].ds_addr; } static void ioat_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *baddr; KASSERT(error == 0, ("%s: error:%d", __func__, error)); baddr = arg; *baddr = segs->ds_addr; } /* * Interrupt setup and handlers */ static int ioat_setup_intr(struct ioat_softc *ioat) { uint32_t num_vectors; int error; boolean_t use_msix; boolean_t force_legacy_interrupts; use_msix = FALSE; force_legacy_interrupts = FALSE; if (!g_force_legacy_interrupts && pci_msix_count(ioat->device) >= 1) { num_vectors = 1; pci_alloc_msix(ioat->device, &num_vectors); if (num_vectors == 1) use_msix = TRUE; } if (use_msix) { ioat->rid = 1; ioat->res = bus_alloc_resource_any(ioat->device, SYS_RES_IRQ, &ioat->rid, RF_ACTIVE); } else { ioat->rid = 0; ioat->res = bus_alloc_resource_any(ioat->device, SYS_RES_IRQ, &ioat->rid, RF_SHAREABLE | RF_ACTIVE); } if (ioat->res == NULL) { ioat_log_message(0, "bus_alloc_resource failed\n"); return (ENOMEM); } ioat->tag = NULL; error = bus_setup_intr(ioat->device, ioat->res, INTR_MPSAFE | INTR_TYPE_MISC, NULL, ioat_interrupt_handler, ioat, &ioat->tag); if (error != 0) { ioat_log_message(0, "bus_setup_intr failed\n"); return (error); } ioat_write_intrctrl(ioat, IOAT_INTRCTRL_MASTER_INT_EN); return (0); } static boolean_t ioat_model_resets_msix(struct ioat_softc *ioat) { u_int32_t pciid; pciid = pci_get_devid(ioat->device); switch (pciid) { /* BWD: */ case 0x0c508086: case 0x0c518086: case 0x0c528086: case 0x0c538086: /* BDXDE: */ case 0x6f508086: case 0x6f518086: case 0x6f528086: case 0x6f538086: return (TRUE); } return (FALSE); } static void ioat_interrupt_handler(void *arg) { struct ioat_softc *ioat = arg; ioat->stats.interrupts++; ioat_process_events(ioat, TRUE); } static int chanerr_to_errno(uint32_t chanerr) { if (chanerr == 0) return (0); if ((chanerr & (IOAT_CHANERR_XSADDERR | IOAT_CHANERR_XDADDERR)) != 0) return (EFAULT); if ((chanerr & (IOAT_CHANERR_RDERR | IOAT_CHANERR_WDERR)) != 0) return (EIO); /* This one is probably our fault: */ if ((chanerr & IOAT_CHANERR_NDADDERR) != 0) return (EIO); return (EIO); } static void ioat_process_events(struct ioat_softc *ioat, boolean_t intr) { struct ioat_descriptor *desc; struct bus_dmadesc *dmadesc; uint64_t comp_update, status; uint32_t completed, chanerr; int error; mtx_lock(&ioat->cleanup_lock); /* * Don't run while the hardware is being reset. Reset is responsible * for blocking new work and draining & completing existing work, so * there is nothing to do until new work is queued after reset anyway. 
*/ if (ioat->resetting_cleanup) { mtx_unlock(&ioat->cleanup_lock); return; } completed = 0; comp_update = *ioat->comp_update; status = comp_update & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_MASK; if (status < ioat->hw_desc_bus_addr || status >= ioat->hw_desc_bus_addr + (1 << ioat->ring_size_order) * sizeof(struct ioat_generic_hw_descriptor)) panic("Bogus completion address %jx (channel %u)", (uintmax_t)status, ioat->chan_idx); if (status == ioat->last_seen) { /* * If we landed in process_events and nothing has been * completed, check for a timeout due to channel halt. */ goto out; } CTR4(KTR_IOAT, "%s channel=%u hw_status=0x%lx last_seen=0x%lx", __func__, ioat->chan_idx, comp_update, ioat->last_seen); while (RING_PHYS_ADDR(ioat, ioat->tail - 1) != status) { desc = ioat_get_ring_entry(ioat, ioat->tail); dmadesc = &desc->bus_dmadesc; CTR5(KTR_IOAT, "channel=%u completing desc idx %u (%p) ok cb %p(%p)", ioat->chan_idx, ioat->tail, dmadesc, dmadesc->callback_fn, dmadesc->callback_arg); bus_dmamap_unload(ioat->data_tag, desc->src_dmamap); bus_dmamap_unload(ioat->data_tag, desc->dst_dmamap); bus_dmamap_unload(ioat->data_tag, desc->src2_dmamap); bus_dmamap_unload(ioat->data_tag, desc->dst2_dmamap); bus_dmamap_unload(ioat->data_crc_tag, desc->crc_dmamap); if (dmadesc->callback_fn != NULL) dmadesc->callback_fn(dmadesc->callback_arg, 0); completed++; ioat->tail++; } CTR5(KTR_IOAT, "%s channel=%u head=%u tail=%u active=%u", __func__, ioat->chan_idx, ioat->head, ioat->tail, ioat_get_active(ioat)); if (completed != 0) { ioat->last_seen = RING_PHYS_ADDR(ioat, ioat->tail - 1); ioat->stats.descriptors_processed += completed; wakeup(&ioat->tail); } out: ioat_write_chanctrl(ioat, IOAT_CHANCTRL_RUN); mtx_unlock(&ioat->cleanup_lock); /* * The device doesn't seem to reliably push suspend/halt statuses to * the channel completion memory address, so poll the device register * here. For performance reasons skip it on interrupts, do it only * on much more rare polling events. */ if (!intr) comp_update = ioat_get_chansts(ioat) & IOAT_CHANSTS_STATUS; if (!is_ioat_halted(comp_update) && !is_ioat_suspended(comp_update)) return; ioat->stats.channel_halts++; /* * Fatal programming error on this DMA channel. Flush any outstanding * work with error status and restart the engine. */ mtx_lock(&ioat->submit_lock); ioat->quiescing = TRUE; mtx_unlock(&ioat->submit_lock); /* * This is safe to do here because the submit queue is quiesced. We * know that we will drain all outstanding events, so ioat_reset_hw * can't deadlock. It is necessary to protect other ioat_process_event * threads from racing ioat_reset_hw, reading an indeterminate hw * state, and attempting to continue issuing completions. 
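The cleanup loop above advances ioat->tail until the ring slot just behind it matches the completion address the hardware wrote into *comp_update. Read the other way, that write-back address can be turned back into a ring index; a hypothetical helper (not part of the driver) makes the arithmetic explicit, assuming the low status bits have already been masked off:

#include <stddef.h>
#include <stdint.h>

/*
 * Hypothetical: convert a completion write-back address into an index
 * in a power-of-two ring of fixed-size hardware descriptors.
 */
static unsigned
completed_ring_index(uint64_t completed_busaddr, uint64_t ring_busaddr,
    size_t desc_size, unsigned ring_order)
{
        uint64_t idx;

        idx = (completed_busaddr - ring_busaddr) / desc_size;
        return ((unsigned)(idx & ((1u << ring_order) - 1)));
}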
*/ mtx_lock(&ioat->cleanup_lock); ioat->resetting_cleanup = TRUE; chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); if (1 <= g_ioat_debug_level) ioat_halted_debug(ioat, chanerr); ioat->stats.last_halt_chanerr = chanerr; while (ioat_get_active(ioat) > 0) { desc = ioat_get_ring_entry(ioat, ioat->tail); dmadesc = &desc->bus_dmadesc; CTR5(KTR_IOAT, "channel=%u completing desc idx %u (%p) err cb %p(%p)", ioat->chan_idx, ioat->tail, dmadesc, dmadesc->callback_fn, dmadesc->callback_arg); if (dmadesc->callback_fn != NULL) dmadesc->callback_fn(dmadesc->callback_arg, chanerr_to_errno(chanerr)); ioat->tail++; ioat->stats.descriptors_processed++; ioat->stats.descriptors_error++; } CTR5(KTR_IOAT, "%s channel=%u head=%u tail=%u active=%u", __func__, ioat->chan_idx, ioat->head, ioat->tail, ioat_get_active(ioat)); /* Clear error status */ ioat_write_4(ioat, IOAT_CHANERR_OFFSET, chanerr); mtx_unlock(&ioat->cleanup_lock); ioat_log_message(0, "Resetting channel to recover from error\n"); error = taskqueue_enqueue(taskqueue_thread, &ioat->reset_task); KASSERT(error == 0, ("%s: taskqueue_enqueue failed: %d", __func__, error)); } static void ioat_reset_hw_task(void *ctx, int pending __unused) { struct ioat_softc *ioat; int error; ioat = ctx; ioat_log_message(1, "%s: Resetting channel\n", __func__); error = ioat_reset_hw(ioat); KASSERT(error == 0, ("%s: reset failed: %d", __func__, error)); (void)error; } /* * User API functions */ unsigned ioat_get_nchannels(void) { return (ioat_channel_index); } bus_dmaengine_t ioat_get_dmaengine(uint32_t index, int flags) { struct ioat_softc *ioat; KASSERT((flags & ~(M_NOWAIT | M_WAITOK)) == 0, ("invalid flags: 0x%08x", flags)); KASSERT((flags & (M_NOWAIT | M_WAITOK)) != (M_NOWAIT | M_WAITOK), ("invalid wait | nowait")); mtx_lock(&ioat_list_mtx); if (index >= ioat_channel_index || (ioat = ioat_channel[index]) == NULL) { mtx_unlock(&ioat_list_mtx); return (NULL); } mtx_lock(&ioat->submit_lock); mtx_unlock(&ioat_list_mtx); if (ioat->destroying) { mtx_unlock(&ioat->submit_lock); return (NULL); } ioat_get(ioat); if (ioat->quiescing) { if ((flags & M_NOWAIT) != 0) { ioat_put(ioat); mtx_unlock(&ioat->submit_lock); return (NULL); } while (ioat->quiescing && !ioat->destroying) msleep(&ioat->quiescing, &ioat->submit_lock, 0, "getdma", 0); if (ioat->destroying) { ioat_put(ioat); mtx_unlock(&ioat->submit_lock); return (NULL); } } mtx_unlock(&ioat->submit_lock); return (&ioat->dmaengine); } void ioat_put_dmaengine(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); mtx_lock(&ioat->submit_lock); ioat_put(ioat); mtx_unlock(&ioat->submit_lock); } int ioat_get_hwversion(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); return (ioat->version); } size_t ioat_get_max_io_size(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); return (ioat->max_xfer_size); } uint32_t ioat_get_capabilities(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); return (ioat->capabilities); } int ioat_set_interrupt_coalesce(bus_dmaengine_t dmaengine, uint16_t delay) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); if (!ioat->intrdelay_supported) return (ENODEV); if (delay > ioat->intrdelay_max) return (ERANGE); ioat_write_2(ioat, IOAT_INTRDELAY_OFFSET, delay); ioat->cached_intrdelay = ioat_read_2(ioat, IOAT_INTRDELAY_OFFSET) & IOAT_INTRDELAY_US_MASK; return (0); } uint16_t ioat_get_max_coalesce_period(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = 
to_ioat_softc(dmaengine); return (ioat->intrdelay_max); } void ioat_acquire(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); mtx_lock(&ioat->submit_lock); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); ioat->acq_head = ioat->head; } int ioat_acquire_reserve(bus_dmaengine_t dmaengine, unsigned n, int mflags) { struct ioat_softc *ioat; int error; ioat = to_ioat_softc(dmaengine); ioat_acquire(dmaengine); error = ioat_reserve_space(ioat, n, mflags); if (error != 0) ioat_release(dmaengine); return (error); } void ioat_release(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); CTR3(KTR_IOAT, "%s channel=%u dispatch1 head=%u", __func__, ioat->chan_idx, ioat->head); KFAIL_POINT_CODE(DEBUG_FP, ioat_release, /* do nothing */); CTR3(KTR_IOAT, "%s channel=%u dispatch2 head=%u", __func__, ioat->chan_idx, ioat->head); if (ioat->acq_head != ioat->head) { ioat_write_2(ioat, IOAT_DMACOUNT_OFFSET, (uint16_t)ioat->head); if (!callout_pending(&ioat->poll_timer)) { - callout_reset(&ioat->poll_timer, 1, - ioat_poll_timer_callback, ioat); + callout_reset_on(&ioat->poll_timer, 1, + ioat_poll_timer_callback, ioat, ioat->cpu); } } mtx_unlock(&ioat->submit_lock); } static struct ioat_descriptor * ioat_op_generic(struct ioat_softc *ioat, uint8_t op, uint32_t size, uint64_t src, uint64_t dst, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_generic_hw_descriptor *hw_desc; struct ioat_descriptor *desc; bus_dma_segment_t seg; int mflags, nseg, error; mtx_assert(&ioat->submit_lock, MA_OWNED); KASSERT((flags & ~_DMA_GENERIC_FLAGS) == 0, ("Unrecognized flag(s): %#x", flags & ~_DMA_GENERIC_FLAGS)); if ((flags & DMA_NO_WAIT) != 0) mflags = M_NOWAIT; else mflags = M_WAITOK; if (size > ioat->max_xfer_size) { ioat_log_message(0, "%s: max_xfer_size = %d, requested = %u\n", __func__, ioat->max_xfer_size, (unsigned)size); return (NULL); } if (ioat_reserve_space(ioat, 1, mflags) != 0) return (NULL); desc = ioat_get_ring_entry(ioat, ioat->head); hw_desc = &ioat_get_descriptor(ioat, ioat->head)->generic; hw_desc->u.control_raw = 0; hw_desc->u.control_generic.op = op; hw_desc->u.control_generic.completion_update = 1; if ((flags & DMA_INT_EN) != 0) hw_desc->u.control_generic.int_enable = 1; if ((flags & DMA_FENCE) != 0) hw_desc->u.control_generic.fence = 1; hw_desc->size = size; if (src != 0) { nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->src_dmamap, src, size, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->src_addr = seg.ds_addr; } if (dst != 0) { nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->dst_dmamap, dst, size, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->dest_addr = seg.ds_addr; } desc->bus_dmadesc.callback_fn = callback_fn; desc->bus_dmadesc.callback_arg = callback_arg; return (desc); } struct bus_dmadesc * ioat_null(bus_dmaengine_t dmaengine, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); desc = ioat_op_generic(ioat, IOAT_OP_COPY, 8, 0, 0, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->dma; 
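The callout_reset_on() hunk above is the companion NUMA change to the allocation hunk: the one-tick poll timer is now scheduled on ioat->cpu, a CPU chosen near the device, instead of whichever CPU the callout subsystem would pick by default. The idiom in isolation, with placeholder callback, argument, and CPU choice:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>

/*
 * Sketch of pinning a one-tick poll timer to a particular CPU with
 * callout_reset_on(9); the callback, argument, and cpu value here are
 * placeholders.
 */
static void
example_arm_poll_timer(struct callout *c, void (*fn)(void *), void *arg,
    int cpu)
{
        callout_reset_on(c, 1, fn, arg, cpu);
}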
hw_desc->u.control.null = 1; ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_copy(bus_dmaengine_t dmaengine, bus_addr_t dst, bus_addr_t src, bus_size_t len, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); if (((src | dst) & (0xffffull << 48)) != 0) { ioat_log_message(0, "%s: High 16 bits of src/dst invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, IOAT_OP_COPY, len, src, dst, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->dma; if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); CTR6(KTR_IOAT, "%s channel=%u desc=%p dest=%lx src=%lx len=%lx", __func__, ioat->chan_idx, &desc->bus_dmadesc, dst, src, len); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_copy_8k_aligned(bus_dmaengine_t dmaengine, bus_addr_t dst1, bus_addr_t dst2, bus_addr_t src1, bus_addr_t src2, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; bus_size_t src1_len, dst1_len; bus_dma_segment_t seg; int nseg, error; ioat = to_ioat_softc(dmaengine); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); if (((src1 | src2 | dst1 | dst2) & (0xffffull << 48)) != 0) { ioat_log_message(0, "%s: High 16 bits of src/dst invalid\n", __func__); return (NULL); } if (((src1 | src2 | dst1 | dst2) & PAGE_MASK) != 0) { ioat_log_message(0, "%s: Addresses must be page-aligned\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, IOAT_OP_COPY, 2 * PAGE_SIZE, 0, 0, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->dma; src1_len = (src2 != src1 + PAGE_SIZE) ? PAGE_SIZE : 2 * PAGE_SIZE; nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->src_dmamap, src1, src1_len, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->src_addr = seg.ds_addr; if (src1_len != 2 * PAGE_SIZE) { hw_desc->u.control.src_page_break = 1; nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->src2_dmamap, src2, PAGE_SIZE, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->next_src_addr = seg.ds_addr; } dst1_len = (dst2 != dst1 + PAGE_SIZE) ? 
PAGE_SIZE : 2 * PAGE_SIZE; nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->dst_dmamap, dst1, dst1_len, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->dest_addr = seg.ds_addr; if (dst1_len != 2 * PAGE_SIZE) { hw_desc->u.control.dest_page_break = 1; nseg = -1; error = _bus_dmamap_load_phys(ioat->data_tag, desc->dst2_dmamap, dst2, PAGE_SIZE, 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->next_dest_addr = seg.ds_addr; } if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_copy_crc(bus_dmaengine_t dmaengine, bus_addr_t dst, bus_addr_t src, bus_size_t len, uint32_t *initialseed, bus_addr_t crcptr, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_crc32_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; uint32_t teststore; uint8_t op; bus_dma_segment_t seg; int nseg, error; ioat = to_ioat_softc(dmaengine); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); if ((ioat->capabilities & IOAT_DMACAP_MOVECRC) == 0) { ioat_log_message(0, "%s: Device lacks MOVECRC capability\n", __func__); return (NULL); } if (((src | dst) & (0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of src/dst invalid\n", __func__); return (NULL); } teststore = (flags & _DMA_CRC_TESTSTORE); if (teststore == _DMA_CRC_TESTSTORE) { ioat_log_message(0, "%s: TEST and STORE invalid\n", __func__); return (NULL); } if (teststore == 0 && (flags & DMA_CRC_INLINE) != 0) { ioat_log_message(0, "%s: INLINE invalid without TEST or STORE\n", __func__); return (NULL); } switch (teststore) { case DMA_CRC_STORE: op = IOAT_OP_MOVECRC_STORE; break; case DMA_CRC_TEST: op = IOAT_OP_MOVECRC_TEST; break; default: KASSERT(teststore == 0, ("bogus")); op = IOAT_OP_MOVECRC; break; } if ((flags & DMA_CRC_INLINE) == 0 && (crcptr & (0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of crcptr invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, op, len, src, dst, callback_fn, callback_arg, flags & ~_DMA_CRC_FLAGS); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->crc32; if ((flags & DMA_CRC_INLINE) == 0) { nseg = -1; error = _bus_dmamap_load_phys(ioat->data_crc_tag, desc->crc_dmamap, crcptr, sizeof(uint32_t), 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->crc_address = seg.ds_addr; } else hw_desc->u.control.crc_location = 1; if (initialseed != NULL) { hw_desc->u.control.use_seed = 1; hw_desc->seed = *initialseed; } if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_crc(bus_dmaengine_t dmaengine, bus_addr_t src, bus_size_t len, uint32_t *initialseed, bus_addr_t crcptr, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_crc32_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; uint32_t teststore; uint8_t op; bus_dma_segment_t seg; int nseg, error; ioat = to_ioat_softc(dmaengine); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); if ((ioat->capabilities & IOAT_DMACAP_CRC) == 0) { ioat_log_message(0, "%s: Device lacks CRC capability\n", __func__); return (NULL); } if ((src & 
(0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of src invalid\n", __func__); return (NULL); } teststore = (flags & _DMA_CRC_TESTSTORE); if (teststore == _DMA_CRC_TESTSTORE) { ioat_log_message(0, "%s: TEST and STORE invalid\n", __func__); return (NULL); } if (teststore == 0 && (flags & DMA_CRC_INLINE) != 0) { ioat_log_message(0, "%s: INLINE invalid without TEST or STORE\n", __func__); return (NULL); } switch (teststore) { case DMA_CRC_STORE: op = IOAT_OP_CRC_STORE; break; case DMA_CRC_TEST: op = IOAT_OP_CRC_TEST; break; default: KASSERT(teststore == 0, ("bogus")); op = IOAT_OP_CRC; break; } if ((flags & DMA_CRC_INLINE) == 0 && (crcptr & (0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of crcptr invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, op, len, src, 0, callback_fn, callback_arg, flags & ~_DMA_CRC_FLAGS); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->crc32; if ((flags & DMA_CRC_INLINE) == 0) { nseg = -1; error = _bus_dmamap_load_phys(ioat->data_crc_tag, desc->crc_dmamap, crcptr, sizeof(uint32_t), 0, &seg, &nseg); if (error != 0) { ioat_log_message(0, "%s: _bus_dmamap_load_phys" " failed %d\n", __func__, error); return (NULL); } hw_desc->crc_address = seg.ds_addr; } else hw_desc->u.control.crc_location = 1; if (initialseed != NULL) { hw_desc->u.control.use_seed = 1; hw_desc->seed = *initialseed; } if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_blockfill(bus_dmaengine_t dmaengine, bus_addr_t dst, uint64_t fillpattern, bus_size_t len, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_fill_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); if ((ioat->capabilities & IOAT_DMACAP_BFILL) == 0) { ioat_log_message(0, "%s: Device lacks BFILL capability\n", __func__); return (NULL); } if ((dst & (0xffffull << 48)) != 0) { ioat_log_message(0, "%s: High 16 bits of dst invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, IOAT_OP_FILL, len, 0, dst, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = &ioat_get_descriptor(ioat, desc->id)->fill; hw_desc->src_data = fillpattern; if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } /* * Ring Management */ static inline uint32_t ioat_get_active(struct ioat_softc *ioat) { return ((ioat->head - ioat->tail) & ((1 << ioat->ring_size_order) - 1)); } static inline uint32_t ioat_get_ring_space(struct ioat_softc *ioat) { return ((1 << ioat->ring_size_order) - ioat_get_active(ioat) - 1); } /* * Reserves space in this IOAT descriptor ring by ensuring enough slots remain * for 'num_descs'. * * If mflags contains M_WAITOK, blocks until enough space is available. * * Returns zero on success, or an errno on error. If num_descs is beyond the * maximum ring size, returns EINVAl; if allocation would block and mflags * contains M_NOWAIT, returns EAGAIN. * * Must be called with the submit_lock held; returns with the lock held. The * lock may be dropped to allocate the ring. * * (The submit_lock is needed to add any entries to the ring, so callers are * assured enough room is available.) 
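ioat_get_active() and ioat_get_ring_space() above are the usual power-of-two ring bookkeeping: head and tail are free-running counters, and one slot is deliberately left unused so a full ring can be told apart from an empty one. The same arithmetic in a self-contained form:

#include <stdint.h>

/*
 * Standalone version of the ring occupancy math: 'order' is the
 * log2 of the ring size, exactly as in ring_size_order above.
 */
static inline uint32_t
example_ring_active(uint32_t head, uint32_t tail, unsigned order)
{
        return ((head - tail) & ((1u << order) - 1));
}

static inline uint32_t
example_ring_space(uint32_t head, uint32_t tail, unsigned order)
{
        return ((1u << order) - example_ring_active(head, tail, order) - 1);
}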
*/ static int ioat_reserve_space(struct ioat_softc *ioat, uint32_t num_descs, int mflags) { boolean_t dug; int error; mtx_assert(&ioat->submit_lock, MA_OWNED); error = 0; dug = FALSE; if (num_descs < 1 || num_descs >= (1 << ioat->ring_size_order)) { error = EINVAL; goto out; } for (;;) { if (ioat->quiescing) { error = ENXIO; goto out; } if (ioat_get_ring_space(ioat) >= num_descs) goto out; CTR3(KTR_IOAT, "%s channel=%u starved (%u)", __func__, ioat->chan_idx, num_descs); if (!dug && !ioat->is_submitter_processing) { ioat->is_submitter_processing = TRUE; mtx_unlock(&ioat->submit_lock); CTR2(KTR_IOAT, "%s channel=%u attempting to process events", __func__, ioat->chan_idx); ioat_process_events(ioat, FALSE); mtx_lock(&ioat->submit_lock); dug = TRUE; KASSERT(ioat->is_submitter_processing == TRUE, ("is_submitter_processing")); ioat->is_submitter_processing = FALSE; wakeup(&ioat->tail); continue; } if ((mflags & M_WAITOK) == 0) { error = EAGAIN; break; } CTR2(KTR_IOAT, "%s channel=%u blocking on completions", __func__, ioat->chan_idx); msleep(&ioat->tail, &ioat->submit_lock, 0, "ioat_full", 0); continue; } out: mtx_assert(&ioat->submit_lock, MA_OWNED); KASSERT(!ioat->quiescing || error == ENXIO, ("reserved during quiesce")); return (error); } static void ioat_free_ring(struct ioat_softc *ioat, uint32_t size, struct ioat_descriptor *ring) { - free(ring, M_IOAT); + free_domain(ring, M_IOAT); } static struct ioat_descriptor * ioat_get_ring_entry(struct ioat_softc *ioat, uint32_t index) { return (&ioat->ring[index % (1 << ioat->ring_size_order)]); } static union ioat_hw_descriptor * ioat_get_descriptor(struct ioat_softc *ioat, uint32_t index) { return (&ioat->hw_desc_ring[index % (1 << ioat->ring_size_order)]); } static void ioat_halted_debug(struct ioat_softc *ioat, uint32_t chanerr) { union ioat_hw_descriptor *desc; ioat_log_message(0, "Channel halted (%b)\n", (int)chanerr, IOAT_CHANERR_STR); if (chanerr == 0) return; mtx_assert(&ioat->cleanup_lock, MA_OWNED); desc = ioat_get_descriptor(ioat, ioat->tail + 0); dump_descriptor(desc); desc = ioat_get_descriptor(ioat, ioat->tail + 1); dump_descriptor(desc); } static void ioat_poll_timer_callback(void *arg) { struct ioat_softc *ioat; ioat = arg; ioat_log_message(3, "%s\n", __func__); ioat_process_events(ioat, FALSE); mtx_lock(&ioat->submit_lock); if (ioat_get_active(ioat) > 0) callout_schedule(&ioat->poll_timer, 1); mtx_unlock(&ioat->submit_lock); } /* * Support Functions */ static void ioat_submit_single(struct ioat_softc *ioat) { mtx_assert(&ioat->submit_lock, MA_OWNED); ioat->head++; CTR4(KTR_IOAT, "%s channel=%u head=%u tail=%u", __func__, ioat->chan_idx, ioat->head, ioat->tail); ioat->stats.descriptors_submitted++; } static int ioat_reset_hw(struct ioat_softc *ioat) { uint64_t status; uint32_t chanerr; unsigned timeout; int error; CTR2(KTR_IOAT, "%s channel=%u", __func__, ioat->chan_idx); mtx_lock(&ioat->submit_lock); while (ioat->resetting && !ioat->destroying) msleep(&ioat->resetting, &ioat->submit_lock, 0, "IRH_drain", 0); if (ioat->destroying) { mtx_unlock(&ioat->submit_lock); return (ENXIO); } ioat->resetting = TRUE; ioat->quiescing = TRUE; mtx_unlock(&ioat->submit_lock); mtx_lock(&ioat->cleanup_lock); while (ioat_get_active(ioat) > 0) msleep(&ioat->tail, &ioat->cleanup_lock, 0, "ioat_drain", 1); /* * Suspend ioat_process_events while the hardware and softc are in an * indeterminate state. 
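Flow control between submitters and the completion path rests on sleeping and waking on the address of the tail pointer: ioat_reserve_space() msleep(9)s on &ioat->tail when the ring is full, and the completion path wakeup(9)s that address after retiring descriptors. A reduced sketch of the pairing, with placeholder structure, lock, and field names:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct example_ring {
        struct mtx      submit_lock;
        uint32_t        head;
        uint32_t        tail;
        unsigned        order;
};

static uint32_t
example_space(struct example_ring *r)
{
        uint32_t active;

        /* Same occupancy math as shown earlier. */
        active = (r->head - r->tail) & ((1u << r->order) - 1);
        return ((1u << r->order) - active - 1);
}

/* Producer: sleep on the tail pointer until the consumer makes room. */
static void
example_wait_for_space(struct example_ring *r, uint32_t needed)
{
        mtx_assert(&r->submit_lock, MA_OWNED);
        while (example_space(r) < needed)
                msleep(&r->tail, &r->submit_lock, 0, "ringfull", 0);
}

/* Consumer: after retiring 'n' descriptors, wake blocked producers. */
static void
example_retire(struct example_ring *r, uint32_t n)
{
        r->tail += n;
        wakeup(&r->tail);
}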
*/ ioat->resetting_cleanup = TRUE; mtx_unlock(&ioat->cleanup_lock); CTR2(KTR_IOAT, "%s channel=%u quiesced and drained", __func__, ioat->chan_idx); status = ioat_get_chansts(ioat); if (is_ioat_active(status) || is_ioat_idle(status)) ioat_suspend(ioat); /* Wait at most 20 ms */ for (timeout = 0; (is_ioat_active(status) || is_ioat_idle(status)) && timeout < 20; timeout++) { DELAY(1000); status = ioat_get_chansts(ioat); } if (timeout == 20) { error = ETIMEDOUT; goto out; } KASSERT(ioat_get_active(ioat) == 0, ("active after quiesce")); chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); ioat_write_4(ioat, IOAT_CHANERR_OFFSET, chanerr); CTR2(KTR_IOAT, "%s channel=%u hardware suspended", __func__, ioat->chan_idx); /* * IOAT v3 workaround - CHANERRMSK_INT with 3E07h to masks out errors * that can cause stability issues for IOAT v3. */ pci_write_config(ioat->device, IOAT_CFG_CHANERRMASK_INT_OFFSET, 0x3e07, 4); chanerr = pci_read_config(ioat->device, IOAT_CFG_CHANERR_INT_OFFSET, 4); pci_write_config(ioat->device, IOAT_CFG_CHANERR_INT_OFFSET, chanerr, 4); /* * BDXDE and BWD models reset MSI-X registers on device reset. * Save/restore their contents manually. */ if (ioat_model_resets_msix(ioat)) { ioat_log_message(1, "device resets MSI-X registers; saving\n"); pci_save_state(ioat->device); } ioat_reset(ioat); CTR2(KTR_IOAT, "%s channel=%u hardware reset", __func__, ioat->chan_idx); /* Wait at most 20 ms */ for (timeout = 0; ioat_reset_pending(ioat) && timeout < 20; timeout++) DELAY(1000); if (timeout == 20) { error = ETIMEDOUT; goto out; } if (ioat_model_resets_msix(ioat)) { ioat_log_message(1, "device resets registers; restored\n"); pci_restore_state(ioat->device); } /* Reset attempts to return the hardware to "halted." */ status = ioat_get_chansts(ioat); if (is_ioat_active(status) || is_ioat_idle(status)) { /* So this really shouldn't happen... */ ioat_log_message(0, "Device is active after a reset?\n"); ioat_write_chanctrl(ioat, IOAT_CHANCTRL_RUN); error = 0; goto out; } chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); if (chanerr != 0) { mtx_lock(&ioat->cleanup_lock); ioat_halted_debug(ioat, chanerr); mtx_unlock(&ioat->cleanup_lock); error = EIO; goto out; } /* * Bring device back online after reset. Writing CHAINADDR brings the * device back to active. * * The internal ring counter resets to zero, so we have to start over * at zero as well. */ ioat->tail = ioat->head = 0; ioat->last_seen = 0; *ioat->comp_update = 0; ioat_write_chanctrl(ioat, IOAT_CHANCTRL_RUN); ioat_write_chancmp(ioat, ioat->comp_update_bus_addr); ioat_write_chainaddr(ioat, RING_PHYS_ADDR(ioat, 0)); error = 0; CTR2(KTR_IOAT, "%s channel=%u configured channel", __func__, ioat->chan_idx); out: /* Enqueues a null operation and ensures it completes. */ if (error == 0) { error = ioat_start_channel(ioat); CTR2(KTR_IOAT, "%s channel=%u started channel", __func__, ioat->chan_idx); } /* * Resume completions now that ring state is consistent. 
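ioat_reset_hw() brackets the channel reset with pci_save_state()/pci_restore_state() on BWD and BDXDE parts because those models lose their MSI-X programming across a reset. The bracket in isolation, with the device-specific reset left as a placeholder:

#include <sys/param.h>
#include <sys/bus.h>
#include <dev/pci/pcivar.h>

/*
 * Snapshot PCI config state (including MSI-X programming) before a
 * reset that is known to clobber it, and reprogram it afterwards.
 * The do_reset callback stands in for the device-specific sequence.
 */
static void
example_reset_preserving_config(device_t dev, void (*do_reset)(device_t))
{
        pci_save_state(dev);
        do_reset(dev);
        pci_restore_state(dev);
}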
*/ mtx_lock(&ioat->cleanup_lock); ioat->resetting_cleanup = FALSE; mtx_unlock(&ioat->cleanup_lock); /* Unblock submission of new work */ mtx_lock(&ioat->submit_lock); ioat->quiescing = FALSE; wakeup(&ioat->quiescing); ioat->resetting = FALSE; wakeup(&ioat->resetting); CTR2(KTR_IOAT, "%s channel=%u reset done", __func__, ioat->chan_idx); mtx_unlock(&ioat->submit_lock); return (error); } static int sysctl_handle_chansts(SYSCTL_HANDLER_ARGS) { struct ioat_softc *ioat; struct sbuf sb; uint64_t status; int error; ioat = arg1; status = ioat_get_chansts(ioat) & IOAT_CHANSTS_STATUS; sbuf_new_for_sysctl(&sb, NULL, 256, req); switch (status) { case IOAT_CHANSTS_ACTIVE: sbuf_printf(&sb, "ACTIVE"); break; case IOAT_CHANSTS_IDLE: sbuf_printf(&sb, "IDLE"); break; case IOAT_CHANSTS_SUSPENDED: sbuf_printf(&sb, "SUSPENDED"); break; case IOAT_CHANSTS_HALTED: sbuf_printf(&sb, "HALTED"); break; case IOAT_CHANSTS_ARMED: sbuf_printf(&sb, "ARMED"); break; default: sbuf_printf(&sb, "UNKNOWN"); break; } error = sbuf_finish(&sb); sbuf_delete(&sb); if (error != 0 || req->newptr == NULL) return (error); return (EINVAL); } static int sysctl_handle_dpi(SYSCTL_HANDLER_ARGS) { struct ioat_softc *ioat; struct sbuf sb; #define PRECISION "1" const uintmax_t factor = 10; uintmax_t rate; int error; ioat = arg1; sbuf_new_for_sysctl(&sb, NULL, 16, req); if (ioat->stats.interrupts == 0) { sbuf_printf(&sb, "NaN"); goto out; } rate = ioat->stats.descriptors_processed * factor / ioat->stats.interrupts; sbuf_printf(&sb, "%ju.%." PRECISION "ju", rate / factor, rate % factor); #undef PRECISION out: error = sbuf_finish(&sb); sbuf_delete(&sb); if (error != 0 || req->newptr == NULL) return (error); return (EINVAL); } static int sysctl_handle_reset(SYSCTL_HANDLER_ARGS) { struct ioat_softc *ioat; int error, arg; ioat = arg1; arg = 0; error = SYSCTL_OUT(req, &arg, sizeof(arg)); if (error != 0 || req->newptr == NULL) return (error); error = SYSCTL_IN(req, &arg, sizeof(arg)); if (error != 0) return (error); if (arg != 0) error = ioat_reset_hw(ioat); return (error); } static void dump_descriptor(void *hw_desc) { int i, j; for (i = 0; i < 2; i++) { for (j = 0; j < 8; j++) printf("%08x ", ((uint32_t *)hw_desc)[i * 8 + j]); printf("\n"); } } static void ioat_setup_sysctl(device_t device) { struct sysctl_oid_list *par, *statpar, *state, *hammer; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree, *tmp; struct ioat_softc *ioat; ioat = DEVICE2SOFTC(device); ctx = device_get_sysctl_ctx(device); tree = device_get_sysctl_tree(device); par = SYSCTL_CHILDREN(tree); SYSCTL_ADD_INT(ctx, par, OID_AUTO, "version", CTLFLAG_RD, &ioat->version, 0, "HW version (0xMM form)"); SYSCTL_ADD_UINT(ctx, par, OID_AUTO, "max_xfer_size", CTLFLAG_RD, &ioat->max_xfer_size, 0, "HW maximum transfer size"); SYSCTL_ADD_INT(ctx, par, OID_AUTO, "intrdelay_supported", CTLFLAG_RD, &ioat->intrdelay_supported, 0, "Is INTRDELAY supported"); SYSCTL_ADD_U16(ctx, par, OID_AUTO, "intrdelay_max", CTLFLAG_RD, &ioat->intrdelay_max, 0, "Maximum configurable INTRDELAY on this channel (microseconds)"); tmp = SYSCTL_ADD_NODE(ctx, par, OID_AUTO, "state", CTLFLAG_RD, NULL, "IOAT channel internal state"); state = SYSCTL_CHILDREN(tmp); SYSCTL_ADD_UINT(ctx, state, OID_AUTO, "ring_size_order", CTLFLAG_RD, &ioat->ring_size_order, 0, "SW descriptor ring size order"); SYSCTL_ADD_UINT(ctx, state, OID_AUTO, "head", CTLFLAG_RD, &ioat->head, 0, "SW descriptor head pointer index"); SYSCTL_ADD_UINT(ctx, state, OID_AUTO, "tail", CTLFLAG_RD, &ioat->tail, 0, "SW descriptor tail pointer index"); 
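sysctl_handle_dpi() above reports descriptors-per-interrupt with one decimal digit using only integer arithmetic: scale by 10, then print quotient and remainder. The same trick as a tiny userland program, purely for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Integer fixed-point formatting: one decimal digit, no floating point.
 */
static void
print_rate(uint64_t descriptors, uint64_t interrupts)
{
        const uint64_t factor = 10;
        uint64_t rate;

        if (interrupts == 0) {
                printf("NaN\n");
                return;
        }
        rate = descriptors * factor / interrupts;
        printf("%ju.%1ju\n", (uintmax_t)(rate / factor),
            (uintmax_t)(rate % factor));
}

int
main(void)
{
        print_rate(12345, 678);         /* prints "18.2" */
        return (0);
}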
SYSCTL_ADD_UQUAD(ctx, state, OID_AUTO, "last_completion", CTLFLAG_RD, ioat->comp_update, "HW addr of last completion"); SYSCTL_ADD_INT(ctx, state, OID_AUTO, "is_submitter_processing", CTLFLAG_RD, &ioat->is_submitter_processing, 0, "submitter processing"); SYSCTL_ADD_PROC(ctx, state, OID_AUTO, "chansts", CTLTYPE_STRING | CTLFLAG_RD, ioat, 0, sysctl_handle_chansts, "A", "String of the channel status"); SYSCTL_ADD_U16(ctx, state, OID_AUTO, "intrdelay", CTLFLAG_RD, &ioat->cached_intrdelay, 0, "Current INTRDELAY on this channel (cached, microseconds)"); tmp = SYSCTL_ADD_NODE(ctx, par, OID_AUTO, "hammer", CTLFLAG_RD, NULL, "Big hammers (mostly for testing)"); hammer = SYSCTL_CHILDREN(tmp); SYSCTL_ADD_PROC(ctx, hammer, OID_AUTO, "force_hw_reset", CTLTYPE_INT | CTLFLAG_RW, ioat, 0, sysctl_handle_reset, "I", "Set to non-zero to reset the hardware"); tmp = SYSCTL_ADD_NODE(ctx, par, OID_AUTO, "stats", CTLFLAG_RD, NULL, "IOAT channel statistics"); statpar = SYSCTL_CHILDREN(tmp); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "interrupts", CTLFLAG_RW, &ioat->stats.interrupts, "Number of interrupts processed on this channel"); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "descriptors", CTLFLAG_RW, &ioat->stats.descriptors_processed, "Number of descriptors processed on this channel"); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "submitted", CTLFLAG_RW, &ioat->stats.descriptors_submitted, "Number of descriptors submitted to this channel"); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "errored", CTLFLAG_RW, &ioat->stats.descriptors_error, "Number of descriptors failed by channel errors"); SYSCTL_ADD_U32(ctx, statpar, OID_AUTO, "halts", CTLFLAG_RW, &ioat->stats.channel_halts, 0, "Number of times the channel has halted"); SYSCTL_ADD_U32(ctx, statpar, OID_AUTO, "last_halt_chanerr", CTLFLAG_RW, &ioat->stats.last_halt_chanerr, 0, "The raw CHANERR when the channel was last halted"); SYSCTL_ADD_PROC(ctx, statpar, OID_AUTO, "desc_per_interrupt", CTLTYPE_STRING | CTLFLAG_RD, ioat, 0, sysctl_handle_dpi, "A", "Descriptors per interrupt"); } static void ioat_get(struct ioat_softc *ioat) { mtx_assert(&ioat->submit_lock, MA_OWNED); KASSERT(ioat->refcnt < UINT32_MAX, ("refcnt overflow")); ioat->refcnt++; } static void ioat_put(struct ioat_softc *ioat) { mtx_assert(&ioat->submit_lock, MA_OWNED); KASSERT(ioat->refcnt >= 1, ("refcnt error")); if (--ioat->refcnt == 0) wakeup(&ioat->refcnt); } static void ioat_drain_locked(struct ioat_softc *ioat) { mtx_assert(&ioat->submit_lock, MA_OWNED); while (ioat->refcnt > 0) msleep(&ioat->refcnt, &ioat->submit_lock, 0, "ioat_drain", 0); } #ifdef DDB #define _db_show_lock(lo) LOCK_CLASS(lo)->lc_ddb_show(lo) #define db_show_lock(lk) _db_show_lock(&(lk)->lock_object) DB_SHOW_COMMAND(ioat, db_show_ioat) { struct ioat_softc *sc; unsigned idx; if (!have_addr) goto usage; idx = (unsigned)addr; if (idx >= ioat_channel_index) goto usage; sc = ioat_channel[idx]; db_printf("ioat softc at %p\n", sc); if (sc == NULL) return; db_printf(" version: %d\n", sc->version); db_printf(" chan_idx: %u\n", sc->chan_idx); db_printf(" submit_lock: "); db_show_lock(&sc->submit_lock); db_printf(" capabilities: %b\n", (int)sc->capabilities, IOAT_DMACAP_STR); db_printf(" cached_intrdelay: %u\n", sc->cached_intrdelay); db_printf(" *comp_update: 0x%jx\n", (uintmax_t)*sc->comp_update); db_printf(" poll_timer:\n"); db_printf(" c_time: %ju\n", (uintmax_t)sc->poll_timer.c_time); db_printf(" c_arg: %p\n", sc->poll_timer.c_arg); db_printf(" c_func: %p\n", sc->poll_timer.c_func); db_printf(" c_lock: %p\n", sc->poll_timer.c_lock); db_printf(" 
c_flags: 0x%x\n", (unsigned)sc->poll_timer.c_flags); db_printf(" quiescing: %d\n", (int)sc->quiescing); db_printf(" destroying: %d\n", (int)sc->destroying); db_printf(" is_submitter_processing: %d\n", (int)sc->is_submitter_processing); db_printf(" intrdelay_supported: %d\n", (int)sc->intrdelay_supported); db_printf(" resetting: %d\n", (int)sc->resetting); db_printf(" head: %u\n", sc->head); db_printf(" tail: %u\n", sc->tail); db_printf(" ring_size_order: %u\n", sc->ring_size_order); db_printf(" last_seen: 0x%lx\n", sc->last_seen); db_printf(" ring: %p\n", sc->ring); db_printf(" descriptors: %p\n", sc->hw_desc_ring); db_printf(" descriptors (phys): 0x%jx\n", (uintmax_t)sc->hw_desc_bus_addr); db_printf(" ring[%u] (tail):\n", sc->tail % (1 << sc->ring_size_order)); db_printf(" id: %u\n", ioat_get_ring_entry(sc, sc->tail)->id); db_printf(" addr: 0x%lx\n", RING_PHYS_ADDR(sc, sc->tail)); db_printf(" next: 0x%lx\n", ioat_get_descriptor(sc, sc->tail)->generic.next); db_printf(" ring[%u] (head - 1):\n", (sc->head - 1) % (1 << sc->ring_size_order)); db_printf(" id: %u\n", ioat_get_ring_entry(sc, sc->head - 1)->id); db_printf(" addr: 0x%lx\n", RING_PHYS_ADDR(sc, sc->head - 1)); db_printf(" next: 0x%lx\n", ioat_get_descriptor(sc, sc->head - 1)->generic.next); db_printf(" ring[%u] (head):\n", (sc->head) % (1 << sc->ring_size_order)); db_printf(" id: %u\n", ioat_get_ring_entry(sc, sc->head)->id); db_printf(" addr: 0x%lx\n", RING_PHYS_ADDR(sc, sc->head)); db_printf(" next: 0x%lx\n", ioat_get_descriptor(sc, sc->head)->generic.next); for (idx = 0; idx < (1 << sc->ring_size_order); idx++) if ((*sc->comp_update & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_MASK) == RING_PHYS_ADDR(sc, idx)) db_printf(" ring[%u] == hardware tail\n", idx); db_printf(" cleanup_lock: "); db_show_lock(&sc->cleanup_lock); db_printf(" refcnt: %u\n", sc->refcnt); db_printf(" stats:\n"); db_printf(" interrupts: %lu\n", sc->stats.interrupts); db_printf(" descriptors_processed: %lu\n", sc->stats.descriptors_processed); db_printf(" descriptors_error: %lu\n", sc->stats.descriptors_error); db_printf(" descriptors_submitted: %lu\n", sc->stats.descriptors_submitted); db_printf(" channel_halts: %u\n", sc->stats.channel_halts); db_printf(" last_halt_chanerr: %u\n", sc->stats.last_halt_chanerr); if (db_pager_quit) return; db_printf(" hw status:\n"); db_printf(" status: 0x%lx\n", ioat_get_chansts(sc)); db_printf(" chanctrl: 0x%x\n", (unsigned)ioat_read_2(sc, IOAT_CHANCTRL_OFFSET)); db_printf(" chancmd: 0x%x\n", (unsigned)ioat_read_1(sc, IOAT_CHANCMD_OFFSET)); db_printf(" dmacount: 0x%x\n", (unsigned)ioat_read_2(sc, IOAT_DMACOUNT_OFFSET)); db_printf(" chainaddr: 0x%lx\n", ioat_read_double_4(sc, IOAT_CHAINADDR_OFFSET_LOW)); db_printf(" chancmp: 0x%lx\n", ioat_read_double_4(sc, IOAT_CHANCMP_OFFSET_LOW)); db_printf(" chanerr: %b\n", (int)ioat_read_4(sc, IOAT_CHANERR_OFFSET), IOAT_CHANERR_STR); return; usage: db_printf("usage: show ioat <0-%u>\n", ioat_channel_index); return; } #endif /* DDB */ Index: projects/clang900-import/sys/dev/ioat/ioat_internal.h =================================================================== --- projects/clang900-import/sys/dev/ioat/ioat_internal.h (revision 352586) +++ projects/clang900-import/sys/dev/ioat/ioat_internal.h (revision 352587) @@ -1,609 +1,611 @@ /*- * Copyright (C) 2012 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ __FBSDID("$FreeBSD$"); #ifndef __IOAT_INTERNAL_H__ #define __IOAT_INTERNAL_H__ #include #define DEVICE2SOFTC(dev) ((struct ioat_softc *) device_get_softc(dev)) #define KTR_IOAT KTR_SPARE3 #define ioat_read_chancnt(ioat) \ ioat_read_1((ioat), IOAT_CHANCNT_OFFSET) #define ioat_read_xfercap(ioat) \ (ioat_read_1((ioat), IOAT_XFERCAP_OFFSET) & IOAT_XFERCAP_VALID_MASK) #define ioat_write_intrctrl(ioat, value) \ ioat_write_1((ioat), IOAT_INTRCTRL_OFFSET, (value)) #define ioat_read_cbver(ioat) \ (ioat_read_1((ioat), IOAT_CBVER_OFFSET) & 0xFF) #define ioat_read_dmacapability(ioat) \ ioat_read_4((ioat), IOAT_DMACAPABILITY_OFFSET) #define ioat_write_chanctrl(ioat, value) \ ioat_write_2((ioat), IOAT_CHANCTRL_OFFSET, (value)) static __inline uint64_t ioat_bus_space_read_8_lower_first(bus_space_tag_t tag, bus_space_handle_t handle, bus_size_t offset) { return (bus_space_read_4(tag, handle, offset) | ((uint64_t)bus_space_read_4(tag, handle, offset + 4)) << 32); } static __inline void ioat_bus_space_write_8_lower_first(bus_space_tag_t tag, bus_space_handle_t handle, bus_size_t offset, uint64_t val) { bus_space_write_4(tag, handle, offset, val); bus_space_write_4(tag, handle, offset + 4, val >> 32); } #ifdef __i386__ #define ioat_bus_space_read_8 ioat_bus_space_read_8_lower_first #define ioat_bus_space_write_8 ioat_bus_space_write_8_lower_first #else #define ioat_bus_space_read_8(tag, handle, offset) \ bus_space_read_8((tag), (handle), (offset)) #define ioat_bus_space_write_8(tag, handle, offset, val) \ bus_space_write_8((tag), (handle), (offset), (val)) #endif #define ioat_read_1(ioat, offset) \ bus_space_read_1((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset)) #define ioat_read_2(ioat, offset) \ bus_space_read_2((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset)) #define ioat_read_4(ioat, offset) \ bus_space_read_4((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset)) #define ioat_read_8(ioat, offset) \ ioat_bus_space_read_8((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset)) #define ioat_read_double_4(ioat, offset) \ ioat_bus_space_read_8_lower_first((ioat)->pci_bus_tag, \ (ioat)->pci_bus_handle, (offset)) #define ioat_write_1(ioat, offset, value) \ bus_space_write_1((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset), (value)) #define ioat_write_2(ioat, offset, value) \ bus_space_write_2((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset), (value)) #define ioat_write_4(ioat, offset, value) \ 
bus_space_write_4((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset), (value)) #define ioat_write_8(ioat, offset, value) \ ioat_bus_space_write_8((ioat)->pci_bus_tag, (ioat)->pci_bus_handle, \ (offset), (value)) #define ioat_write_double_4(ioat, offset, value) \ ioat_bus_space_write_8_lower_first((ioat)->pci_bus_tag, \ (ioat)->pci_bus_handle, (offset), (value)) MALLOC_DECLARE(M_IOAT); SYSCTL_DECL(_hw_ioat); extern int g_ioat_debug_level; struct generic_dma_control { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t reserved1:1; uint32_t src_page_break:1; uint32_t dest_page_break:1; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t reserved2:13; uint32_t op:8; }; struct ioat_generic_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; } u; uint64_t src_addr; uint64_t dest_addr; uint64_t next; uint64_t reserved[4]; }; struct ioat_dma_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t null:1; uint32_t src_page_break:1; uint32_t dest_page_break:1; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t reserved:13; #define IOAT_OP_COPY 0x00 uint32_t op:8; } control; } u; uint64_t src_addr; uint64_t dest_addr; uint64_t next; uint64_t next_src_addr; uint64_t next_dest_addr; uint64_t user1; uint64_t user2; }; struct ioat_fill_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t reserved:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t reserved2:2; uint32_t dest_page_break:1; uint32_t bundle:1; uint32_t reserved3:15; #define IOAT_OP_FILL 0x01 uint32_t op:8; } control; } u; uint64_t src_data; uint64_t dest_addr; uint64_t next; uint64_t reserved; uint64_t next_dest_addr; uint64_t user1; uint64_t user2; }; struct ioat_crc32_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t reserved1:3; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t use_seed:1; /* * crc_location: * For IOAT_OP_MOVECRC_TEST and IOAT_OP_CRC_TEST: * 0: comparison value is pointed to by CRC Address * field. * 1: comparison value follows data in wire format * ("inverted reflected bit order") in the 4 bytes * following the source data. * * For IOAT_OP_CRC_STORE: * 0: Result will be stored at location pointed to by * CRC Address field (in wire format). * 1: Result will be stored directly following the * source data. * * For IOAT_OP_MOVECRC_STORE: * 0: Result will be stored at location pointed to by * CRC Address field (in wire format). * 1: Result will be stored directly following the * *destination* data. */ uint32_t crc_location:1; uint32_t reserved2:11; /* * MOVECRC - Move data in the same way as standard copy * operation, but also compute CRC32. * * CRC - Only compute CRC on source data. * * There is a CRC accumulator register in the hardware. * If 'initial' is set, it is initialized to the value * in 'seed.' * * In all modes, these operators accumulate size bytes * at src_addr into the running CRC32C. 
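The descriptor comment above says the CRC operations accumulate the source bytes into a running CRC32C. For comparison, a plain bitwise software CRC32C (Castagnoli polynomial, reflected form 0x82F63B78) is sketched below; whether the engine applies the conventional ~0 pre/post conditioning or starts from the raw seed value is not established here, so the conditioning shown is an assumption of the common convention:

#include <stddef.h>
#include <stdint.h>

/*
 * Reference software CRC32C, one bit at a time.  Slow but unambiguous;
 * the pre/post inversion is the usual convention, not necessarily what
 * the hardware seed field implies.
 */
static uint32_t
crc32c_sw(uint32_t crc, const void *buf, size_t len)
{
        const uint8_t *p = buf;
        int bit;

        crc = ~crc;
        while (len-- > 0) {
                crc ^= *p++;
                for (bit = 0; bit < 8; bit++)
                        crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
        }
        return (~crc);
}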
* * Store mode emits the accumulated CRC, in wire * format, as specified by the crc_location bit above. * * Test mode compares the accumulated CRC against the * reference CRC, as described in crc_location above. * On failure, halts the DMA engine with a CRC error * status. */ #define IOAT_OP_MOVECRC 0x41 #define IOAT_OP_MOVECRC_TEST 0x42 #define IOAT_OP_MOVECRC_STORE 0x43 #define IOAT_OP_CRC 0x81 #define IOAT_OP_CRC_TEST 0x82 #define IOAT_OP_CRC_STORE 0x83 uint32_t op:8; } control; } u; uint64_t src_addr; uint64_t dest_addr; uint64_t next; uint64_t next_src_addr; uint64_t next_dest_addr; uint32_t seed; uint32_t reserved; uint64_t crc_address; }; struct ioat_xor_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t src_count:3; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t reserved:13; #define IOAT_OP_XOR 0x87 #define IOAT_OP_XOR_VAL 0x88 uint32_t op:8; } control; } u; uint64_t src_addr; uint64_t dest_addr; uint64_t next; uint64_t src_addr2; uint64_t src_addr3; uint64_t src_addr4; uint64_t src_addr5; }; struct ioat_xor_ext_hw_descriptor { uint64_t src_addr6; uint64_t src_addr7; uint64_t src_addr8; uint64_t next; uint64_t reserved[4]; }; struct ioat_pq_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t src_count:3; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t p_disable:1; uint32_t q_disable:1; uint32_t reserved:11; #define IOAT_OP_PQ 0x89 #define IOAT_OP_PQ_VAL 0x8a uint32_t op:8; } control; } u; uint64_t src_addr; uint64_t p_addr; uint64_t next; uint64_t src_addr2; uint64_t src_addr3; uint8_t coef[8]; uint64_t q_addr; }; struct ioat_pq_ext_hw_descriptor { uint64_t src_addr4; uint64_t src_addr5; uint64_t src_addr6; uint64_t next; uint64_t src_addr7; uint64_t src_addr8; uint64_t reserved[2]; }; struct ioat_pq_update_hw_descriptor { uint32_t size; union { uint32_t control_raw; struct generic_dma_control control_generic; struct { uint32_t int_enable:1; uint32_t src_snoop_disable:1; uint32_t dest_snoop_disable:1; uint32_t completion_update:1; uint32_t fence:1; uint32_t src_cnt:3; uint32_t bundle:1; uint32_t dest_dca:1; uint32_t hint:1; uint32_t p_disable:1; uint32_t q_disable:1; uint32_t reserved:3; uint32_t coef:8; #define IOAT_OP_PQ_UP 0x8b uint32_t op:8; } control; } u; uint64_t src_addr; uint64_t p_addr; uint64_t next; uint64_t src_addr2; uint64_t p_src; uint64_t q_src; uint64_t q_addr; }; struct ioat_raw_hw_descriptor { uint64_t field[8]; }; struct bus_dmadesc { bus_dmaengine_callback_t callback_fn; void *callback_arg; }; struct ioat_descriptor { struct bus_dmadesc bus_dmadesc; uint32_t id; bus_dmamap_t src_dmamap; bus_dmamap_t dst_dmamap; bus_dmamap_t src2_dmamap; bus_dmamap_t dst2_dmamap; bus_dmamap_t crc_dmamap; }; /* Unused by this driver at this time. */ #define IOAT_OP_MARKER 0x84 /* * Deprecated OPs -- v3 DMA generates an abort if given these. And this driver * doesn't support anything older than v3. */ #define IOAT_OP_OLD_XOR 0x85 #define IOAT_OP_OLD_XOR_VAL 0x86 /* One of these per allocated PCI device. 
*/ struct ioat_softc { bus_dmaengine_t dmaengine; #define to_ioat_softc(_dmaeng) \ ({ \ bus_dmaengine_t *_p = (_dmaeng); \ (struct ioat_softc *)((char *)_p - \ offsetof(struct ioat_softc, dmaengine)); \ }) device_t device; + int domain; + int cpu; int version; unsigned chan_idx; bus_space_tag_t pci_bus_tag; bus_space_handle_t pci_bus_handle; struct resource *pci_resource; int pci_resource_id; uint32_t max_xfer_size; uint32_t capabilities; uint32_t ring_size_order; uint16_t intrdelay_max; uint16_t cached_intrdelay; int rid; struct resource *res; void *tag; bus_dma_tag_t hw_desc_tag; bus_dmamap_t hw_desc_map; bus_dma_tag_t data_tag; bus_dma_tag_t data_crc_tag; bus_dma_tag_t comp_update_tag; bus_dmamap_t comp_update_map; uint64_t *comp_update; bus_addr_t comp_update_bus_addr; boolean_t quiescing; boolean_t destroying; boolean_t is_submitter_processing; boolean_t intrdelay_supported; boolean_t resetting; /* submit_lock */ boolean_t resetting_cleanup; /* cleanup_lock */ struct ioat_descriptor *ring; union ioat_hw_descriptor { struct ioat_generic_hw_descriptor generic; struct ioat_dma_hw_descriptor dma; struct ioat_fill_hw_descriptor fill; struct ioat_crc32_hw_descriptor crc32; struct ioat_xor_hw_descriptor xor; struct ioat_xor_ext_hw_descriptor xor_ext; struct ioat_pq_hw_descriptor pq; struct ioat_pq_ext_hw_descriptor pq_ext; struct ioat_raw_hw_descriptor raw; } *hw_desc_ring; bus_addr_t hw_desc_bus_addr; #define RING_PHYS_ADDR(sc, i) (sc)->hw_desc_bus_addr + \ (((i) % (1 << (sc)->ring_size_order)) * sizeof(struct ioat_dma_hw_descriptor)) struct mtx_padalign submit_lock; struct callout poll_timer; struct task reset_task; struct mtx_padalign cleanup_lock; uint32_t refcnt; uint32_t head; uint32_t acq_head; uint32_t tail; bus_addr_t last_seen; struct { uint64_t interrupts; uint64_t descriptors_processed; uint64_t descriptors_error; uint64_t descriptors_submitted; uint32_t channel_halts; uint32_t last_halt_chanerr; } stats; }; void ioat_test_attach(void); void ioat_test_detach(void); /* * XXX DO NOT USE this routine for obtaining the current completed descriptor. * * The double_4 read on ioat<3.3 appears to result in torn reads. And v3.2 * hardware is still commonplace (Broadwell Xeon has it). Instead, use the * device-pushed *comp_update. * * It is safe to use ioat_get_chansts() for the low status bits. */ static inline uint64_t ioat_get_chansts(struct ioat_softc *ioat) { uint64_t status; if (ioat->version >= IOAT_VER_3_3) status = ioat_read_8(ioat, IOAT_CHANSTS_OFFSET); else /* Must read lower 4 bytes before upper 4 bytes. 
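to_ioat_softc() in the structure above is the classic container-of construction: given a pointer to the embedded dmaengine member, subtract the member's offset to recover the enclosing softc. A generic standalone version of the idiom, with placeholder names:

#include <stddef.h>

struct outer {
        int     other_state;
        int     member;
};

/* Recover the enclosing structure from a pointer to its member. */
#define MEMBER_TO_OUTER(p)                                              \
        ((struct outer *)((char *)(p) - offsetof(struct outer, member)))

static struct outer *
outer_from_member(int *memberp)
{
        return (MEMBER_TO_OUTER(memberp));
}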
*/ status = ioat_read_double_4(ioat, IOAT_CHANSTS_OFFSET); return (status); } static inline void ioat_write_chancmp(struct ioat_softc *ioat, uint64_t addr) { if (ioat->version >= IOAT_VER_3_3) ioat_write_8(ioat, IOAT_CHANCMP_OFFSET_LOW, addr); else ioat_write_double_4(ioat, IOAT_CHANCMP_OFFSET_LOW, addr); } static inline void ioat_write_chainaddr(struct ioat_softc *ioat, uint64_t addr) { if (ioat->version >= IOAT_VER_3_3) ioat_write_8(ioat, IOAT_CHAINADDR_OFFSET_LOW, addr); else ioat_write_double_4(ioat, IOAT_CHAINADDR_OFFSET_LOW, addr); } static inline boolean_t is_ioat_active(uint64_t status) { return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_ACTIVE); } static inline boolean_t is_ioat_idle(uint64_t status) { return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_IDLE); } static inline boolean_t is_ioat_halted(uint64_t status) { return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_HALTED); } static inline boolean_t is_ioat_suspended(uint64_t status) { return ((status & IOAT_CHANSTS_STATUS) == IOAT_CHANSTS_SUSPENDED); } static inline void ioat_suspend(struct ioat_softc *ioat) { ioat_write_1(ioat, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_SUSPEND); } static inline void ioat_reset(struct ioat_softc *ioat) { ioat_write_1(ioat, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET); } static inline boolean_t ioat_reset_pending(struct ioat_softc *ioat) { uint8_t cmd; cmd = ioat_read_1(ioat, IOAT_CHANCMD_OFFSET); return ((cmd & IOAT_CHANCMD_RESET) != 0); } #endif /* __IOAT_INTERNAL_H__ */ Index: projects/clang900-import/sys/dev/usb/controller/xhci.c =================================================================== --- projects/clang900-import/sys/dev/usb/controller/xhci.c (revision 352586) +++ projects/clang900-import/sys/dev/usb/controller/xhci.c (revision 352587) @@ -1,4369 +1,4372 @@ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 Hans Petter Selasky. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * USB eXtensible Host Controller Interface, a.k.a. USB 3.0 controller. 
* * The XHCI 1.0 spec can be found at * http://www.intel.com/technology/usb/download/xHCI_Specification_for_USB.pdf * and the USB 3.0 spec at * http://www.usb.org/developers/docs/usb_30_spec_060910.zip */ /* * A few words about the design implementation: This driver emulates * the concept about TDs which is found in EHCI specification. This * way we achieve that the USB controller drivers look similar to * eachother which makes it easier to understand the code. */ #ifdef USB_GLOBAL_INCLUDE_FILE #include USB_GLOBAL_INCLUDE_FILE #else #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define USB_DEBUG_VAR xhcidebug #include #include #include #include #include #include #include #include #include #include #endif /* USB_GLOBAL_INCLUDE_FILE */ #include #include #define XHCI_BUS2SC(bus) \ ((struct xhci_softc *)(((uint8_t *)(bus)) - \ ((uint8_t *)&(((struct xhci_softc *)0)->sc_bus)))) static SYSCTL_NODE(_hw_usb, OID_AUTO, xhci, CTLFLAG_RW, 0, "USB XHCI"); static int xhcistreams; SYSCTL_INT(_hw_usb_xhci, OID_AUTO, streams, CTLFLAG_RWTUN, &xhcistreams, 0, "Set to enable streams mode support"); #ifdef USB_DEBUG static int xhcidebug; static int xhciroute; static int xhcipolling; static int xhcidma32; static int xhcictlstep; SYSCTL_INT(_hw_usb_xhci, OID_AUTO, debug, CTLFLAG_RWTUN, &xhcidebug, 0, "Debug level"); SYSCTL_INT(_hw_usb_xhci, OID_AUTO, xhci_port_route, CTLFLAG_RWTUN, &xhciroute, 0, "Routing bitmap for switching EHCI ports to the XHCI controller"); SYSCTL_INT(_hw_usb_xhci, OID_AUTO, use_polling, CTLFLAG_RWTUN, &xhcipolling, 0, "Set to enable software interrupt polling for the XHCI controller"); SYSCTL_INT(_hw_usb_xhci, OID_AUTO, dma32, CTLFLAG_RWTUN, &xhcidma32, 0, "Set to only use 32-bit DMA for the XHCI controller"); SYSCTL_INT(_hw_usb_xhci, OID_AUTO, ctlstep, CTLFLAG_RWTUN, &xhcictlstep, 0, "Set to enable control endpoint status stage stepping"); #else #define xhciroute 0 #define xhcidma32 0 #define xhcictlstep 0 #endif #define XHCI_INTR_ENDPT 1 struct xhci_std_temp { struct xhci_softc *sc; struct usb_page_cache *pc; struct xhci_td *td; struct xhci_td *td_next; uint32_t len; uint32_t offset; uint32_t max_packet_size; uint32_t average; uint16_t isoc_delta; uint16_t isoc_frame; uint8_t shortpkt; uint8_t multishort; uint8_t last_frame; uint8_t trb_type; uint8_t direction; uint8_t tbc; uint8_t tlbpc; uint8_t step_td; uint8_t do_isoc_sync; }; static void xhci_do_poll(struct usb_bus *); static void xhci_device_done(struct usb_xfer *, usb_error_t); static void xhci_root_intr(struct xhci_softc *); static void xhci_free_device_ext(struct usb_device *); static struct xhci_endpoint_ext *xhci_get_endpoint_ext(struct usb_device *, struct usb_endpoint_descriptor *); static usb_proc_callback_t xhci_configure_msg; static usb_error_t xhci_configure_device(struct usb_device *); static usb_error_t xhci_configure_endpoint(struct usb_device *, struct usb_endpoint_descriptor *, struct xhci_endpoint_ext *, uint16_t, uint8_t, uint8_t, uint8_t, uint16_t, uint16_t, uint8_t); static usb_error_t xhci_configure_mask(struct usb_device *, uint32_t, uint8_t); static usb_error_t xhci_cmd_evaluate_ctx(struct xhci_softc *, uint64_t, uint8_t); static void xhci_endpoint_doorbell(struct usb_xfer *); static void xhci_ctx_set_le32(struct xhci_softc *sc, volatile uint32_t *ptr, uint32_t val); static uint32_t xhci_ctx_get_le32(struct xhci_softc *sc, volatile uint32_t *ptr); static void 
xhci_ctx_set_le64(struct xhci_softc *sc, volatile uint64_t *ptr, uint64_t val); #ifdef USB_DEBUG static uint64_t xhci_ctx_get_le64(struct xhci_softc *sc, volatile uint64_t *ptr); #endif static const struct usb_bus_methods xhci_bus_methods; #ifdef USB_DEBUG static void xhci_dump_trb(struct xhci_trb *trb) { DPRINTFN(5, "trb = %p\n", trb); DPRINTFN(5, "qwTrb0 = 0x%016llx\n", (long long)le64toh(trb->qwTrb0)); DPRINTFN(5, "dwTrb2 = 0x%08x\n", le32toh(trb->dwTrb2)); DPRINTFN(5, "dwTrb3 = 0x%08x\n", le32toh(trb->dwTrb3)); } static void xhci_dump_endpoint(struct xhci_softc *sc, struct xhci_endp_ctx *pep) { DPRINTFN(5, "pep = %p\n", pep); DPRINTFN(5, "dwEpCtx0=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx0)); DPRINTFN(5, "dwEpCtx1=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx1)); DPRINTFN(5, "qwEpCtx2=0x%016llx\n", (long long)xhci_ctx_get_le64(sc, &pep->qwEpCtx2)); DPRINTFN(5, "dwEpCtx4=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx4)); DPRINTFN(5, "dwEpCtx5=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx5)); DPRINTFN(5, "dwEpCtx6=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx6)); DPRINTFN(5, "dwEpCtx7=0x%08x\n", xhci_ctx_get_le32(sc, &pep->dwEpCtx7)); } static void xhci_dump_device(struct xhci_softc *sc, struct xhci_slot_ctx *psl) { DPRINTFN(5, "psl = %p\n", psl); DPRINTFN(5, "dwSctx0=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx0)); DPRINTFN(5, "dwSctx1=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx1)); DPRINTFN(5, "dwSctx2=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx2)); DPRINTFN(5, "dwSctx3=0x%08x\n", xhci_ctx_get_le32(sc, &psl->dwSctx3)); } #endif uint8_t xhci_use_polling(void) { #ifdef USB_DEBUG return (xhcipolling != 0); #else return (0); #endif } static void xhci_iterate_hw_softc(struct usb_bus *bus, usb_bus_mem_sub_cb_t *cb) { struct xhci_softc *sc = XHCI_BUS2SC(bus); uint16_t i; cb(bus, &sc->sc_hw.root_pc, &sc->sc_hw.root_pg, sizeof(struct xhci_hw_root), XHCI_PAGE_SIZE); cb(bus, &sc->sc_hw.ctx_pc, &sc->sc_hw.ctx_pg, sizeof(struct xhci_dev_ctx_addr), XHCI_PAGE_SIZE); for (i = 0; i != sc->sc_noscratch; i++) { cb(bus, &sc->sc_hw.scratch_pc[i], &sc->sc_hw.scratch_pg[i], XHCI_PAGE_SIZE, XHCI_PAGE_SIZE); } } static void xhci_ctx_set_le32(struct xhci_softc *sc, volatile uint32_t *ptr, uint32_t val) { if (sc->sc_ctx_is_64_byte) { uint32_t offset; /* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */ /* all contexts are initially 32-bytes */ offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U)); ptr = (volatile uint32_t *)(((volatile uint8_t *)ptr) + offset); } *ptr = htole32(val); } static uint32_t xhci_ctx_get_le32(struct xhci_softc *sc, volatile uint32_t *ptr) { if (sc->sc_ctx_is_64_byte) { uint32_t offset; /* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */ /* all contexts are initially 32-bytes */ offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U)); ptr = (volatile uint32_t *)(((volatile uint8_t *)ptr) + offset); } return (le32toh(*ptr)); } static void xhci_ctx_set_le64(struct xhci_softc *sc, volatile uint64_t *ptr, uint64_t val) { if (sc->sc_ctx_is_64_byte) { uint32_t offset; /* exploit the fact that our structures are XHCI_PAGE_SIZE aligned */ /* all contexts are initially 32-bytes */ offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U)); ptr = (volatile uint64_t *)(((volatile uint8_t *)ptr) + offset); } *ptr = htole64(val); } #ifdef USB_DEBUG static uint64_t xhci_ctx_get_le64(struct xhci_softc *sc, volatile uint64_t *ptr) { if (sc->sc_ctx_is_64_byte) { uint32_t offset; /* exploit the fact that our structures are XHCI_PAGE_SIZE aligned 
*/ /* all contexts are initially 32-bytes */ offset = ((uintptr_t)ptr) & ((XHCI_PAGE_SIZE - 1) & ~(31U)); ptr = (volatile uint64_t *)(((volatile uint8_t *)ptr) + offset); } return (le64toh(*ptr)); } #endif static int xhci_reset_command_queue_locked(struct xhci_softc *sc) { struct usb_page_search buf_res; struct xhci_hw_root *phwr; uint64_t addr; uint32_t temp; DPRINTF("\n"); temp = XREAD4(sc, oper, XHCI_CRCR_LO); if (temp & XHCI_CRCR_LO_CRR) { DPRINTF("Command ring running\n"); temp &= ~(XHCI_CRCR_LO_CS | XHCI_CRCR_LO_CA); /* * Try to abort the last command as per section * 4.6.1.2 "Aborting a Command" of the XHCI * specification: */ /* stop and cancel */ XWRITE4(sc, oper, XHCI_CRCR_LO, temp | XHCI_CRCR_LO_CS); XWRITE4(sc, oper, XHCI_CRCR_HI, 0); XWRITE4(sc, oper, XHCI_CRCR_LO, temp | XHCI_CRCR_LO_CA); XWRITE4(sc, oper, XHCI_CRCR_HI, 0); /* wait 250ms */ usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 4); /* check if command ring is still running */ temp = XREAD4(sc, oper, XHCI_CRCR_LO); if (temp & XHCI_CRCR_LO_CRR) { DPRINTF("Comand ring still running\n"); return (USB_ERR_IOERROR); } } /* reset command ring */ sc->sc_command_ccs = 1; sc->sc_command_idx = 0; usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res); /* set up command ring control base address */ addr = buf_res.physaddr; phwr = buf_res.buffer; addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[0]; DPRINTF("CRCR=0x%016llx\n", (unsigned long long)addr); memset(phwr->hwr_commands, 0, sizeof(phwr->hwr_commands)); phwr->hwr_commands[XHCI_MAX_COMMANDS - 1].qwTrb0 = htole64(addr); usb_pc_cpu_flush(&sc->sc_hw.root_pc); XWRITE4(sc, oper, XHCI_CRCR_LO, ((uint32_t)addr) | XHCI_CRCR_LO_RCS); XWRITE4(sc, oper, XHCI_CRCR_HI, (uint32_t)(addr >> 32)); return (0); } usb_error_t xhci_start_controller(struct xhci_softc *sc) { struct usb_page_search buf_res; struct xhci_hw_root *phwr; struct xhci_dev_ctx_addr *pdctxa; usb_error_t err; uint64_t addr; uint32_t temp; uint16_t i; DPRINTF("\n"); sc->sc_event_ccs = 1; sc->sc_event_idx = 0; sc->sc_command_ccs = 1; sc->sc_command_idx = 0; err = xhci_reset_controller(sc); if (err) return (err); /* set up number of device slots */ DPRINTF("CONFIG=0x%08x -> 0x%08x\n", XREAD4(sc, oper, XHCI_CONFIG), sc->sc_noslot); XWRITE4(sc, oper, XHCI_CONFIG, sc->sc_noslot); temp = XREAD4(sc, oper, XHCI_USBSTS); /* clear interrupts */ XWRITE4(sc, oper, XHCI_USBSTS, temp); /* disable all device notifications */ XWRITE4(sc, oper, XHCI_DNCTRL, 0); /* set up device context base address */ usbd_get_page(&sc->sc_hw.ctx_pc, 0, &buf_res); pdctxa = buf_res.buffer; memset(pdctxa, 0, sizeof(*pdctxa)); addr = buf_res.physaddr; addr += (uintptr_t)&((struct xhci_dev_ctx_addr *)0)->qwSpBufPtr[0]; /* slot 0 points to the table of scratchpad pointers */ pdctxa->qwBaaDevCtxAddr[0] = htole64(addr); for (i = 0; i != sc->sc_noscratch; i++) { struct usb_page_search buf_scp; usbd_get_page(&sc->sc_hw.scratch_pc[i], 0, &buf_scp); pdctxa->qwSpBufPtr[i] = htole64((uint64_t)buf_scp.physaddr); } addr = buf_res.physaddr; XWRITE4(sc, oper, XHCI_DCBAAP_LO, (uint32_t)addr); XWRITE4(sc, oper, XHCI_DCBAAP_HI, (uint32_t)(addr >> 32)); XWRITE4(sc, oper, XHCI_DCBAAP_LO, (uint32_t)addr); XWRITE4(sc, oper, XHCI_DCBAAP_HI, (uint32_t)(addr >> 32)); /* set up event table size */ DPRINTF("ERSTSZ=0x%08x -> 0x%08x\n", XREAD4(sc, runt, XHCI_ERSTSZ(0)), sc->sc_erst_max); XWRITE4(sc, runt, XHCI_ERSTSZ(0), XHCI_ERSTS_SET(sc->sc_erst_max)); /* set up interrupt rate */ XWRITE4(sc, runt, XHCI_IMOD(0), sc->sc_imod_default); usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res); phwr = 
buf_res.buffer; addr = buf_res.physaddr; addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_events[0]; /* reset hardware root structure */ memset(phwr, 0, sizeof(*phwr)); phwr->hwr_ring_seg[0].qwEvrsTablePtr = htole64(addr); phwr->hwr_ring_seg[0].dwEvrsTableSize = htole32(XHCI_MAX_EVENTS); DPRINTF("ERDP(0)=0x%016llx\n", (unsigned long long)addr); XWRITE4(sc, runt, XHCI_ERDP_LO(0), (uint32_t)addr); XWRITE4(sc, runt, XHCI_ERDP_HI(0), (uint32_t)(addr >> 32)); addr = buf_res.physaddr; DPRINTF("ERSTBA(0)=0x%016llx\n", (unsigned long long)addr); XWRITE4(sc, runt, XHCI_ERSTBA_LO(0), (uint32_t)addr); XWRITE4(sc, runt, XHCI_ERSTBA_HI(0), (uint32_t)(addr >> 32)); /* set up interrupter registers */ temp = XREAD4(sc, runt, XHCI_IMAN(0)); temp |= XHCI_IMAN_INTR_ENA; XWRITE4(sc, runt, XHCI_IMAN(0), temp); /* set up command ring control base address */ addr = buf_res.physaddr; addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[0]; DPRINTF("CRCR=0x%016llx\n", (unsigned long long)addr); XWRITE4(sc, oper, XHCI_CRCR_LO, ((uint32_t)addr) | XHCI_CRCR_LO_RCS); XWRITE4(sc, oper, XHCI_CRCR_HI, (uint32_t)(addr >> 32)); phwr->hwr_commands[XHCI_MAX_COMMANDS - 1].qwTrb0 = htole64(addr); usb_bus_mem_flush_all(&sc->sc_bus, &xhci_iterate_hw_softc); /* Go! */ XWRITE4(sc, oper, XHCI_USBCMD, XHCI_CMD_RS | XHCI_CMD_INTE | XHCI_CMD_HSEE); for (i = 0; i != 100; i++) { usb_pause_mtx(NULL, hz / 100); temp = XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_HCH; if (!temp) break; } if (temp) { XWRITE4(sc, oper, XHCI_USBCMD, 0); device_printf(sc->sc_bus.parent, "Run timeout.\n"); return (USB_ERR_IOERROR); } /* catch any lost interrupts */ xhci_do_poll(&sc->sc_bus); if (sc->sc_port_route != NULL) { /* Route all ports to the XHCI by default */ sc->sc_port_route(sc->sc_bus.parent, ~xhciroute, xhciroute); } return (0); } usb_error_t xhci_halt_controller(struct xhci_softc *sc) { uint32_t temp; uint16_t i; DPRINTF("\n"); sc->sc_capa_off = 0; sc->sc_oper_off = XREAD1(sc, capa, XHCI_CAPLENGTH); sc->sc_runt_off = XREAD4(sc, capa, XHCI_RTSOFF) & ~0xF; sc->sc_door_off = XREAD4(sc, capa, XHCI_DBOFF) & ~0x3; /* Halt controller */ XWRITE4(sc, oper, XHCI_USBCMD, 0); for (i = 0; i != 100; i++) { usb_pause_mtx(NULL, hz / 100); temp = XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_HCH; if (temp) break; } if (!temp) { device_printf(sc->sc_bus.parent, "Controller halt timeout.\n"); return (USB_ERR_IOERROR); } return (0); } usb_error_t xhci_reset_controller(struct xhci_softc *sc) { uint32_t temp = 0; uint16_t i; DPRINTF("\n"); /* Reset controller */ XWRITE4(sc, oper, XHCI_USBCMD, XHCI_CMD_HCRST); for (i = 0; i != 100; i++) { usb_pause_mtx(NULL, hz / 100); temp = (XREAD4(sc, oper, XHCI_USBCMD) & XHCI_CMD_HCRST) | (XREAD4(sc, oper, XHCI_USBSTS) & XHCI_STS_CNR); if (!temp) break; } if (temp) { device_printf(sc->sc_bus.parent, "Controller " "reset timeout.\n"); return (USB_ERR_IOERROR); } return (0); } usb_error_t xhci_init(struct xhci_softc *sc, device_t self, uint8_t dma32) { uint32_t temp; DPRINTF("\n"); /* initialize some bus fields */ sc->sc_bus.parent = self; /* set the bus revision */ sc->sc_bus.usbrev = USB_REV_3_0; /* set up the bus struct */ sc->sc_bus.methods = &xhci_bus_methods; /* set up devices array */ sc->sc_bus.devices = sc->sc_devices; sc->sc_bus.devices_max = XHCI_MAX_DEVICES; /* set default cycle state in case of early interrupts */ sc->sc_event_ccs = 1; sc->sc_command_ccs = 1; /* set up bus space offsets */ sc->sc_capa_off = 0; sc->sc_oper_off = XREAD1(sc, capa, XHCI_CAPLENGTH); sc->sc_runt_off = XREAD4(sc, capa, XHCI_RTSOFF) & ~0x1F; 
	sc->sc_door_off = XREAD4(sc, capa, XHCI_DBOFF) & ~0x3;

	DPRINTF("CAPLENGTH=0x%x\n", sc->sc_oper_off);
	DPRINTF("RUNTIMEOFFSET=0x%x\n", sc->sc_runt_off);
	DPRINTF("DOOROFFSET=0x%x\n", sc->sc_door_off);

	DPRINTF("xHCI version = 0x%04x\n", XREAD2(sc, capa, XHCI_HCIVERSION));

	if (!(XREAD4(sc, oper, XHCI_PAGESIZE) & XHCI_PAGESIZE_4K)) {
		device_printf(sc->sc_bus.parent, "Controller does "
		    "not support 4K page size.\n");
		return (ENXIO);
	}

	temp = XREAD4(sc, capa, XHCI_HCSPARAMS0);

	DPRINTF("HCS0 = 0x%08x\n", temp);

	/* set up context size */
	if (XHCI_HCS0_CSZ(temp)) {
		sc->sc_ctx_is_64_byte = 1;
	} else {
		sc->sc_ctx_is_64_byte = 0;
	}

	/* get DMA bits */
	sc->sc_bus.dma_bits = (XHCI_HCS0_AC64(temp) &&
	    xhcidma32 == 0 && dma32 == 0) ? 64 : 32;

	device_printf(self, "%d bytes context size, %d-bit DMA\n",
	    sc->sc_ctx_is_64_byte ? 64 : 32, (int)sc->sc_bus.dma_bits);

+	/* enable 64Kbyte control endpoint quirk */
+	sc->sc_bus.control_ep_quirk = 1;
+
	temp = XREAD4(sc, capa, XHCI_HCSPARAMS1);

	/* get number of device slots */
	sc->sc_noport = XHCI_HCS1_N_PORTS(temp);

	if (sc->sc_noport == 0) {
		device_printf(sc->sc_bus.parent, "Invalid number "
		    "of ports: %u\n", sc->sc_noport);
		return (ENXIO);
	}

	sc->sc_noport = sc->sc_noport;
	sc->sc_noslot = XHCI_HCS1_DEVSLOT_MAX(temp);

	DPRINTF("Max slots: %u\n", sc->sc_noslot);

	if (sc->sc_noslot > XHCI_MAX_DEVICES)
		sc->sc_noslot = XHCI_MAX_DEVICES;

	temp = XREAD4(sc, capa, XHCI_HCSPARAMS2);

	DPRINTF("HCS2=0x%08x\n", temp);

	/* get number of scratchpads */
	sc->sc_noscratch = XHCI_HCS2_SPB_MAX(temp);

	if (sc->sc_noscratch > XHCI_MAX_SCRATCHPADS) {
		device_printf(sc->sc_bus.parent, "XHCI request "
		    "too many scratchpads\n");
		return (ENOMEM);
	}

	DPRINTF("Max scratch: %u\n", sc->sc_noscratch);

	/* get event table size */
	sc->sc_erst_max = 1U << XHCI_HCS2_ERST_MAX(temp);
	if (sc->sc_erst_max > XHCI_MAX_RSEG)
		sc->sc_erst_max = XHCI_MAX_RSEG;

	temp = XREAD4(sc, capa, XHCI_HCSPARAMS3);

	/* get maximum exit latency */
	sc->sc_exit_lat_max = XHCI_HCS3_U1_DEL(temp) +
	    XHCI_HCS3_U2_DEL(temp) + 250 /* us */;

	/* Check if we should use the default IMOD value. */
	if (sc->sc_imod_default == 0)
		sc->sc_imod_default = XHCI_IMOD_DEFAULT;

	/* get all DMA memory */
	if (usb_bus_mem_alloc_all(&sc->sc_bus, USB_GET_DMA_TAG(self),
	    &xhci_iterate_hw_softc)) {
		return (ENOMEM);
	}

	/* set up command queue mutex and condition variable */
	cv_init(&sc->sc_cmd_cv, "CMDQ");
	sx_init(&sc->sc_cmd_sx, "CMDQ lock");

	sc->sc_config_msg[0].hdr.pm_callback = &xhci_configure_msg;
	sc->sc_config_msg[0].bus = &sc->sc_bus;
	sc->sc_config_msg[1].hdr.pm_callback = &xhci_configure_msg;
	sc->sc_config_msg[1].bus = &sc->sc_bus;

	return (0);
}

void
xhci_uninit(struct xhci_softc *sc)
{
	/*
	 * NOTE: At this point the control transfer process is gone
	 * and "xhci_configure_msg" is no longer called. Consequently
	 * waiting for the configuration messages to complete is not
	 * needed.
*/ usb_bus_mem_free_all(&sc->sc_bus, &xhci_iterate_hw_softc); cv_destroy(&sc->sc_cmd_cv); sx_destroy(&sc->sc_cmd_sx); } static void xhci_set_hw_power_sleep(struct usb_bus *bus, uint32_t state) { struct xhci_softc *sc = XHCI_BUS2SC(bus); switch (state) { case USB_HW_POWER_SUSPEND: DPRINTF("Stopping the XHCI\n"); xhci_halt_controller(sc); xhci_reset_controller(sc); break; case USB_HW_POWER_SHUTDOWN: DPRINTF("Stopping the XHCI\n"); xhci_halt_controller(sc); xhci_reset_controller(sc); break; case USB_HW_POWER_RESUME: DPRINTF("Starting the XHCI\n"); xhci_start_controller(sc); break; default: break; } } static usb_error_t xhci_generic_done_sub(struct usb_xfer *xfer) { struct xhci_td *td; struct xhci_td *td_alt_next; uint32_t len; uint8_t status; td = xfer->td_transfer_cache; td_alt_next = td->alt_next; if (xfer->aframes != xfer->nframes) usbd_xfer_set_frame_len(xfer, xfer->aframes, 0); while (1) { usb_pc_cpu_invalidate(td->page_cache); status = td->status; len = td->remainder; DPRINTFN(4, "xfer=%p[%u/%u] rem=%u/%u status=%u\n", xfer, (unsigned int)xfer->aframes, (unsigned int)xfer->nframes, (unsigned int)len, (unsigned int)td->len, (unsigned int)status); /* * Verify the status length and * add the length to "frlengths[]": */ if (len > td->len) { /* should not happen */ DPRINTF("Invalid status length, " "0x%04x/0x%04x bytes\n", len, td->len); status = XHCI_TRB_ERROR_LENGTH; } else if (xfer->aframes != xfer->nframes) { xfer->frlengths[xfer->aframes] += td->len - len; } /* Check for last transfer */ if (((void *)td) == xfer->td_transfer_last) { td = NULL; break; } /* Check for transfer error */ if (status != XHCI_TRB_ERROR_SHORT_PKT && status != XHCI_TRB_ERROR_SUCCESS) { /* the transfer is finished */ td = NULL; break; } /* Check for short transfer */ if (len > 0) { if (xfer->flags_int.short_frames_ok || xfer->flags_int.isochronous_xfr || xfer->flags_int.control_xfr) { /* follow alt next */ td = td->alt_next; } else { /* the transfer is finished */ td = NULL; } break; } td = td->obj_next; if (td->alt_next != td_alt_next) { /* this USB frame is complete */ break; } } /* update transfer cache */ xfer->td_transfer_cache = td; return ((status == XHCI_TRB_ERROR_STALL) ? USB_ERR_STALLED : (status != XHCI_TRB_ERROR_SHORT_PKT && status != XHCI_TRB_ERROR_SUCCESS) ? 
USB_ERR_IOERROR : USB_ERR_NORMAL_COMPLETION); } static void xhci_generic_done(struct usb_xfer *xfer) { usb_error_t err = 0; DPRINTFN(13, "xfer=%p endpoint=%p transfer done\n", xfer, xfer->endpoint); /* reset scanner */ xfer->td_transfer_cache = xfer->td_transfer_first; if (xfer->flags_int.control_xfr) { if (xfer->flags_int.control_hdr) err = xhci_generic_done_sub(xfer); xfer->aframes = 1; if (xfer->td_transfer_cache == NULL) goto done; } while (xfer->aframes != xfer->nframes) { err = xhci_generic_done_sub(xfer); xfer->aframes++; if (xfer->td_transfer_cache == NULL) goto done; } if (xfer->flags_int.control_xfr && !xfer->flags_int.control_act) err = xhci_generic_done_sub(xfer); done: /* transfer is complete */ xhci_device_done(xfer, err); } static void xhci_activate_transfer(struct usb_xfer *xfer) { struct xhci_td *td; td = xfer->td_transfer_cache; usb_pc_cpu_invalidate(td->page_cache); if (!(td->td_trb[0].dwTrb3 & htole32(XHCI_TRB_3_CYCLE_BIT))) { /* activate the transfer */ td->td_trb[0].dwTrb3 |= htole32(XHCI_TRB_3_CYCLE_BIT); usb_pc_cpu_flush(td->page_cache); xhci_endpoint_doorbell(xfer); } } static void xhci_skip_transfer(struct usb_xfer *xfer) { struct xhci_td *td; struct xhci_td *td_last; td = xfer->td_transfer_cache; td_last = xfer->td_transfer_last; td = td->alt_next; usb_pc_cpu_invalidate(td->page_cache); if (!(td->td_trb[0].dwTrb3 & htole32(XHCI_TRB_3_CYCLE_BIT))) { usb_pc_cpu_invalidate(td_last->page_cache); /* copy LINK TRB to current waiting location */ td->td_trb[0].qwTrb0 = td_last->td_trb[td_last->ntrb].qwTrb0; td->td_trb[0].dwTrb2 = td_last->td_trb[td_last->ntrb].dwTrb2; usb_pc_cpu_flush(td->page_cache); td->td_trb[0].dwTrb3 = td_last->td_trb[td_last->ntrb].dwTrb3; usb_pc_cpu_flush(td->page_cache); xhci_endpoint_doorbell(xfer); } } /*------------------------------------------------------------------------* * xhci_check_transfer *------------------------------------------------------------------------*/ static void xhci_check_transfer(struct xhci_softc *sc, struct xhci_trb *trb) { struct xhci_endpoint_ext *pepext; int64_t offset; uint64_t td_event; uint32_t temp; uint32_t remainder; uint16_t stream_id = 0; uint16_t i; uint8_t status; uint8_t halted; uint8_t epno; uint8_t index; /* decode TRB */ td_event = le64toh(trb->qwTrb0); temp = le32toh(trb->dwTrb2); remainder = XHCI_TRB_2_REM_GET(temp); status = XHCI_TRB_2_ERROR_GET(temp); temp = le32toh(trb->dwTrb3); epno = XHCI_TRB_3_EP_GET(temp); index = XHCI_TRB_3_SLOT_GET(temp); /* check if error means halted */ halted = (status != XHCI_TRB_ERROR_SHORT_PKT && status != XHCI_TRB_ERROR_SUCCESS); DPRINTF("slot=%u epno=%u remainder=%u status=%u\n", index, epno, remainder, status); if (index > sc->sc_noslot) { DPRINTF("Invalid slot.\n"); return; } if ((epno == 0) || (epno >= XHCI_MAX_ENDPOINTS)) { DPRINTF("Invalid endpoint.\n"); return; } pepext = &sc->sc_hw.devs[index].endp[epno]; /* try to find the USB transfer that generated the event */ for (i = 0;; i++) { struct usb_xfer *xfer; struct xhci_td *td; if (i == (XHCI_MAX_TRANSFERS - 1)) { if (pepext->trb_ep_mode != USB_EP_MODE_STREAMS || stream_id == (XHCI_MAX_STREAMS - 1)) break; stream_id++; i = 0; DPRINTFN(5, "stream_id=%u\n", stream_id); } xfer = pepext->xfer[i + (XHCI_MAX_TRANSFERS * stream_id)]; if (xfer == NULL) continue; td = xfer->td_transfer_cache; DPRINTFN(5, "Checking if 0x%016llx == (0x%016llx .. 
0x%016llx)\n", (long long)td_event, (long long)td->td_self, (long long)td->td_self + sizeof(td->td_trb)); /* * NOTE: Some XHCI implementations might not trigger * an event on the last LINK TRB so we need to * consider both the last and second last event * address as conditions for a successful transfer. * * NOTE: We assume that the XHCI will only trigger one * event per chain of TRBs. */ offset = td_event - td->td_self; if (offset >= 0 && offset < (int64_t)sizeof(td->td_trb)) { usb_pc_cpu_invalidate(td->page_cache); /* compute rest of remainder, if any */ for (i = (offset / 16) + 1; i < td->ntrb; i++) { temp = le32toh(td->td_trb[i].dwTrb2); remainder += XHCI_TRB_2_BYTES_GET(temp); } DPRINTFN(5, "New remainder: %u\n", remainder); /* clear isochronous transfer errors */ if (xfer->flags_int.isochronous_xfr) { if (halted) { halted = 0; status = XHCI_TRB_ERROR_SUCCESS; remainder = td->len; } } /* "td->remainder" is verified later */ td->remainder = remainder; td->status = status; usb_pc_cpu_flush(td->page_cache); /* * 1) Last transfer descriptor makes the * transfer done */ if (((void *)td) == xfer->td_transfer_last) { DPRINTF("TD is last\n"); xhci_generic_done(xfer); break; } /* * 2) Any kind of error makes the transfer * done */ if (halted) { DPRINTF("TD has I/O error\n"); xhci_generic_done(xfer); break; } /* * 3) If there is no alternate next transfer, * a short packet also makes the transfer done */ if (td->remainder > 0) { if (td->alt_next == NULL) { DPRINTF( "short TD has no alternate next\n"); xhci_generic_done(xfer); break; } DPRINTF("TD has short pkt\n"); if (xfer->flags_int.short_frames_ok || xfer->flags_int.isochronous_xfr || xfer->flags_int.control_xfr) { /* follow the alt next */ xfer->td_transfer_cache = td->alt_next; xhci_activate_transfer(xfer); break; } xhci_skip_transfer(xfer); xhci_generic_done(xfer); break; } /* * 4) Transfer complete - go to next TD */ DPRINTF("Following next TD\n"); xfer->td_transfer_cache = td->obj_next; xhci_activate_transfer(xfer); break; /* there should only be one match */ } } } static int xhci_check_command(struct xhci_softc *sc, struct xhci_trb *trb) { if (sc->sc_cmd_addr == trb->qwTrb0) { DPRINTF("Received command event\n"); sc->sc_cmd_result[0] = trb->dwTrb2; sc->sc_cmd_result[1] = trb->dwTrb3; cv_signal(&sc->sc_cmd_cv); return (1); /* command match */ } return (0); } static int xhci_interrupt_poll(struct xhci_softc *sc) { struct usb_page_search buf_res; struct xhci_hw_root *phwr; uint64_t addr; uint32_t temp; int retval = 0; uint16_t i; uint8_t event; uint8_t j; uint8_t k; uint8_t t; usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res); phwr = buf_res.buffer; /* Receive any events */ usb_pc_cpu_invalidate(&sc->sc_hw.root_pc); i = sc->sc_event_idx; j = sc->sc_event_ccs; t = 2; while (1) { temp = le32toh(phwr->hwr_events[i].dwTrb3); k = (temp & XHCI_TRB_3_CYCLE_BIT) ? 
1 : 0; if (j != k) break; event = XHCI_TRB_3_TYPE_GET(temp); DPRINTFN(10, "event[%u] = %u (0x%016llx 0x%08lx 0x%08lx)\n", i, event, (long long)le64toh(phwr->hwr_events[i].qwTrb0), (long)le32toh(phwr->hwr_events[i].dwTrb2), (long)le32toh(phwr->hwr_events[i].dwTrb3)); switch (event) { case XHCI_TRB_EVENT_TRANSFER: xhci_check_transfer(sc, &phwr->hwr_events[i]); break; case XHCI_TRB_EVENT_CMD_COMPLETE: retval |= xhci_check_command(sc, &phwr->hwr_events[i]); break; default: DPRINTF("Unhandled event = %u\n", event); break; } i++; if (i == XHCI_MAX_EVENTS) { i = 0; j ^= 1; /* check for timeout */ if (!--t) break; } } sc->sc_event_idx = i; sc->sc_event_ccs = j; /* * NOTE: The Event Ring Dequeue Pointer Register is 64-bit * latched. That means to activate the register we need to * write both the low and high double word of the 64-bit * register. */ addr = buf_res.physaddr; addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_events[i]; /* try to clear busy bit */ addr |= XHCI_ERDP_LO_BUSY; XWRITE4(sc, runt, XHCI_ERDP_LO(0), (uint32_t)addr); XWRITE4(sc, runt, XHCI_ERDP_HI(0), (uint32_t)(addr >> 32)); return (retval); } static usb_error_t xhci_do_command(struct xhci_softc *sc, struct xhci_trb *trb, uint16_t timeout_ms) { struct usb_page_search buf_res; struct xhci_hw_root *phwr; uint64_t addr; uint32_t temp; uint8_t i; uint8_t j; uint8_t timeout = 0; int err; XHCI_CMD_ASSERT_LOCKED(sc); /* get hardware root structure */ usbd_get_page(&sc->sc_hw.root_pc, 0, &buf_res); phwr = buf_res.buffer; /* Queue command */ USB_BUS_LOCK(&sc->sc_bus); retry: i = sc->sc_command_idx; j = sc->sc_command_ccs; DPRINTFN(10, "command[%u] = %u (0x%016llx, 0x%08lx, 0x%08lx)\n", i, XHCI_TRB_3_TYPE_GET(le32toh(trb->dwTrb3)), (long long)le64toh(trb->qwTrb0), (long)le32toh(trb->dwTrb2), (long)le32toh(trb->dwTrb3)); phwr->hwr_commands[i].qwTrb0 = trb->qwTrb0; phwr->hwr_commands[i].dwTrb2 = trb->dwTrb2; usb_pc_cpu_flush(&sc->sc_hw.root_pc); temp = trb->dwTrb3; if (j) temp |= htole32(XHCI_TRB_3_CYCLE_BIT); else temp &= ~htole32(XHCI_TRB_3_CYCLE_BIT); temp &= ~htole32(XHCI_TRB_3_TC_BIT); phwr->hwr_commands[i].dwTrb3 = temp; usb_pc_cpu_flush(&sc->sc_hw.root_pc); addr = buf_res.physaddr; addr += (uintptr_t)&((struct xhci_hw_root *)0)->hwr_commands[i]; sc->sc_cmd_addr = htole64(addr); i++; if (i == (XHCI_MAX_COMMANDS - 1)) { if (j) { temp = htole32(XHCI_TRB_3_TC_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK) | XHCI_TRB_3_CYCLE_BIT); } else { temp = htole32(XHCI_TRB_3_TC_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK)); } phwr->hwr_commands[i].dwTrb3 = temp; usb_pc_cpu_flush(&sc->sc_hw.root_pc); i = 0; j ^= 1; } sc->sc_command_idx = i; sc->sc_command_ccs = j; XWRITE4(sc, door, XHCI_DOORBELL(0), 0); err = cv_timedwait(&sc->sc_cmd_cv, &sc->sc_bus.bus_mtx, USB_MS_TO_TICKS(timeout_ms)); /* * In some error cases event interrupts are not generated. * Poll one time to see if the command has completed. */ if (err != 0 && xhci_interrupt_poll(sc) != 0) { DPRINTF("Command was completed when polling\n"); err = 0; } if (err != 0) { DPRINTF("Command timeout!\n"); /* * After some weeks of continuous operation, it has * been observed that the ASMedia Technology, ASM1042 * SuperSpeed USB Host Controller can suddenly stop * accepting commands via the command queue. Try to * first reset the command queue. If that fails do a * host controller reset. 
*/ if (timeout == 0 && xhci_reset_command_queue_locked(sc) == 0) { temp = le32toh(trb->dwTrb3); /* * Avoid infinite XHCI reset loops if the set * address command fails to respond due to a * non-enumerating device: */ if (XHCI_TRB_3_TYPE_GET(temp) == XHCI_TRB_TYPE_ADDRESS_DEVICE && (temp & XHCI_TRB_3_BSR_BIT) == 0) { DPRINTF("Set address timeout\n"); } else { timeout = 1; goto retry; } } else { DPRINTF("Controller reset!\n"); usb_bus_reset_async_locked(&sc->sc_bus); } err = USB_ERR_TIMEOUT; trb->dwTrb2 = 0; trb->dwTrb3 = 0; } else { temp = le32toh(sc->sc_cmd_result[0]); if (XHCI_TRB_2_ERROR_GET(temp) != XHCI_TRB_ERROR_SUCCESS) err = USB_ERR_IOERROR; trb->dwTrb2 = sc->sc_cmd_result[0]; trb->dwTrb3 = sc->sc_cmd_result[1]; } USB_BUS_UNLOCK(&sc->sc_bus); return (err); } #if 0 static usb_error_t xhci_cmd_nop(struct xhci_softc *sc) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NOOP); trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } #endif static usb_error_t xhci_cmd_enable_slot(struct xhci_softc *sc, uint8_t *pslot) { struct xhci_trb trb; uint32_t temp; usb_error_t err; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; trb.dwTrb3 = htole32(XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ENABLE_SLOT)); err = xhci_do_command(sc, &trb, 100 /* ms */); if (err) goto done; temp = le32toh(trb.dwTrb3); *pslot = XHCI_TRB_3_SLOT_GET(temp); done: return (err); } static usb_error_t xhci_cmd_disable_slot(struct xhci_softc *sc, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_DISABLE_SLOT) | XHCI_TRB_3_SLOT_SET(slot_id); trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_set_address(struct xhci_softc *sc, uint64_t input_ctx, uint8_t bsr, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = htole64(input_ctx); trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ADDRESS_DEVICE) | XHCI_TRB_3_SLOT_SET(slot_id); if (bsr) temp |= XHCI_TRB_3_BSR_BIT; trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 500 /* ms */)); } static usb_error_t xhci_set_address(struct usb_device *udev, struct mtx *mtx, uint16_t address) { struct usb_page_search buf_inp; struct usb_page_search buf_dev; struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct xhci_hw_dev *hdev; struct xhci_dev_ctx *pdev; struct xhci_endpoint_ext *pepext; uint32_t temp; uint16_t mps; usb_error_t err; uint8_t index; /* the root HUB case is not handled here */ if (udev->parent_hub == NULL) return (USB_ERR_INVAL); index = udev->controller_slot_id; hdev = &sc->sc_hw.devs[index]; if (mtx != NULL) mtx_unlock(mtx); XHCI_CMD_LOCK(sc); switch (hdev->state) { case XHCI_ST_DEFAULT: case XHCI_ST_ENABLED: hdev->state = XHCI_ST_ENABLED; /* set configure mask to slot and EP0 */ xhci_configure_mask(udev, 3, 0); /* configure input slot context structure */ err = xhci_configure_device(udev); if (err != 0) { DPRINTF("Could not configure device\n"); break; } /* configure input endpoint context structure */ switch (udev->speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: mps = 8; break; case USB_SPEED_HIGH: mps = 64; break; default: mps = 512; break; } pepext = xhci_get_endpoint_ext(udev, &udev->ctrl_ep_desc); /* ensure the control endpoint is setup again */ USB_BUS_LOCK(udev->bus); pepext->trb_halted = 1; pepext->trb_running = 0; USB_BUS_UNLOCK(udev->bus); err = xhci_configure_endpoint(udev, 
&udev->ctrl_ep_desc, pepext, 0, 1, 1, 0, mps, mps, USB_EP_MODE_DEFAULT); if (err != 0) { DPRINTF("Could not configure default endpoint\n"); break; } /* execute set address command */ usbd_get_page(&hdev->input_pc, 0, &buf_inp); err = xhci_cmd_set_address(sc, buf_inp.physaddr, (address == 0), index); if (err != 0) { temp = le32toh(sc->sc_cmd_result[0]); if (address == 0 && sc->sc_port_route != NULL && XHCI_TRB_2_ERROR_GET(temp) == XHCI_TRB_ERROR_PARAMETER) { /* LynxPoint XHCI - ports are not switchable */ /* Un-route all ports from the XHCI */ sc->sc_port_route(sc->sc_bus.parent, 0, ~0); } DPRINTF("Could not set address " "for slot %u.\n", index); if (address != 0) break; } /* update device address to new value */ usbd_get_page(&hdev->device_pc, 0, &buf_dev); pdev = buf_dev.buffer; usb_pc_cpu_invalidate(&hdev->device_pc); temp = xhci_ctx_get_le32(sc, &pdev->ctx_slot.dwSctx3); udev->address = XHCI_SCTX_3_DEV_ADDR_GET(temp); /* update device state to new value */ if (address != 0) hdev->state = XHCI_ST_ADDRESSED; else hdev->state = XHCI_ST_DEFAULT; break; default: DPRINTF("Wrong state for set address.\n"); err = USB_ERR_IOERROR; break; } XHCI_CMD_UNLOCK(sc); if (mtx != NULL) mtx_lock(mtx); return (err); } static usb_error_t xhci_cmd_configure_ep(struct xhci_softc *sc, uint64_t input_ctx, uint8_t deconfigure, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = htole64(input_ctx); trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_CONFIGURE_EP) | XHCI_TRB_3_SLOT_SET(slot_id); if (deconfigure) temp |= XHCI_TRB_3_DCEP_BIT; trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_evaluate_ctx(struct xhci_softc *sc, uint64_t input_ctx, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = htole64(input_ctx); trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_EVALUATE_CTX) | XHCI_TRB_3_SLOT_SET(slot_id); trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_reset_ep(struct xhci_softc *sc, uint8_t preserve, uint8_t ep_id, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_RESET_EP) | XHCI_TRB_3_SLOT_SET(slot_id) | XHCI_TRB_3_EP_SET(ep_id); if (preserve) temp |= XHCI_TRB_3_PRSV_BIT; trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_set_tr_dequeue_ptr(struct xhci_softc *sc, uint64_t dequeue_ptr, uint16_t stream_id, uint8_t ep_id, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = htole64(dequeue_ptr); temp = XHCI_TRB_2_STREAM_SET(stream_id); trb.dwTrb2 = htole32(temp); temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_SET_TR_DEQUEUE) | XHCI_TRB_3_SLOT_SET(slot_id) | XHCI_TRB_3_EP_SET(ep_id); trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_stop_ep(struct xhci_softc *sc, uint8_t suspend, uint8_t ep_id, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; temp = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_STOP_EP) | XHCI_TRB_3_SLOT_SET(slot_id) | XHCI_TRB_3_EP_SET(ep_id); if (suspend) temp |= XHCI_TRB_3_SUSP_EP_BIT; trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } static usb_error_t xhci_cmd_reset_dev(struct xhci_softc *sc, uint8_t slot_id) { struct xhci_trb trb; uint32_t temp; DPRINTF("\n"); trb.qwTrb0 = 0; trb.dwTrb2 = 0; temp = 
XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_RESET_DEVICE) | XHCI_TRB_3_SLOT_SET(slot_id); trb.dwTrb3 = htole32(temp); return (xhci_do_command(sc, &trb, 100 /* ms */)); } /*------------------------------------------------------------------------* * xhci_interrupt - XHCI interrupt handler *------------------------------------------------------------------------*/ void xhci_interrupt(struct xhci_softc *sc) { uint32_t status; uint32_t temp; USB_BUS_LOCK(&sc->sc_bus); status = XREAD4(sc, oper, XHCI_USBSTS); /* acknowledge interrupts, if any */ if (status != 0) { XWRITE4(sc, oper, XHCI_USBSTS, status); DPRINTFN(16, "real interrupt (status=0x%08x)\n", status); } temp = XREAD4(sc, runt, XHCI_IMAN(0)); /* force clearing of pending interrupts */ if (temp & XHCI_IMAN_INTR_PEND) XWRITE4(sc, runt, XHCI_IMAN(0), temp); /* check for event(s) */ xhci_interrupt_poll(sc); if (status & (XHCI_STS_PCD | XHCI_STS_HCH | XHCI_STS_HSE | XHCI_STS_HCE)) { if (status & XHCI_STS_PCD) { xhci_root_intr(sc); } if (status & XHCI_STS_HCH) { printf("%s: host controller halted\n", __FUNCTION__); } if (status & XHCI_STS_HSE) { printf("%s: host system error\n", __FUNCTION__); } if (status & XHCI_STS_HCE) { printf("%s: host controller error\n", __FUNCTION__); } } USB_BUS_UNLOCK(&sc->sc_bus); } /*------------------------------------------------------------------------* * xhci_timeout - XHCI timeout handler *------------------------------------------------------------------------*/ static void xhci_timeout(void *arg) { struct usb_xfer *xfer = arg; DPRINTF("xfer=%p\n", xfer); USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); /* transfer is transferred */ xhci_device_done(xfer, USB_ERR_TIMEOUT); } static void xhci_do_poll(struct usb_bus *bus) { struct xhci_softc *sc = XHCI_BUS2SC(bus); USB_BUS_LOCK(&sc->sc_bus); xhci_interrupt_poll(sc); USB_BUS_UNLOCK(&sc->sc_bus); } static void xhci_setup_generic_chain_sub(struct xhci_std_temp *temp) { struct usb_page_search buf_res; struct xhci_td *td; struct xhci_td *td_next; struct xhci_td *td_alt_next; struct xhci_td *td_first; uint32_t buf_offset; uint32_t average; uint32_t len_old; uint32_t npkt_off; uint32_t dword; uint8_t shortpkt_old; uint8_t precompute; uint8_t x; td_alt_next = NULL; buf_offset = 0; shortpkt_old = temp->shortpkt; len_old = temp->len; npkt_off = 0; precompute = 1; restart: td = temp->td; td_next = td_first = temp->td_next; while (1) { if (temp->len == 0) { if (temp->shortpkt) break; /* send a Zero Length Packet, ZLP, last */ temp->shortpkt = 1; average = 0; } else { average = temp->average; if (temp->len < average) { if (temp->len % temp->max_packet_size) { temp->shortpkt = 1; } average = temp->len; } } if (td_next == NULL) panic("%s: out of XHCI transfer descriptors!", __FUNCTION__); /* get next TD */ td = td_next; td_next = td->obj_next; /* check if we are pre-computing */ if (precompute) { /* update remaining length */ temp->len -= average; continue; } /* fill out current TD */ td->len = average; td->remainder = 0; td->status = 0; /* update remaining length */ temp->len -= average; /* reset TRB index */ x = 0; if (temp->trb_type == XHCI_TRB_TYPE_SETUP_STAGE) { /* immediate data */ if (average > 8) average = 8; td->td_trb[0].qwTrb0 = 0; usbd_copy_out(temp->pc, temp->offset + buf_offset, (uint8_t *)(uintptr_t)&td->td_trb[0].qwTrb0, average); dword = XHCI_TRB_2_BYTES_SET(8) | XHCI_TRB_2_TDSZ_SET(0) | XHCI_TRB_2_IRQ_SET(0); td->td_trb[0].dwTrb2 = htole32(dword); dword = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_SETUP_STAGE) | XHCI_TRB_3_IDT_BIT | XHCI_TRB_3_CYCLE_BIT; /* check wLength */ if 
(td->td_trb[0].qwTrb0 & htole64(XHCI_TRB_0_WLENGTH_MASK)) { if (td->td_trb[0].qwTrb0 & htole64(XHCI_TRB_0_DIR_IN_MASK)) dword |= XHCI_TRB_3_TRT_IN; else dword |= XHCI_TRB_3_TRT_OUT; } td->td_trb[0].dwTrb3 = htole32(dword); #ifdef USB_DEBUG xhci_dump_trb(&td->td_trb[x]); #endif x++; } else do { uint32_t npkt; /* fill out buffer pointers */ if (average == 0) { memset(&buf_res, 0, sizeof(buf_res)); } else { usbd_get_page(temp->pc, temp->offset + buf_offset, &buf_res); /* get length to end of page */ if (buf_res.length > average) buf_res.length = average; /* check for maximum length */ if (buf_res.length > XHCI_TD_PAGE_SIZE) buf_res.length = XHCI_TD_PAGE_SIZE; npkt_off += buf_res.length; } /* set up npkt */ npkt = howmany(len_old - npkt_off, temp->max_packet_size); if (npkt == 0) npkt = 1; else if (npkt > 31) npkt = 31; /* fill out TRB's */ td->td_trb[x].qwTrb0 = htole64((uint64_t)buf_res.physaddr); dword = XHCI_TRB_2_BYTES_SET(buf_res.length) | XHCI_TRB_2_TDSZ_SET(npkt) | XHCI_TRB_2_IRQ_SET(0); td->td_trb[x].dwTrb2 = htole32(dword); switch (temp->trb_type) { case XHCI_TRB_TYPE_ISOCH: dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TBC_SET(temp->tbc) | XHCI_TRB_3_TLBPC_SET(temp->tlbpc); if (td != td_first) { dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NORMAL); } else if (temp->do_isoc_sync != 0) { temp->do_isoc_sync = 0; /* wait until "isoc_frame" */ dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ISOCH) | XHCI_TRB_3_FRID_SET(temp->isoc_frame / 8); } else { /* start data transfer at next interval */ dword |= XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_ISOCH) | XHCI_TRB_3_ISO_SIA_BIT; } if (temp->direction == UE_DIR_IN) dword |= XHCI_TRB_3_ISP_BIT; break; case XHCI_TRB_TYPE_DATA_STAGE: dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_DATA_STAGE); if (temp->direction == UE_DIR_IN) dword |= XHCI_TRB_3_DIR_IN | XHCI_TRB_3_ISP_BIT; /* * Section 3.2.9 in the XHCI * specification about control * transfers says that we should use a * normal-TRB if there are more TRBs * extending the data-stage * TRB. Update the "trb_type". */ temp->trb_type = XHCI_TRB_TYPE_NORMAL; break; case XHCI_TRB_TYPE_STATUS_STAGE: dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_STATUS_STAGE); if (temp->direction == UE_DIR_IN) dword |= XHCI_TRB_3_DIR_IN; break; default: /* XHCI_TRB_TYPE_NORMAL */ dword = XHCI_TRB_3_CHAIN_BIT | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_NORMAL); if (temp->direction == UE_DIR_IN) dword |= XHCI_TRB_3_ISP_BIT; break; } td->td_trb[x].dwTrb3 = htole32(dword); average -= buf_res.length; buf_offset += buf_res.length; #ifdef USB_DEBUG xhci_dump_trb(&td->td_trb[x]); #endif x++; } while (average != 0); td->td_trb[x-1].dwTrb3 |= htole32(XHCI_TRB_3_IOC_BIT); /* store number of data TRB's */ td->ntrb = x; DPRINTF("NTRB=%u\n", x); /* fill out link TRB */ if (td_next != NULL) { /* link the current TD with the next one */ td->td_trb[x].qwTrb0 = htole64((uint64_t)td_next->td_self); DPRINTF("LINK=0x%08llx\n", (long long)td_next->td_self); } else { /* this field will get updated later */ DPRINTF("NOLINK\n"); } dword = XHCI_TRB_2_IRQ_SET(0); td->td_trb[x].dwTrb2 = htole32(dword); dword = XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK) | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_IOC_BIT | /* * CHAIN-BIT: Ensure that a multi-TRB IN-endpoint * frame only receives a single short packet event * by setting the CHAIN bit in the LINK field. 
		     * In addition some XHCI controllers have problems
		     * sending a ZLP unless the CHAIN-BIT is set in
		     * the LINK TRB.
		     */
		    XHCI_TRB_3_CHAIN_BIT;

		td->td_trb[x].dwTrb3 = htole32(dword);

		td->alt_next = td_alt_next;
#ifdef USB_DEBUG
		xhci_dump_trb(&td->td_trb[x]);
#endif
		usb_pc_cpu_flush(td->page_cache);
	}

	if (precompute) {
		precompute = 0;

		/* set up alt next pointer, if any */
		if (temp->last_frame) {
			td_alt_next = NULL;
		} else {
			/* we use this field internally */
			td_alt_next = td_next;
		}

		/* restore */
		temp->shortpkt = shortpkt_old;
		temp->len = len_old;
		goto restart;
	}

	/*
	 * Remove cycle bit from the first TRB if we are
	 * stepping them:
	 */
	if (temp->step_td != 0) {
		td_first->td_trb[0].dwTrb3 &= ~htole32(XHCI_TRB_3_CYCLE_BIT);
		usb_pc_cpu_flush(td_first->page_cache);
	}

	/* clear TD SIZE to zero, hence this is the last TRB */
	/* remove chain bit because this is the last data TRB in the chain */
-	td->td_trb[td->ntrb - 1].dwTrb2 &= ~htole32(XHCI_TRB_2_TDSZ_SET(15));
+	td->td_trb[td->ntrb - 1].dwTrb2 &= ~htole32(XHCI_TRB_2_TDSZ_SET(31));
	td->td_trb[td->ntrb - 1].dwTrb3 &= ~htole32(XHCI_TRB_3_CHAIN_BIT);
	/* remove CHAIN-BIT from last LINK TRB */
	td->td_trb[td->ntrb].dwTrb3 &= ~htole32(XHCI_TRB_3_CHAIN_BIT);

	usb_pc_cpu_flush(td->page_cache);

	temp->td = td;
	temp->td_next = td_next;
}

static void
xhci_setup_generic_chain(struct usb_xfer *xfer)
{
	struct xhci_std_temp temp;
	struct xhci_td *td;
	uint32_t x;
	uint32_t y;
	uint8_t mult;

	temp.do_isoc_sync = 0;
	temp.step_td = 0;
	temp.tbc = 0;
	temp.tlbpc = 0;
	temp.average = xfer->max_hc_frame_size;
	temp.max_packet_size = xfer->max_packet_size;
	temp.sc = XHCI_BUS2SC(xfer->xroot->bus);
	temp.pc = NULL;
	temp.last_frame = 0;
	temp.offset = 0;
	temp.multishort = xfer->flags_int.isochronous_xfr ||
	    xfer->flags_int.control_xfr ||
	    xfer->flags_int.short_frames_ok;

	/* toggle the DMA set we are using */
	xfer->flags_int.curr_dma_set ^= 1;

	/* get next DMA set */
	td = xfer->td_start[xfer->flags_int.curr_dma_set];

	temp.td = NULL;
	temp.td_next = td;

	xfer->td_transfer_first = td;
	xfer->td_transfer_cache = td;

	if (xfer->flags_int.isochronous_xfr) {
		uint8_t shift;

		/* compute multiplier for ISOCHRONOUS transfers */
		mult = xfer->endpoint->ecomp ?
		    UE_GET_SS_ISO_MULT(xfer->endpoint->ecomp->bmAttributes) : 0;
		/* check for USB 2.0 multiplier */
		if (mult == 0) {
			mult = (xfer->endpoint->edesc->
			    wMaxPacketSize[1] >> 3) & 3;
		}
		/* range check */
		if (mult > 2)
			mult = 3;
		else
			mult++;

		x = XREAD4(temp.sc, runt, XHCI_MFINDEX);

		DPRINTF("MFINDEX=0x%08x\n", x);

		switch (usbd_get_speed(xfer->xroot->udev)) {
		case USB_SPEED_FULL:
			shift = 3;
			temp.isoc_delta = 8;	/* 1ms */
			x += temp.isoc_delta - 1;
			x &= ~(temp.isoc_delta - 1);
			break;
		default:
			shift = usbd_xfer_get_fps_shift(xfer);
			temp.isoc_delta = 1U << shift;
			x += temp.isoc_delta - 1;
			x &= ~(temp.isoc_delta - 1);
			/* simple frame load balancing */
			x += xfer->endpoint->usb_uframe;
			break;
		}

		y = XHCI_MFINDEX_GET(x - xfer->endpoint->isoc_next);

		if ((xfer->endpoint->is_synced == 0) ||
		    (y < (xfer->nframes << shift)) ||
		    (XHCI_MFINDEX_GET(-y) >= (128 * 8))) {
			/*
			 * If there is data underflow or the pipe
			 * queue is empty we schedule the transfer a
			 * few frames ahead of the current frame
			 * position. Else two isochronous transfers
			 * might overlap.
*/ xfer->endpoint->isoc_next = XHCI_MFINDEX_GET(x + (3 * 8)); xfer->endpoint->is_synced = 1; temp.do_isoc_sync = 1; DPRINTFN(3, "start next=%d\n", xfer->endpoint->isoc_next); } /* compute isochronous completion time */ y = XHCI_MFINDEX_GET(xfer->endpoint->isoc_next - (x & ~7)); xfer->isoc_time_complete = usb_isoc_time_expand(&temp.sc->sc_bus, x / 8) + (y / 8) + (((xfer->nframes << shift) + 7) / 8); x = 0; temp.isoc_frame = xfer->endpoint->isoc_next; temp.trb_type = XHCI_TRB_TYPE_ISOCH; xfer->endpoint->isoc_next += xfer->nframes << shift; } else if (xfer->flags_int.control_xfr) { /* check if we should prepend a setup message */ if (xfer->flags_int.control_hdr) { temp.len = xfer->frlengths[0]; temp.pc = xfer->frbuffers + 0; temp.shortpkt = temp.len ? 1 : 0; temp.trb_type = XHCI_TRB_TYPE_SETUP_STAGE; temp.direction = 0; /* check for last frame */ if (xfer->nframes == 1) { /* no STATUS stage yet, SETUP is last */ if (xfer->flags_int.control_act) temp.last_frame = 1; } xhci_setup_generic_chain_sub(&temp); } x = 1; mult = 1; temp.isoc_delta = 0; temp.isoc_frame = 0; temp.trb_type = xfer->flags_int.control_did_data ? XHCI_TRB_TYPE_NORMAL : XHCI_TRB_TYPE_DATA_STAGE; } else { x = 0; mult = 1; temp.isoc_delta = 0; temp.isoc_frame = 0; temp.trb_type = XHCI_TRB_TYPE_NORMAL; } if (x != xfer->nframes) { /* set up page_cache pointer */ temp.pc = xfer->frbuffers + x; /* set endpoint direction */ temp.direction = UE_GET_DIR(xfer->endpointno); } while (x != xfer->nframes) { /* DATA0 / DATA1 message */ temp.len = xfer->frlengths[x]; temp.step_td = ((xfer->endpointno & UE_DIR_IN) && x != 0 && temp.multishort == 0); x++; if (x == xfer->nframes) { if (xfer->flags_int.control_xfr) { /* no STATUS stage yet, DATA is last */ if (xfer->flags_int.control_act) temp.last_frame = 1; } else { temp.last_frame = 1; } } if (temp.len == 0) { /* make sure that we send an USB packet */ temp.shortpkt = 0; temp.tbc = 0; temp.tlbpc = mult - 1; } else if (xfer->flags_int.isochronous_xfr) { uint8_t tdpc; /* * Isochronous transfers don't have short * packet termination: */ temp.shortpkt = 1; /* isochronous transfers have a transfer limit */ if (temp.len > xfer->max_frame_size) temp.len = xfer->max_frame_size; /* compute TD packet count */ tdpc = howmany(temp.len, xfer->max_packet_size); temp.tbc = howmany(tdpc, mult) - 1; temp.tlbpc = (tdpc % mult); if (temp.tlbpc == 0) temp.tlbpc = mult - 1; else temp.tlbpc--; } else { /* regular data transfer */ temp.shortpkt = xfer->flags.force_short_xfer ? 0 : 1; } xhci_setup_generic_chain_sub(&temp); if (xfer->flags_int.isochronous_xfr) { temp.offset += xfer->frlengths[x - 1]; temp.isoc_frame += temp.isoc_delta; } else { /* get next Page Cache pointer */ temp.pc = xfer->frbuffers + x; } } /* check if we should append a status stage */ if (xfer->flags_int.control_xfr && !xfer->flags_int.control_act) { /* * Send a DATA1 message and invert the current * endpoint direction. */ if (xhcictlstep || temp.sc->sc_ctlstep) { /* * Some XHCI controllers will not delay the * status stage until the next SOF. Force this * behaviour to avoid failed control * transfers. */ temp.step_td = (xfer->nframes != 0); } else { temp.step_td = 0; } temp.direction = UE_GET_DIR(xfer->endpointno) ^ UE_DIR_IN; temp.len = 0; temp.pc = NULL; temp.shortpkt = 0; temp.last_frame = 1; temp.trb_type = XHCI_TRB_TYPE_STATUS_STAGE; xhci_setup_generic_chain_sub(&temp); } td = temp.td; /* must have at least one frame! 
*/ xfer->td_transfer_last = td; DPRINTF("first=%p last=%p\n", xfer->td_transfer_first, td); } static void xhci_set_slot_pointer(struct xhci_softc *sc, uint8_t index, uint64_t dev_addr) { struct usb_page_search buf_res; struct xhci_dev_ctx_addr *pdctxa; usbd_get_page(&sc->sc_hw.ctx_pc, 0, &buf_res); pdctxa = buf_res.buffer; DPRINTF("addr[%u]=0x%016llx\n", index, (long long)dev_addr); pdctxa->qwBaaDevCtxAddr[index] = htole64(dev_addr); usb_pc_cpu_flush(&sc->sc_hw.ctx_pc); } static usb_error_t xhci_configure_mask(struct usb_device *udev, uint32_t mask, uint8_t drop) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct usb_page_search buf_inp; struct xhci_input_dev_ctx *pinp; uint32_t temp; uint8_t index; uint8_t x; index = udev->controller_slot_id; usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp); pinp = buf_inp.buffer; if (drop) { mask &= XHCI_INCTX_NON_CTRL_MASK; xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx0, mask); xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx1, 0); } else { /* * Some hardware requires that we drop the endpoint * context before adding it again: */ xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx0, mask & XHCI_INCTX_NON_CTRL_MASK); /* Add new endpoint context */ xhci_ctx_set_le32(sc, &pinp->ctx_input.dwInCtx1, mask); /* find most significant set bit */ for (x = 31; x != 1; x--) { if (mask & (1 << x)) break; } /* adjust */ x--; /* figure out the maximum number of contexts */ if (x > sc->sc_hw.devs[index].context_num) sc->sc_hw.devs[index].context_num = x; else x = sc->sc_hw.devs[index].context_num; /* update number of contexts */ temp = xhci_ctx_get_le32(sc, &pinp->ctx_slot.dwSctx0); temp &= ~XHCI_SCTX_0_CTX_NUM_SET(31); temp |= XHCI_SCTX_0_CTX_NUM_SET(x + 1); xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx0, temp); } usb_pc_cpu_flush(&sc->sc_hw.devs[index].input_pc); return (0); } static usb_error_t xhci_configure_endpoint(struct usb_device *udev, struct usb_endpoint_descriptor *edesc, struct xhci_endpoint_ext *pepext, uint16_t interval, uint8_t max_packet_count, uint8_t mult, uint8_t fps_shift, uint16_t max_packet_size, uint16_t max_frame_size, uint8_t ep_mode) { struct usb_page_search buf_inp; struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct xhci_input_dev_ctx *pinp; uint64_t ring_addr = pepext->physaddr; uint32_t temp; uint8_t index; uint8_t epno; uint8_t type; index = udev->controller_slot_id; usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp); pinp = buf_inp.buffer; epno = edesc->bEndpointAddress; type = edesc->bmAttributes & UE_XFERTYPE; if (type == UE_CONTROL) epno |= UE_DIR_IN; epno = XHCI_EPNO2EPID(epno); if (epno == 0) return (USB_ERR_NO_PIPE); /* invalid */ if (max_packet_count == 0) return (USB_ERR_BAD_BUFSIZE); max_packet_count--; if (mult == 0) return (USB_ERR_BAD_BUFSIZE); /* store endpoint mode */ pepext->trb_ep_mode = ep_mode; /* store bMaxPacketSize for control endpoints */ pepext->trb_ep_maxp = edesc->wMaxPacketSize[0]; usb_pc_cpu_flush(pepext->page_cache); if (ep_mode == USB_EP_MODE_STREAMS) { temp = XHCI_EPCTX_0_EPSTATE_SET(0) | XHCI_EPCTX_0_MAXP_STREAMS_SET(XHCI_MAX_STREAMS_LOG - 1) | XHCI_EPCTX_0_LSA_SET(1); ring_addr += sizeof(struct xhci_trb) * XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS; } else { temp = XHCI_EPCTX_0_EPSTATE_SET(0) | XHCI_EPCTX_0_MAXP_STREAMS_SET(0) | XHCI_EPCTX_0_LSA_SET(0); ring_addr |= XHCI_EPCTX_2_DCS_SET(1); } switch (udev->speed) { case USB_SPEED_FULL: case USB_SPEED_LOW: /* 1ms -> 125us */ fps_shift += 3; break; default: break; } switch (type) { case UE_INTERRUPT: if (fps_shift > 3) fps_shift--; temp |= 
XHCI_EPCTX_0_IVAL_SET(fps_shift); break; case UE_ISOCHRONOUS: temp |= XHCI_EPCTX_0_IVAL_SET(fps_shift); switch (udev->speed) { case USB_SPEED_SUPER: if (mult > 3) mult = 3; temp |= XHCI_EPCTX_0_MULT_SET(mult - 1); max_packet_count /= mult; break; default: break; } break; default: break; } xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx0, temp); temp = XHCI_EPCTX_1_HID_SET(0) | XHCI_EPCTX_1_MAXB_SET(max_packet_count) | XHCI_EPCTX_1_MAXP_SIZE_SET(max_packet_size); /* * Always enable the "three strikes and you are gone" feature * except for ISOCHRONOUS endpoints. This is suggested by * section 4.3.3 in the XHCI specification about device slot * initialisation. */ if (type != UE_ISOCHRONOUS) temp |= XHCI_EPCTX_1_CERR_SET(3); switch (type) { case UE_CONTROL: temp |= XHCI_EPCTX_1_EPTYPE_SET(4); break; case UE_ISOCHRONOUS: temp |= XHCI_EPCTX_1_EPTYPE_SET(1); break; case UE_BULK: temp |= XHCI_EPCTX_1_EPTYPE_SET(2); break; default: temp |= XHCI_EPCTX_1_EPTYPE_SET(3); break; } /* check for IN direction */ if (epno & 1) temp |= XHCI_EPCTX_1_EPTYPE_SET(4); xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx1, temp); xhci_ctx_set_le64(sc, &pinp->ctx_ep[epno - 1].qwEpCtx2, ring_addr); switch (edesc->bmAttributes & UE_XFERTYPE) { case UE_INTERRUPT: case UE_ISOCHRONOUS: temp = XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_SET(max_frame_size) | XHCI_EPCTX_4_AVG_TRB_LEN_SET(MIN(XHCI_PAGE_SIZE, max_frame_size)); break; case UE_CONTROL: temp = XHCI_EPCTX_4_AVG_TRB_LEN_SET(8); break; default: temp = XHCI_EPCTX_4_AVG_TRB_LEN_SET(XHCI_PAGE_SIZE); break; } xhci_ctx_set_le32(sc, &pinp->ctx_ep[epno - 1].dwEpCtx4, temp); #ifdef USB_DEBUG xhci_dump_endpoint(sc, &pinp->ctx_ep[epno - 1]); #endif usb_pc_cpu_flush(&sc->sc_hw.devs[index].input_pc); return (0); /* success */ } static usb_error_t xhci_configure_endpoint_by_xfer(struct usb_xfer *xfer) { struct xhci_endpoint_ext *pepext; struct usb_endpoint_ss_comp_descriptor *ecomp; usb_stream_t x; pepext = xhci_get_endpoint_ext(xfer->xroot->udev, xfer->endpoint->edesc); ecomp = xfer->endpoint->ecomp; for (x = 0; x != XHCI_MAX_STREAMS; x++) { uint64_t temp; /* halt any transfers */ pepext->trb[x * XHCI_MAX_TRANSFERS].dwTrb3 = 0; /* compute start of TRB ring for stream "x" */ temp = pepext->physaddr + (x * XHCI_MAX_TRANSFERS * sizeof(struct xhci_trb)) + XHCI_SCTX_0_SCT_SEC_TR_RING; /* make tree structure */ pepext->trb[(XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS) + x].qwTrb0 = htole64(temp); /* reserved fields */ pepext->trb[(XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS) + x].dwTrb2 = 0; pepext->trb[(XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS) + x].dwTrb3 = 0; } usb_pc_cpu_flush(pepext->page_cache); return (xhci_configure_endpoint(xfer->xroot->udev, xfer->endpoint->edesc, pepext, xfer->interval, xfer->max_packet_count, (ecomp != NULL) ? 
UE_GET_SS_ISO_MULT(ecomp->bmAttributes) + 1 : 1, usbd_xfer_get_fps_shift(xfer), xfer->max_packet_size, xfer->max_frame_size, xfer->endpoint->ep_mode)); } static usb_error_t xhci_configure_device(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct usb_page_search buf_inp; struct usb_page_cache *pcinp; struct xhci_input_dev_ctx *pinp; struct usb_device *hubdev; uint32_t temp; uint32_t route; uint32_t rh_port; uint8_t is_hub; uint8_t index; uint8_t depth; index = udev->controller_slot_id; DPRINTF("index=%u\n", index); pcinp = &sc->sc_hw.devs[index].input_pc; usbd_get_page(pcinp, 0, &buf_inp); pinp = buf_inp.buffer; rh_port = 0; route = 0; /* figure out route string and root HUB port number */ for (hubdev = udev; hubdev != NULL; hubdev = hubdev->parent_hub) { if (hubdev->parent_hub == NULL) break; depth = hubdev->parent_hub->depth; /* * NOTE: HS/FS/LS devices and the SS root HUB can have * more than 15 ports */ rh_port = hubdev->port_no; if (depth == 0) break; if (rh_port > 15) rh_port = 15; if (depth < 6) route |= rh_port << (4 * (depth - 1)); } DPRINTF("Route=0x%08x\n", route); temp = XHCI_SCTX_0_ROUTE_SET(route) | XHCI_SCTX_0_CTX_NUM_SET( sc->sc_hw.devs[index].context_num + 1); switch (udev->speed) { case USB_SPEED_LOW: temp |= XHCI_SCTX_0_SPEED_SET(2); if (udev->parent_hs_hub != NULL && udev->parent_hs_hub->ddesc.bDeviceProtocol == UDPROTO_HSHUBMTT) { DPRINTF("Device inherits MTT\n"); temp |= XHCI_SCTX_0_MTT_SET(1); } break; case USB_SPEED_HIGH: temp |= XHCI_SCTX_0_SPEED_SET(3); if (sc->sc_hw.devs[index].nports != 0 && udev->ddesc.bDeviceProtocol == UDPROTO_HSHUBMTT) { DPRINTF("HUB supports MTT\n"); temp |= XHCI_SCTX_0_MTT_SET(1); } break; case USB_SPEED_FULL: temp |= XHCI_SCTX_0_SPEED_SET(1); if (udev->parent_hs_hub != NULL && udev->parent_hs_hub->ddesc.bDeviceProtocol == UDPROTO_HSHUBMTT) { DPRINTF("Device inherits MTT\n"); temp |= XHCI_SCTX_0_MTT_SET(1); } break; default: temp |= XHCI_SCTX_0_SPEED_SET(4); break; } is_hub = sc->sc_hw.devs[index].nports != 0 && (udev->speed == USB_SPEED_SUPER || udev->speed == USB_SPEED_HIGH); if (is_hub) temp |= XHCI_SCTX_0_HUB_SET(1); xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx0, temp); temp = XHCI_SCTX_1_RH_PORT_SET(rh_port); if (is_hub) { temp |= XHCI_SCTX_1_NUM_PORTS_SET( sc->sc_hw.devs[index].nports); } switch (udev->speed) { case USB_SPEED_SUPER: switch (sc->sc_hw.devs[index].state) { case XHCI_ST_ADDRESSED: case XHCI_ST_CONFIGURED: /* enable power save */ temp |= XHCI_SCTX_1_MAX_EL_SET(sc->sc_exit_lat_max); break; default: /* disable power save */ break; } break; default: break; } xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx1, temp); temp = XHCI_SCTX_2_IRQ_TARGET_SET(0); if (is_hub) { temp |= XHCI_SCTX_2_TT_THINK_TIME_SET( sc->sc_hw.devs[index].tt); } hubdev = udev->parent_hs_hub; /* check if we should activate the transaction translator */ switch (udev->speed) { case USB_SPEED_FULL: case USB_SPEED_LOW: if (hubdev != NULL) { temp |= XHCI_SCTX_2_TT_HUB_SID_SET( hubdev->controller_slot_id); temp |= XHCI_SCTX_2_TT_PORT_NUM_SET( udev->hs_port_no); } break; default: break; } xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx2, temp); /* * These fields should be initialized to zero, according to * XHCI section 6.2.2 - slot context: */ temp = XHCI_SCTX_3_DEV_ADDR_SET(0) | XHCI_SCTX_3_SLOT_STATE_SET(0); xhci_ctx_set_le32(sc, &pinp->ctx_slot.dwSctx3, temp); #ifdef USB_DEBUG xhci_dump_device(sc, &pinp->ctx_slot); #endif usb_pc_cpu_flush(pcinp); return (0); /* success */ } static usb_error_t xhci_alloc_device_ext(struct usb_device 
*udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct usb_page_search buf_dev; struct usb_page_search buf_ep; struct xhci_trb *trb; struct usb_page_cache *pc; struct usb_page *pg; uint64_t addr; uint8_t index; uint8_t i; index = udev->controller_slot_id; pc = &sc->sc_hw.devs[index].device_pc; pg = &sc->sc_hw.devs[index].device_pg; /* need to initialize the page cache */ pc->tag_parent = sc->sc_bus.dma_parent_tag; if (usb_pc_alloc_mem(pc, pg, sc->sc_ctx_is_64_byte ? (2 * sizeof(struct xhci_dev_ctx)) : sizeof(struct xhci_dev_ctx), XHCI_PAGE_SIZE)) goto error; usbd_get_page(pc, 0, &buf_dev); pc = &sc->sc_hw.devs[index].input_pc; pg = &sc->sc_hw.devs[index].input_pg; /* need to initialize the page cache */ pc->tag_parent = sc->sc_bus.dma_parent_tag; if (usb_pc_alloc_mem(pc, pg, sc->sc_ctx_is_64_byte ? (2 * sizeof(struct xhci_input_dev_ctx)) : sizeof(struct xhci_input_dev_ctx), XHCI_PAGE_SIZE)) { goto error; } /* initialize all endpoint LINK TRBs */ for (i = 0; i != XHCI_MAX_ENDPOINTS; i++) { pc = &sc->sc_hw.devs[index].endpoint_pc[i]; pg = &sc->sc_hw.devs[index].endpoint_pg[i]; /* need to initialize the page cache */ pc->tag_parent = sc->sc_bus.dma_parent_tag; if (usb_pc_alloc_mem(pc, pg, sizeof(struct xhci_dev_endpoint_trbs), XHCI_TRB_ALIGN)) { goto error; } /* lookup endpoint TRB ring */ usbd_get_page(pc, 0, &buf_ep); /* get TRB pointer */ trb = buf_ep.buffer; trb += XHCI_MAX_TRANSFERS - 1; /* get TRB start address */ addr = buf_ep.physaddr; /* create LINK TRB */ trb->qwTrb0 = htole64(addr); trb->dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0)); trb->dwTrb3 = htole32(XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK)); usb_pc_cpu_flush(pc); } xhci_set_slot_pointer(sc, index, buf_dev.physaddr); return (0); error: xhci_free_device_ext(udev); return (USB_ERR_NOMEM); } static void xhci_free_device_ext(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); uint8_t index; uint8_t i; index = udev->controller_slot_id; xhci_set_slot_pointer(sc, index, 0); usb_pc_free_mem(&sc->sc_hw.devs[index].device_pc); usb_pc_free_mem(&sc->sc_hw.devs[index].input_pc); for (i = 0; i != XHCI_MAX_ENDPOINTS; i++) usb_pc_free_mem(&sc->sc_hw.devs[index].endpoint_pc[i]); } static struct xhci_endpoint_ext * xhci_get_endpoint_ext(struct usb_device *udev, struct usb_endpoint_descriptor *edesc) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct xhci_endpoint_ext *pepext; struct usb_page_cache *pc; struct usb_page_search buf_ep; uint8_t epno; uint8_t index; epno = edesc->bEndpointAddress; if ((edesc->bmAttributes & UE_XFERTYPE) == UE_CONTROL) epno |= UE_DIR_IN; epno = XHCI_EPNO2EPID(epno); index = udev->controller_slot_id; pc = &sc->sc_hw.devs[index].endpoint_pc[epno]; usbd_get_page(pc, 0, &buf_ep); pepext = &sc->sc_hw.devs[index].endp[epno]; pepext->page_cache = pc; pepext->trb = buf_ep.buffer; pepext->physaddr = buf_ep.physaddr; return (pepext); } static void xhci_endpoint_doorbell(struct usb_xfer *xfer) { struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus); uint8_t epno; uint8_t index; epno = xfer->endpointno; if (xfer->flags_int.control_xfr) epno |= UE_DIR_IN; epno = XHCI_EPNO2EPID(epno); index = xfer->xroot->udev->controller_slot_id; if (xfer->xroot->udev->flags.self_suspended == 0) { XWRITE4(sc, door, XHCI_DOORBELL(index), epno | XHCI_DB_SID_SET(xfer->stream_id)); } } static void xhci_transfer_remove(struct usb_xfer *xfer, usb_error_t error) { struct xhci_endpoint_ext *pepext; if (xfer->flags_int.bandwidth_reclaimed) { xfer->flags_int.bandwidth_reclaimed = 0; pepext = 
xhci_get_endpoint_ext(xfer->xroot->udev, xfer->endpoint->edesc); pepext->trb_used[xfer->stream_id]--; pepext->xfer[xfer->qh_pos] = NULL; if (error && pepext->trb_running != 0) { pepext->trb_halted = 1; pepext->trb_running = 0; } } } static usb_error_t xhci_transfer_insert(struct usb_xfer *xfer) { struct xhci_td *td_first; struct xhci_td *td_last; struct xhci_trb *trb_link; struct xhci_endpoint_ext *pepext; uint64_t addr; usb_stream_t id; uint8_t i; uint8_t inext; uint8_t trb_limit; DPRINTFN(8, "\n"); id = xfer->stream_id; /* check if already inserted */ if (xfer->flags_int.bandwidth_reclaimed) { DPRINTFN(8, "Already in schedule\n"); return (0); } pepext = xhci_get_endpoint_ext(xfer->xroot->udev, xfer->endpoint->edesc); td_first = xfer->td_transfer_first; td_last = xfer->td_transfer_last; addr = pepext->physaddr; switch (xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE) { case UE_CONTROL: case UE_INTERRUPT: /* single buffered */ trb_limit = 1; break; default: /* multi buffered */ trb_limit = (XHCI_MAX_TRANSFERS - 2); break; } if (pepext->trb_used[id] >= trb_limit) { DPRINTFN(8, "Too many TDs queued.\n"); return (USB_ERR_NOMEM); } /* check if bMaxPacketSize changed */ if (xfer->flags_int.control_xfr != 0 && pepext->trb_ep_maxp != xfer->endpoint->edesc->wMaxPacketSize[0]) { DPRINTFN(8, "Reconfigure control endpoint\n"); /* force driver to reconfigure endpoint */ pepext->trb_halted = 1; pepext->trb_running = 0; } /* check for stopped condition, after putting transfer on interrupt queue */ if (pepext->trb_running == 0) { struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus); DPRINTFN(8, "Not running\n"); /* start configuration */ (void)usb_proc_msignal(USB_BUS_CONTROL_XFER_PROC(&sc->sc_bus), &sc->sc_config_msg[0], &sc->sc_config_msg[1]); return (0); } pepext->trb_used[id]++; /* get current TRB index */ i = pepext->trb_index[id]; /* get next TRB index */ inext = (i + 1); /* the last entry of the ring is a hardcoded link TRB */ if (inext >= (XHCI_MAX_TRANSFERS - 1)) inext = 0; /* store next TRB index, before stream ID offset is added */ pepext->trb_index[id] = inext; /* offset for stream */ i += id * XHCI_MAX_TRANSFERS; inext += id * XHCI_MAX_TRANSFERS; /* compute terminating return address */ addr += (inext * sizeof(struct xhci_trb)); /* compute link TRB pointer */ trb_link = td_last->td_trb + td_last->ntrb; /* update next pointer of last link TRB */ trb_link->qwTrb0 = htole64(addr); trb_link->dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0)); trb_link->dwTrb3 = htole32(XHCI_TRB_3_IOC_BIT | XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK)); #ifdef USB_DEBUG xhci_dump_trb(&td_last->td_trb[td_last->ntrb]); #endif usb_pc_cpu_flush(td_last->page_cache); /* write ahead chain end marker */ pepext->trb[inext].qwTrb0 = 0; pepext->trb[inext].dwTrb2 = 0; pepext->trb[inext].dwTrb3 = 0; /* update next pointer of link TRB */ pepext->trb[i].qwTrb0 = htole64((uint64_t)td_first->td_self); pepext->trb[i].dwTrb2 = htole32(XHCI_TRB_2_IRQ_SET(0)); #ifdef USB_DEBUG xhci_dump_trb(&pepext->trb[i]); #endif usb_pc_cpu_flush(pepext->page_cache); /* toggle cycle bit which activates the transfer chain */ pepext->trb[i].dwTrb3 = htole32(XHCI_TRB_3_CYCLE_BIT | XHCI_TRB_3_TYPE_SET(XHCI_TRB_TYPE_LINK)); usb_pc_cpu_flush(pepext->page_cache); DPRINTF("qh_pos = %u\n", i); pepext->xfer[i] = xfer; xfer->qh_pos = i; xfer->flags_int.bandwidth_reclaimed = 1; xhci_endpoint_doorbell(xfer); return (0); } static void xhci_root_intr(struct xhci_softc *sc) { uint16_t i; USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED); /* clear any old 
interrupt data */ memset(sc->sc_hub_idata, 0, sizeof(sc->sc_hub_idata)); for (i = 1; i <= sc->sc_noport; i++) { /* pick out CHANGE bits from the status register */ if (XREAD4(sc, oper, XHCI_PORTSC(i)) & ( XHCI_PS_CSC | XHCI_PS_PEC | XHCI_PS_OCC | XHCI_PS_WRC | XHCI_PS_PRC | XHCI_PS_PLC | XHCI_PS_CEC)) { sc->sc_hub_idata[i / 8] |= 1 << (i % 8); DPRINTF("port %d changed\n", i); } } uhub_root_intr(&sc->sc_bus, sc->sc_hub_idata, sizeof(sc->sc_hub_idata)); } /*------------------------------------------------------------------------* * xhci_device_done - XHCI done handler * * NOTE: This function can be called two times in a row on * the same USB transfer. From close and from interrupt. *------------------------------------------------------------------------*/ static void xhci_device_done(struct usb_xfer *xfer, usb_error_t error) { DPRINTFN(2, "xfer=%p, endpoint=%p, error=%d\n", xfer, xfer->endpoint, error); /* remove transfer from HW queue */ xhci_transfer_remove(xfer, error); /* dequeue transfer and start next transfer */ usbd_transfer_done(xfer, error); } /*------------------------------------------------------------------------* * XHCI data transfer support (generic type) *------------------------------------------------------------------------*/ static void xhci_device_generic_open(struct usb_xfer *xfer) { if (xfer->flags_int.isochronous_xfr) { switch (xfer->xroot->udev->speed) { case USB_SPEED_FULL: break; default: usb_hs_bandwidth_alloc(xfer); break; } } } static void xhci_device_generic_close(struct usb_xfer *xfer) { DPRINTF("\n"); xhci_device_done(xfer, USB_ERR_CANCELLED); if (xfer->flags_int.isochronous_xfr) { switch (xfer->xroot->udev->speed) { case USB_SPEED_FULL: break; default: usb_hs_bandwidth_free(xfer); break; } } } static void xhci_device_generic_multi_enter(struct usb_endpoint *ep, usb_stream_t stream_id, struct usb_xfer *enter_xfer) { struct usb_xfer *xfer; /* check if there is a current transfer */ xfer = ep->endpoint_q[stream_id].curr; if (xfer == NULL) return; /* * Check if the current transfer is started and then pickup * the next one, if any. Else wait for next start event due to * block on failure feature. */ if (!xfer->flags_int.bandwidth_reclaimed) return; xfer = TAILQ_FIRST(&ep->endpoint_q[stream_id].head); if (xfer == NULL) { /* * In case of enter we have to consider that the * transfer is queued by the USB core after the enter * method is called. 
*/ xfer = enter_xfer; if (xfer == NULL) return; } /* try to multi buffer */ xhci_transfer_insert(xfer); } static void xhci_device_generic_enter(struct usb_xfer *xfer) { DPRINTF("\n"); /* set up TD's and QH */ xhci_setup_generic_chain(xfer); xhci_device_generic_multi_enter(xfer->endpoint, xfer->stream_id, xfer); } static void xhci_device_generic_start(struct usb_xfer *xfer) { DPRINTF("\n"); /* try to insert xfer on HW queue */ xhci_transfer_insert(xfer); /* try to multi buffer */ xhci_device_generic_multi_enter(xfer->endpoint, xfer->stream_id, NULL); /* add transfer last on interrupt queue */ usbd_transfer_enqueue(&xfer->xroot->bus->intr_q, xfer); /* start timeout, if any */ if (xfer->timeout != 0) usbd_transfer_timeout_ms(xfer, &xhci_timeout, xfer->timeout); } static const struct usb_pipe_methods xhci_device_generic_methods = { .open = xhci_device_generic_open, .close = xhci_device_generic_close, .enter = xhci_device_generic_enter, .start = xhci_device_generic_start, }; /*------------------------------------------------------------------------* * xhci root HUB support *------------------------------------------------------------------------* * Simulate a hardware HUB by handling all the necessary requests. *------------------------------------------------------------------------*/ #define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } static const struct usb_device_descriptor xhci_devd = { .bLength = sizeof(xhci_devd), .bDescriptorType = UDESC_DEVICE, /* type */ HSETW(.bcdUSB, 0x0300), /* USB version */ .bDeviceClass = UDCLASS_HUB, /* class */ .bDeviceSubClass = UDSUBCLASS_HUB, /* subclass */ .bDeviceProtocol = UDPROTO_SSHUB, /* protocol */ .bMaxPacketSize = 9, /* max packet size */ HSETW(.idVendor, 0x0000), /* vendor */ HSETW(.idProduct, 0x0000), /* product */ HSETW(.bcdDevice, 0x0100), /* device version */ .iManufacturer = 1, .iProduct = 2, .iSerialNumber = 0, .bNumConfigurations = 1, /* # of configurations */ }; static const struct xhci_bos_desc xhci_bosd = { .bosd = { .bLength = sizeof(xhci_bosd.bosd), .bDescriptorType = UDESC_BOS, HSETW(.wTotalLength, sizeof(xhci_bosd)), .bNumDeviceCaps = 3, }, .usb2extd = { .bLength = sizeof(xhci_bosd.usb2extd), .bDescriptorType = 1, .bDevCapabilityType = 2, .bmAttributes[0] = 2, }, .usbdcd = { .bLength = sizeof(xhci_bosd.usbdcd), .bDescriptorType = UDESC_DEVICE_CAPABILITY, .bDevCapabilityType = 3, .bmAttributes = 0, /* XXX */ HSETW(.wSpeedsSupported, 0x000C), .bFunctionalitySupport = 8, .bU1DevExitLat = 255, /* dummy - not used */ .wU2DevExitLat = { 0x00, 0x08 }, }, .cidd = { .bLength = sizeof(xhci_bosd.cidd), .bDescriptorType = 1, .bDevCapabilityType = 4, .bReserved = 0, .bContainerID = 0, /* XXX */ }, }; static const struct xhci_config_desc xhci_confd = { .confd = { .bLength = sizeof(xhci_confd.confd), .bDescriptorType = UDESC_CONFIG, .wTotalLength[0] = sizeof(xhci_confd), .bNumInterface = 1, .bConfigurationValue = 1, .iConfiguration = 0, .bmAttributes = UC_SELF_POWERED, .bMaxPower = 0 /* max power */ }, .ifcd = { .bLength = sizeof(xhci_confd.ifcd), .bDescriptorType = UDESC_INTERFACE, .bNumEndpoints = 1, .bInterfaceClass = UICLASS_HUB, .bInterfaceSubClass = UISUBCLASS_HUB, .bInterfaceProtocol = 0, }, .endpd = { .bLength = sizeof(xhci_confd.endpd), .bDescriptorType = UDESC_ENDPOINT, .bEndpointAddress = UE_DIR_IN | XHCI_INTR_ENDPT, .bmAttributes = UE_INTERRUPT, .wMaxPacketSize[0] = 2, /* max 15 ports */ .bInterval = 255, }, .endpcd = { .bLength = sizeof(xhci_confd.endpcd), .bDescriptorType = UDESC_ENDPOINT_SS_COMP, .bMaxBurst = 0, 
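/*
 * bMaxBurst = 0 advertises a single packet per burst for the
 * simulated root HUB interrupt endpoint.
 */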
.bmAttributes = 0, }, }; static const struct usb_hub_ss_descriptor xhci_hubd = { .bLength = sizeof(xhci_hubd), .bDescriptorType = UDESC_SS_HUB, }; static usb_error_t xhci_roothub_exec(struct usb_device *udev, struct usb_device_request *req, const void **pptr, uint16_t *plength) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); const char *str_ptr; const void *ptr; uint32_t port; uint32_t v; uint16_t len; uint16_t i; uint16_t value; uint16_t index; uint8_t j; usb_error_t err; USB_BUS_LOCK_ASSERT(&sc->sc_bus, MA_OWNED); /* buffer reset */ ptr = (const void *)&sc->sc_hub_desc; len = 0; err = 0; value = UGETW(req->wValue); index = UGETW(req->wIndex); DPRINTFN(3, "type=0x%02x request=0x%02x wLen=0x%04x " "wValue=0x%04x wIndex=0x%04x\n", req->bmRequestType, req->bRequest, UGETW(req->wLength), value, index); #define C(x,y) ((x) | ((y) << 8)) switch (C(req->bRequest, req->bmRequestType)) { case C(UR_CLEAR_FEATURE, UT_WRITE_DEVICE): case C(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE): case C(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT): /* * DEVICE_REMOTE_WAKEUP and ENDPOINT_HALT are no-ops * for the integrated root hub. */ break; case C(UR_GET_CONFIG, UT_READ_DEVICE): len = 1; sc->sc_hub_desc.temp[0] = sc->sc_conf; break; case C(UR_GET_DESCRIPTOR, UT_READ_DEVICE): switch (value >> 8) { case UDESC_DEVICE: if ((value & 0xff) != 0) { err = USB_ERR_IOERROR; goto done; } len = sizeof(xhci_devd); ptr = (const void *)&xhci_devd; break; case UDESC_BOS: if ((value & 0xff) != 0) { err = USB_ERR_IOERROR; goto done; } len = sizeof(xhci_bosd); ptr = (const void *)&xhci_bosd; break; case UDESC_CONFIG: if ((value & 0xff) != 0) { err = USB_ERR_IOERROR; goto done; } len = sizeof(xhci_confd); ptr = (const void *)&xhci_confd; break; case UDESC_STRING: switch (value & 0xff) { case 0: /* Language table */ str_ptr = "\001"; break; case 1: /* Vendor */ str_ptr = sc->sc_vendor; break; case 2: /* Product */ str_ptr = "XHCI root HUB"; break; default: str_ptr = ""; break; } len = usb_make_str_desc( sc->sc_hub_desc.temp, sizeof(sc->sc_hub_desc.temp), str_ptr); break; default: err = USB_ERR_IOERROR; goto done; } break; case C(UR_GET_INTERFACE, UT_READ_INTERFACE): len = 1; sc->sc_hub_desc.temp[0] = 0; break; case C(UR_GET_STATUS, UT_READ_DEVICE): len = 2; USETW(sc->sc_hub_desc.stat.wStatus, UDS_SELF_POWERED); break; case C(UR_GET_STATUS, UT_READ_INTERFACE): case C(UR_GET_STATUS, UT_READ_ENDPOINT): len = 2; USETW(sc->sc_hub_desc.stat.wStatus, 0); break; case C(UR_SET_ADDRESS, UT_WRITE_DEVICE): if (value >= XHCI_MAX_DEVICES) { err = USB_ERR_IOERROR; goto done; } break; case C(UR_SET_CONFIG, UT_WRITE_DEVICE): if (value != 0 && value != 1) { err = USB_ERR_IOERROR; goto done; } sc->sc_conf = value; break; case C(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE): break; case C(UR_SET_FEATURE, UT_WRITE_DEVICE): case C(UR_SET_FEATURE, UT_WRITE_INTERFACE): case C(UR_SET_FEATURE, UT_WRITE_ENDPOINT): err = USB_ERR_IOERROR; goto done; case C(UR_SET_INTERFACE, UT_WRITE_INTERFACE): break; case C(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT): break; /* Hub requests */ case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_DEVICE): break; case C(UR_CLEAR_FEATURE, UT_WRITE_CLASS_OTHER): DPRINTFN(9, "UR_CLEAR_PORT_FEATURE\n"); if ((index < 1) || (index > sc->sc_noport)) { err = USB_ERR_IOERROR; goto done; } port = XHCI_PORTSC(index); v = XREAD4(sc, oper, port); i = XHCI_PS_PLS_GET(v); v &= ~XHCI_PS_CLEAR; switch (value) { case UHF_C_BH_PORT_RESET: XWRITE4(sc, oper, port, v | XHCI_PS_WRC); break; case UHF_C_PORT_CONFIG_ERROR: XWRITE4(sc, oper, port, v | XHCI_PS_CEC); break; case UHF_C_PORT_SUSPEND: 
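/*
 * UHF_C_PORT_SUSPEND falls through: both it and
 * UHF_C_PORT_LINK_STATE are acknowledged by writing the
 * XHCI_PS_PLC change bit back to PORTSC, which clears it.
 */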
case UHF_C_PORT_LINK_STATE: XWRITE4(sc, oper, port, v | XHCI_PS_PLC); break; case UHF_C_PORT_CONNECTION: XWRITE4(sc, oper, port, v | XHCI_PS_CSC); break; case UHF_C_PORT_ENABLE: XWRITE4(sc, oper, port, v | XHCI_PS_PEC); break; case UHF_C_PORT_OVER_CURRENT: XWRITE4(sc, oper, port, v | XHCI_PS_OCC); break; case UHF_C_PORT_RESET: XWRITE4(sc, oper, port, v | XHCI_PS_PRC); break; case UHF_PORT_ENABLE: XWRITE4(sc, oper, port, v | XHCI_PS_PED); break; case UHF_PORT_POWER: XWRITE4(sc, oper, port, v & ~XHCI_PS_PP); break; case UHF_PORT_INDICATOR: XWRITE4(sc, oper, port, v & ~XHCI_PS_PIC_SET(3)); break; case UHF_PORT_SUSPEND: /* U3 -> U15 */ if (i == 3) { XWRITE4(sc, oper, port, v | XHCI_PS_PLS_SET(0xF) | XHCI_PS_LWS); } /* wait 20ms for resume sequence to complete */ usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 50); /* U0 */ XWRITE4(sc, oper, port, v | XHCI_PS_PLS_SET(0) | XHCI_PS_LWS); break; default: err = USB_ERR_IOERROR; goto done; } break; case C(UR_GET_DESCRIPTOR, UT_READ_CLASS_DEVICE): if ((value & 0xff) != 0) { err = USB_ERR_IOERROR; goto done; } v = XREAD4(sc, capa, XHCI_HCSPARAMS0); sc->sc_hub_desc.hubd = xhci_hubd; sc->sc_hub_desc.hubd.bNbrPorts = sc->sc_noport; if (XHCI_HCS0_PPC(v)) i = UHD_PWR_INDIVIDUAL; else i = UHD_PWR_GANGED; if (XHCI_HCS0_PIND(v)) i |= UHD_PORT_IND; i |= UHD_OC_INDIVIDUAL; USETW(sc->sc_hub_desc.hubd.wHubCharacteristics, i); /* see XHCI section 5.4.9: */ sc->sc_hub_desc.hubd.bPwrOn2PwrGood = 10; for (j = 1; j <= sc->sc_noport; j++) { v = XREAD4(sc, oper, XHCI_PORTSC(j)); if (v & XHCI_PS_DR) { sc->sc_hub_desc.hubd. DeviceRemovable[j / 8] |= 1U << (j % 8); } } len = sc->sc_hub_desc.hubd.bLength; break; case C(UR_GET_STATUS, UT_READ_CLASS_DEVICE): len = 16; memset(sc->sc_hub_desc.temp, 0, 16); break; case C(UR_GET_STATUS, UT_READ_CLASS_OTHER): DPRINTFN(9, "UR_GET_STATUS i=%d\n", index); if ((index < 1) || (index > sc->sc_noport)) { err = USB_ERR_IOERROR; goto done; } v = XREAD4(sc, oper, XHCI_PORTSC(index)); DPRINTFN(9, "port status=0x%08x\n", v); i = UPS_PORT_LINK_STATE_SET(XHCI_PS_PLS_GET(v)); switch (XHCI_PS_SPEED_GET(v)) { case 3: i |= UPS_HIGH_SPEED; break; case 2: i |= UPS_LOW_SPEED; break; case 1: /* FULL speed */ break; default: i |= UPS_OTHER_SPEED; break; } if (v & XHCI_PS_CCS) i |= UPS_CURRENT_CONNECT_STATUS; if (v & XHCI_PS_PED) i |= UPS_PORT_ENABLED; if (v & XHCI_PS_OCA) i |= UPS_OVERCURRENT_INDICATOR; if (v & XHCI_PS_PR) i |= UPS_RESET; if (v & XHCI_PS_PP) { /* * The USB 3.0 RH is using the * USB 2.0's power bit */ i |= UPS_PORT_POWER; } USETW(sc->sc_hub_desc.ps.wPortStatus, i); i = 0; if (v & XHCI_PS_CSC) i |= UPS_C_CONNECT_STATUS; if (v & XHCI_PS_PEC) i |= UPS_C_PORT_ENABLED; if (v & XHCI_PS_OCC) i |= UPS_C_OVERCURRENT_INDICATOR; if (v & XHCI_PS_WRC) i |= UPS_C_BH_PORT_RESET; if (v & XHCI_PS_PRC) i |= UPS_C_PORT_RESET; if (v & XHCI_PS_PLC) i |= UPS_C_PORT_LINK_STATE; if (v & XHCI_PS_CEC) i |= UPS_C_PORT_CONFIG_ERROR; USETW(sc->sc_hub_desc.ps.wPortChange, i); len = sizeof(sc->sc_hub_desc.ps); break; case C(UR_SET_DESCRIPTOR, UT_WRITE_CLASS_DEVICE): err = USB_ERR_IOERROR; goto done; case C(UR_SET_FEATURE, UT_WRITE_CLASS_DEVICE): break; case C(UR_SET_FEATURE, UT_WRITE_CLASS_OTHER): i = index >> 8; index &= 0x00FF; if ((index < 1) || (index > sc->sc_noport)) { err = USB_ERR_IOERROR; goto done; } port = XHCI_PORTSC(index); v = XREAD4(sc, oper, port) & ~XHCI_PS_CLEAR; switch (value) { case UHF_PORT_U1_TIMEOUT: if (XHCI_PS_SPEED_GET(v) != 4) { err = USB_ERR_IOERROR; goto done; } port = XHCI_PORTPMSC(index); v = XREAD4(sc, oper, port); v &= 
~XHCI_PM3_U1TO_SET(0xFF); v |= XHCI_PM3_U1TO_SET(i); XWRITE4(sc, oper, port, v); break; case UHF_PORT_U2_TIMEOUT: if (XHCI_PS_SPEED_GET(v) != 4) { err = USB_ERR_IOERROR; goto done; } port = XHCI_PORTPMSC(index); v = XREAD4(sc, oper, port); v &= ~XHCI_PM3_U2TO_SET(0xFF); v |= XHCI_PM3_U2TO_SET(i); XWRITE4(sc, oper, port, v); break; case UHF_BH_PORT_RESET: XWRITE4(sc, oper, port, v | XHCI_PS_WPR); break; case UHF_PORT_LINK_STATE: XWRITE4(sc, oper, port, v | XHCI_PS_PLS_SET(i) | XHCI_PS_LWS); /* 4ms settle time */ usb_pause_mtx(&sc->sc_bus.bus_mtx, hz / 250); break; case UHF_PORT_ENABLE: DPRINTFN(3, "set port enable %d\n", index); break; case UHF_PORT_SUSPEND: DPRINTFN(6, "suspend port %u (LPM=%u)\n", index, i); j = XHCI_PS_SPEED_GET(v); if ((j < 1) || (j > 3)) { /* non-supported speed */ err = USB_ERR_IOERROR; goto done; } XWRITE4(sc, oper, port, v | XHCI_PS_PLS_SET(i ? 2 /* LPM */ : 3) | XHCI_PS_LWS); break; case UHF_PORT_RESET: DPRINTFN(6, "reset port %d\n", index); XWRITE4(sc, oper, port, v | XHCI_PS_PR); break; case UHF_PORT_POWER: DPRINTFN(3, "set port power %d\n", index); XWRITE4(sc, oper, port, v | XHCI_PS_PP); break; case UHF_PORT_TEST: DPRINTFN(3, "set port test %d\n", index); break; case UHF_PORT_INDICATOR: DPRINTFN(3, "set port indicator %d\n", index); v &= ~XHCI_PS_PIC_SET(3); v |= XHCI_PS_PIC_SET(1); XWRITE4(sc, oper, port, v); break; default: err = USB_ERR_IOERROR; goto done; } break; case C(UR_CLEAR_TT_BUFFER, UT_WRITE_CLASS_OTHER): case C(UR_RESET_TT, UT_WRITE_CLASS_OTHER): case C(UR_GET_TT_STATE, UT_READ_CLASS_OTHER): case C(UR_STOP_TT, UT_WRITE_CLASS_OTHER): break; default: err = USB_ERR_IOERROR; goto done; } done: *plength = len; *pptr = ptr; return (err); } static void xhci_xfer_setup(struct usb_setup_params *parm) { struct usb_page_search page_info; struct usb_page_cache *pc; struct usb_xfer *xfer; void *last_obj; uint32_t ntd; uint32_t n; xfer = parm->curr_xfer; /* * The proof for the "ntd" formula is illustrated like this: * * +------------------------------------+ * | | * | |remainder -> | * | +-----+---+ | * | | xxx | x | frm 0 | * | +-----+---++ | * | | xxx | xx | frm 1 | * | +-----+----+ | * | ... | * +------------------------------------+ * * "xxx" means a completely full USB transfer descriptor * * "x" and "xx" means a short USB packet * * For the remainder of an USB transfer modulo * "max_data_length" we need two USB transfer descriptors. * One to transfer the remaining data and one to finalise with * a zero length packet in case the "force_short_xfer" flag is * set. We only need two USB transfer descriptors in the case * where the transfer length of the first one is a factor of * "max_frame_size". The rest of the needed USB transfer * descriptors is given by the buffer size divided by the * maximum data payload. 
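 *
 * As a worked example of the formulas below: a bulk transfer with a
 * single frame and a buffer four times "max_hc_frame_size" long
 * needs ntd = (2 * 1) + 4 = 6 USB transfer descriptors.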
*/ parm->hc_max_packet_size = 0x400; parm->hc_max_packet_count = 16 * 3; parm->hc_max_frame_size = XHCI_TD_PAYLOAD_MAX; xfer->flags_int.bdma_enable = 1; usbd_transfer_setup_sub(parm); if (xfer->flags_int.isochronous_xfr) { ntd = ((1 * xfer->nframes) + (xfer->max_data_length / xfer->max_hc_frame_size)); } else if (xfer->flags_int.control_xfr) { ntd = ((2 * xfer->nframes) + 1 /* STATUS */ + (xfer->max_data_length / xfer->max_hc_frame_size)); } else { ntd = ((2 * xfer->nframes) + (xfer->max_data_length / xfer->max_hc_frame_size)); } alloc_dma_set: if (parm->err) return; /* * Allocate queue heads and transfer descriptors */ last_obj = NULL; if (usbd_transfer_setup_sub_malloc( parm, &pc, sizeof(struct xhci_td), XHCI_TD_ALIGN, ntd)) { parm->err = USB_ERR_NOMEM; return; } if (parm->buf) { for (n = 0; n != ntd; n++) { struct xhci_td *td; usbd_get_page(pc + n, 0, &page_info); td = page_info.buffer; /* init TD */ td->td_self = page_info.physaddr; td->obj_next = last_obj; td->page_cache = pc + n; last_obj = td; usb_pc_cpu_flush(pc + n); } } xfer->td_start[xfer->flags_int.curr_dma_set] = last_obj; if (!xfer->flags_int.curr_dma_set) { xfer->flags_int.curr_dma_set = 1; goto alloc_dma_set; } } static usb_error_t xhci_configure_reset_endpoint(struct usb_xfer *xfer) { struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus); struct usb_page_search buf_inp; struct usb_device *udev; struct xhci_endpoint_ext *pepext; struct usb_endpoint_descriptor *edesc; struct usb_page_cache *pcinp; usb_error_t err; usb_stream_t stream_id; uint8_t index; uint8_t epno; pepext = xhci_get_endpoint_ext(xfer->xroot->udev, xfer->endpoint->edesc); udev = xfer->xroot->udev; index = udev->controller_slot_id; pcinp = &sc->sc_hw.devs[index].input_pc; usbd_get_page(pcinp, 0, &buf_inp); edesc = xfer->endpoint->edesc; epno = edesc->bEndpointAddress; stream_id = xfer->stream_id; if ((edesc->bmAttributes & UE_XFERTYPE) == UE_CONTROL) epno |= UE_DIR_IN; epno = XHCI_EPNO2EPID(epno); if (epno == 0) return (USB_ERR_NO_PIPE); /* invalid */ XHCI_CMD_LOCK(sc); /* configure endpoint */ err = xhci_configure_endpoint_by_xfer(xfer); if (err != 0) { XHCI_CMD_UNLOCK(sc); return (err); } /* * Get the endpoint into the stopped state according to the * endpoint context state diagram in the XHCI specification: */ err = xhci_cmd_stop_ep(sc, 0, epno, index); if (err != 0) DPRINTF("Could not stop endpoint %u\n", epno); err = xhci_cmd_reset_ep(sc, 0, epno, index); if (err != 0) DPRINTF("Could not reset endpoint %u\n", epno); err = xhci_cmd_set_tr_dequeue_ptr(sc, (pepext->physaddr + (stream_id * sizeof(struct xhci_trb) * XHCI_MAX_TRANSFERS)) | XHCI_EPCTX_2_DCS_SET(1), stream_id, epno, index); if (err != 0) DPRINTF("Could not set dequeue ptr for endpoint %u\n", epno); /* * Get the endpoint into the running state according to the * endpoint context state diagram in the XHCI specification: */ xhci_configure_mask(udev, (1U << epno) | 1U, 0); if (epno > 1) err = xhci_cmd_configure_ep(sc, buf_inp.physaddr, 0, index); else err = xhci_cmd_evaluate_ctx(sc, buf_inp.physaddr, index); if (err != 0) DPRINTF("Could not configure endpoint %u\n", epno); XHCI_CMD_UNLOCK(sc); return (0); } static void xhci_xfer_unsetup(struct usb_xfer *xfer) { return; } static void xhci_start_dma_delay(struct usb_xfer *xfer) { struct xhci_softc *sc = XHCI_BUS2SC(xfer->xroot->bus); /* put transfer on interrupt queue (again) */ usbd_transfer_enqueue(&sc->sc_bus.intr_q, xfer); (void)usb_proc_msignal(USB_BUS_CONTROL_XFER_PROC(&sc->sc_bus), &sc->sc_config_msg[0], &sc->sc_config_msg[1]); } static void 
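/*
 * xhci_configure_msg() is executed by the bus control transfer
 * process when sc_config_msg is signalled.  For every transfer on
 * the interrupt queue whose endpoint is halted or not running it
 * completes the buffered transfers with USB_ERR_TIMEOUT, drops the
 * bus lock to reconfigure and reset the endpoint, and then marks the
 * TRB ring as running again.  A second loop re-inserts the remaining
 * transfers into the hardware schedule.
 */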
xhci_configure_msg(struct usb_proc_msg *pm) { struct xhci_softc *sc; struct xhci_endpoint_ext *pepext; struct usb_xfer *xfer; sc = XHCI_BUS2SC(((struct usb_bus_msg *)pm)->bus); restart: TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) { pepext = xhci_get_endpoint_ext(xfer->xroot->udev, xfer->endpoint->edesc); if ((pepext->trb_halted != 0) || (pepext->trb_running == 0)) { uint16_t i; /* clear halted and running */ pepext->trb_halted = 0; pepext->trb_running = 0; /* nuke remaining buffered transfers */ for (i = 0; i != (XHCI_MAX_TRANSFERS * XHCI_MAX_STREAMS); i++) { /* * NOTE: We need to use the timeout * error code here else existing * isochronous clients can get * confused: */ if (pepext->xfer[i] != NULL) { xhci_device_done(pepext->xfer[i], USB_ERR_TIMEOUT); } } /* * NOTE: The USB transfer cannot vanish in * this state! */ USB_BUS_UNLOCK(&sc->sc_bus); xhci_configure_reset_endpoint(xfer); USB_BUS_LOCK(&sc->sc_bus); /* check if halted is still cleared */ if (pepext->trb_halted == 0) { pepext->trb_running = 1; memset(pepext->trb_index, 0, sizeof(pepext->trb_index)); } goto restart; } if (xfer->flags_int.did_dma_delay) { /* remove transfer from interrupt queue (again) */ usbd_transfer_dequeue(xfer); /* we are finally done */ usb_dma_delay_done_cb(xfer); /* queue changed - restart */ goto restart; } } TAILQ_FOREACH(xfer, &sc->sc_bus.intr_q.head, wait_entry) { /* try to insert xfer on HW queue */ xhci_transfer_insert(xfer); /* try to multi buffer */ xhci_device_generic_multi_enter(xfer->endpoint, xfer->stream_id, NULL); } } static void xhci_ep_init(struct usb_device *udev, struct usb_endpoint_descriptor *edesc, struct usb_endpoint *ep) { struct xhci_endpoint_ext *pepext; DPRINTFN(2, "endpoint=%p, addr=%d, endpt=%d, mode=%d\n", ep, udev->address, edesc->bEndpointAddress, udev->flags.usb_mode); if (udev->parent_hub == NULL) { /* root HUB has special endpoint handling */ return; } ep->methods = &xhci_device_generic_methods; pepext = xhci_get_endpoint_ext(udev, edesc); USB_BUS_LOCK(udev->bus); pepext->trb_halted = 1; pepext->trb_running = 0; USB_BUS_UNLOCK(udev->bus); } static void xhci_ep_uninit(struct usb_device *udev, struct usb_endpoint *ep) { } static void xhci_ep_clear_stall(struct usb_device *udev, struct usb_endpoint *ep) { struct xhci_endpoint_ext *pepext; DPRINTF("\n"); if (udev->flags.usb_mode != USB_MODE_HOST) { /* not supported */ return; } if (udev->parent_hub == NULL) { /* root HUB has special endpoint handling */ return; } pepext = xhci_get_endpoint_ext(udev, ep->edesc); USB_BUS_LOCK(udev->bus); pepext->trb_halted = 1; pepext->trb_running = 0; USB_BUS_UNLOCK(udev->bus); } static usb_error_t xhci_device_init(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); usb_error_t err; uint8_t temp; /* no init for root HUB */ if (udev->parent_hub == NULL) return (0); XHCI_CMD_LOCK(sc); /* set invalid default */ udev->controller_slot_id = sc->sc_noslot + 1; /* try to get a new slot ID from the XHCI */ err = xhci_cmd_enable_slot(sc, &temp); if (err) { XHCI_CMD_UNLOCK(sc); return (err); } if (temp > sc->sc_noslot) { XHCI_CMD_UNLOCK(sc); return (USB_ERR_BAD_ADDRESS); } if (sc->sc_hw.devs[temp].state != XHCI_ST_DISABLED) { DPRINTF("slot %u already allocated.\n", temp); XHCI_CMD_UNLOCK(sc); return (USB_ERR_BAD_ADDRESS); } /* store slot ID for later reference */ udev->controller_slot_id = temp; /* reset data structure */ memset(&sc->sc_hw.devs[temp], 0, sizeof(sc->sc_hw.devs[0])); /* set mark slot allocated */ sc->sc_hw.devs[temp].state = XHCI_ST_ENABLED; err = 
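/*
 * Allocate the per-device contexts and endpoint TRB rings for the
 * newly enabled slot; if that succeeds, the device is moved into the
 * default state by the xhci_set_address(udev, NULL, 0) call below.
 */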
xhci_alloc_device_ext(udev); XHCI_CMD_UNLOCK(sc); /* get device into default state */ if (err == 0) err = xhci_set_address(udev, NULL, 0); return (err); } static void xhci_device_uninit(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); uint8_t index; /* no init for root HUB */ if (udev->parent_hub == NULL) return; XHCI_CMD_LOCK(sc); index = udev->controller_slot_id; if (index <= sc->sc_noslot) { xhci_cmd_disable_slot(sc, index); sc->sc_hw.devs[index].state = XHCI_ST_DISABLED; /* free device extension */ xhci_free_device_ext(udev); } XHCI_CMD_UNLOCK(sc); } static void xhci_get_dma_delay(struct usb_device *udev, uint32_t *pus) { /* * Wait until the hardware has finished any possible use of * the transfer descriptor(s) */ *pus = 2048; /* microseconds */ } static void xhci_device_resume(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); uint8_t index; uint8_t n; uint8_t p; DPRINTF("\n"); /* check for root HUB */ if (udev->parent_hub == NULL) return; index = udev->controller_slot_id; XHCI_CMD_LOCK(sc); /* blindly resume all endpoints */ USB_BUS_LOCK(udev->bus); for (n = 1; n != XHCI_MAX_ENDPOINTS; n++) { for (p = 0; p != XHCI_MAX_STREAMS; p++) { XWRITE4(sc, door, XHCI_DOORBELL(index), n | XHCI_DB_SID_SET(p)); } } USB_BUS_UNLOCK(udev->bus); XHCI_CMD_UNLOCK(sc); } static void xhci_device_suspend(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); uint8_t index; uint8_t n; usb_error_t err; DPRINTF("\n"); /* check for root HUB */ if (udev->parent_hub == NULL) return; index = udev->controller_slot_id; XHCI_CMD_LOCK(sc); /* blindly suspend all endpoints */ for (n = 1; n != XHCI_MAX_ENDPOINTS; n++) { err = xhci_cmd_stop_ep(sc, 1, n, index); if (err != 0) { DPRINTF("Failed to suspend endpoint " "%u on slot %u (ignored).\n", n, index); } } XHCI_CMD_UNLOCK(sc); } static void xhci_set_hw_power(struct usb_bus *bus) { DPRINTF("\n"); } static void xhci_device_state_change(struct usb_device *udev) { struct xhci_softc *sc = XHCI_BUS2SC(udev->bus); struct usb_page_search buf_inp; usb_error_t err; uint8_t index; /* check for root HUB */ if (udev->parent_hub == NULL) return; index = udev->controller_slot_id; DPRINTF("\n"); if (usb_get_device_state(udev) == USB_STATE_CONFIGURED) { err = uhub_query_info(udev, &sc->sc_hw.devs[index].nports, &sc->sc_hw.devs[index].tt); if (err != 0) sc->sc_hw.devs[index].nports = 0; } XHCI_CMD_LOCK(sc); switch (usb_get_device_state(udev)) { case USB_STATE_POWERED: if (sc->sc_hw.devs[index].state == XHCI_ST_DEFAULT) break; /* set default state */ sc->sc_hw.devs[index].state = XHCI_ST_DEFAULT; /* reset number of contexts */ sc->sc_hw.devs[index].context_num = 0; err = xhci_cmd_reset_dev(sc, index); if (err != 0) { DPRINTF("Device reset failed " "for slot %u.\n", index); } break; case USB_STATE_ADDRESSED: if (sc->sc_hw.devs[index].state == XHCI_ST_ADDRESSED) break; sc->sc_hw.devs[index].state = XHCI_ST_ADDRESSED; /* set configure mask to slot only */ xhci_configure_mask(udev, 1, 0); /* deconfigure all endpoints, except EP0 */ err = xhci_cmd_configure_ep(sc, 0, 1, index); if (err) { DPRINTF("Failed to deconfigure " "slot %u.\n", index); } break; case USB_STATE_CONFIGURED: if (sc->sc_hw.devs[index].state == XHCI_ST_CONFIGURED) break; /* set configured state */ sc->sc_hw.devs[index].state = XHCI_ST_CONFIGURED; /* reset number of contexts */ sc->sc_hw.devs[index].context_num = 0; usbd_get_page(&sc->sc_hw.devs[index].input_pc, 0, &buf_inp); xhci_configure_mask(udev, 3, 0); err = xhci_configure_device(udev); if (err != 0) 
{ DPRINTF("Could not configure device " "at slot %u.\n", index); } err = xhci_cmd_evaluate_ctx(sc, buf_inp.physaddr, index); if (err != 0) { DPRINTF("Could not evaluate device " "context at slot %u.\n", index); } break; default: break; } XHCI_CMD_UNLOCK(sc); } static usb_error_t xhci_set_endpoint_mode(struct usb_device *udev, struct usb_endpoint *ep, uint8_t ep_mode) { switch (ep_mode) { case USB_EP_MODE_DEFAULT: return (0); case USB_EP_MODE_STREAMS: if (xhcistreams == 0 || (ep->edesc->bmAttributes & UE_XFERTYPE) != UE_BULK || udev->speed != USB_SPEED_SUPER) return (USB_ERR_INVAL); return (0); default: return (USB_ERR_INVAL); } } static const struct usb_bus_methods xhci_bus_methods = { .endpoint_init = xhci_ep_init, .endpoint_uninit = xhci_ep_uninit, .xfer_setup = xhci_xfer_setup, .xfer_unsetup = xhci_xfer_unsetup, .get_dma_delay = xhci_get_dma_delay, .device_init = xhci_device_init, .device_uninit = xhci_device_uninit, .device_resume = xhci_device_resume, .device_suspend = xhci_device_suspend, .set_hw_power = xhci_set_hw_power, .roothub_exec = xhci_roothub_exec, .xfer_poll = xhci_do_poll, .start_dma_delay = xhci_start_dma_delay, .set_address = xhci_set_address, .clear_stall = xhci_ep_clear_stall, .device_state_change = xhci_device_state_change, .set_hw_power_sleep = xhci_set_hw_power_sleep, .set_endpoint_mode = xhci_set_endpoint_mode, }; Index: projects/clang900-import/sys/dev/usb/usb_bus.h =================================================================== --- projects/clang900-import/sys/dev/usb/usb_bus.h (revision 352586) +++ projects/clang900-import/sys/dev/usb/usb_bus.h (revision 352587) @@ -1,136 +1,137 @@ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _USB_BUS_H_ #define _USB_BUS_H_ struct usb_fs_privdata; /* * The following structure defines the USB explore message sent to the USB * explore process. */ struct usb_bus_msg { struct usb_proc_msg hdr; struct usb_bus *bus; }; /* * The following structure defines the USB statistics structure. */ struct usb_bus_stat { uint32_t uds_requests[4]; }; /* * The following structure defines an USB BUS. There is one USB BUS * for every Host or Device controller. 
*/ struct usb_bus { struct usb_bus_stat stats_err; struct usb_bus_stat stats_ok; #if USB_HAVE_ROOT_MOUNT_HOLD struct root_hold_token *bus_roothold; #endif /* convenience macros */ #define USB_BUS_TT_PROC(bus) USB_BUS_NON_GIANT_ISOC_PROC(bus) #define USB_BUS_CS_PROC(bus) USB_BUS_NON_GIANT_ISOC_PROC(bus) #if USB_HAVE_PER_BUS_PROCESS #define USB_BUS_GIANT_PROC(bus) (&(bus)->giant_callback_proc) #define USB_BUS_NON_GIANT_ISOC_PROC(bus) (&(bus)->non_giant_isoc_callback_proc) #define USB_BUS_NON_GIANT_BULK_PROC(bus) (&(bus)->non_giant_bulk_callback_proc) #define USB_BUS_EXPLORE_PROC(bus) (&(bus)->explore_proc) #define USB_BUS_CONTROL_XFER_PROC(bus) (&(bus)->control_xfer_proc) /* * There are three callback processes. One for Giant locked * callbacks. One for non-Giant locked non-periodic callbacks * and one for non-Giant locked periodic callbacks. This * should avoid congestion and reduce response time in most * cases. */ struct usb_process giant_callback_proc; struct usb_process non_giant_isoc_callback_proc; struct usb_process non_giant_bulk_callback_proc; /* Explore process */ struct usb_process explore_proc; /* Control request process */ struct usb_process control_xfer_proc; #endif struct usb_bus_msg explore_msg[2]; struct usb_bus_msg detach_msg[2]; struct usb_bus_msg attach_msg[2]; struct usb_bus_msg suspend_msg[2]; struct usb_bus_msg resume_msg[2]; struct usb_bus_msg reset_msg[2]; struct usb_bus_msg shutdown_msg[2]; #if USB_HAVE_UGEN struct usb_bus_msg cleanup_msg[2]; LIST_HEAD(,usb_fs_privdata) pd_cleanup_list; #endif /* * This mutex protects the USB hardware: */ struct mtx bus_mtx; struct mtx bus_spin_lock; struct usb_xfer_queue intr_q; struct usb_callout power_wdog; /* power management */ device_t parent; device_t bdev; /* filled by HC driver */ #if USB_HAVE_BUSDMA struct usb_dma_parent_tag dma_parent_tag[1]; struct usb_dma_tag dma_tags[USB_BUS_DMA_TAG_MAX]; #endif const struct usb_bus_methods *methods; /* filled by HC driver */ struct usb_device **devices; struct ifnet *ifp; /* only for USB Packet Filter */ usb_power_mask_t hw_power_state; /* see USB_HW_POWER_XXX */ usb_size_t uframe_usage[USB_HS_MICRO_FRAMES_MAX]; uint16_t isoc_time_last; /* in milliseconds */ uint8_t alloc_failed; /* Set if memory allocation failed. */ uint8_t driver_added_refcount; /* Current driver generation count */ enum usb_revision usbrev; /* USB revision. See "USB_REV_XXX". */ uint8_t devices_max; /* maximum number of USB devices */ uint8_t do_probe; /* set if USB should be re-probed */ uint8_t no_explore; /* don't explore USB ports */ uint8_t dma_bits; /* number of DMA address lines */ + uint8_t control_ep_quirk; /* need 64kByte buffer for data stage */ }; #endif /* _USB_BUS_H_ */ Index: projects/clang900-import/sys/dev/usb/usb_ioctl.h =================================================================== --- projects/clang900-import/sys/dev/usb/usb_ioctl.h (revision 352586) +++ projects/clang900-import/sys/dev/usb/usb_ioctl.h (revision 352587) @@ -1,349 +1,349 @@ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. * Copyright (c) 1998 The NetBSD Foundation, Inc. All rights reserved. * Copyright (c) 1998 Lennart Augustsson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _USB_IOCTL_H_ #define _USB_IOCTL_H_ #ifndef USB_GLOBAL_INCLUDE_FILE #include #include /* Building "kdump" depends on these includes */ #include #include #endif #define USB_DEVICE_NAME "usbctl" #define USB_DEVICE_DIR "usb" #define USB_GENERIC_NAME "ugen" #define USB_TEMPLATE_SYSCTL "hw.usb.template" /* integer type */ /* * Align IOCTL structures to hide differences when running 32-bit * programs under 64-bit kernels: */ #ifdef COMPAT_32BIT #define USB_IOCTL_STRUCT_ALIGN(n) __aligned(n) #else #define USB_IOCTL_STRUCT_ALIGN(n) #endif /* Definition of valid template sysctl values */ enum { USB_TEMP_MSC, /* USB Mass Storage */ USB_TEMP_CDCE, /* USB CDC Ethernet */ USB_TEMP_MTP, /* Message Transfer Protocol */ USB_TEMP_MODEM, /* USB CDC Modem */ USB_TEMP_AUDIO, /* USB Audio */ USB_TEMP_KBD, /* USB Keyboard */ USB_TEMP_MOUSE, /* USB Mouse */ USB_TEMP_PHONE, /* USB Phone */ USB_TEMP_SERIALNET, /* USB CDC Ethernet and Modem */ USB_TEMP_MIDI, /* USB MIDI */ USB_TEMP_MULTI, /* USB Ethernet, serial, and storage */ USB_TEMP_CDCEEM, /* USB Ethernet Emulation Model */ USB_TEMP_MAX, }; struct usb_read_dir { #ifdef COMPAT_32BIT uint64_t urd_data; #else void *urd_data; #endif uint32_t urd_startentry; uint32_t urd_maxlen; } USB_IOCTL_STRUCT_ALIGN(8); struct usb_ctl_request { #ifdef COMPAT_32BIT uint64_t ucr_data; #else void *ucr_data; #endif uint16_t ucr_flags; uint16_t ucr_actlen; /* actual length transferred */ uint8_t ucr_addr; /* zero - currently not used */ struct usb_device_request ucr_request; } USB_IOCTL_STRUCT_ALIGN(8); struct usb_alt_interface { uint8_t uai_interface_index; uint8_t uai_alt_index; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_gen_descriptor { #ifdef COMPAT_32BIT uint64_t ugd_data; #else void *ugd_data; #endif uint16_t ugd_lang_id; uint16_t ugd_maxlen; uint16_t ugd_actlen; uint16_t ugd_offset; uint8_t ugd_config_index; uint8_t ugd_string_index; uint8_t ugd_iface_index; uint8_t ugd_altif_index; uint8_t ugd_endpt_index; uint8_t ugd_report_type; uint8_t reserved[8]; } USB_IOCTL_STRUCT_ALIGN(8); struct usb_device_info { uint16_t udi_productNo; uint16_t udi_vendorNo; uint16_t udi_releaseNo; uint16_t udi_power; /* power consumption in mA, 0 if * selfpowered */ uint8_t udi_bus; uint8_t udi_addr; /* device address */ uint8_t udi_index; /* device index */ uint8_t udi_class; uint8_t udi_subclass; uint8_t udi_protocol; uint8_t udi_config_no; /* current config number */ uint8_t udi_config_index; /* current config index */ uint8_t udi_speed; /* see "USB_SPEED_XXX" */ uint8_t udi_mode; /* see "USB_MODE_XXX" */ uint8_t udi_nports; uint8_t 
udi_hubaddr; /* parent HUB address */ uint8_t udi_hubindex; /* parent HUB device index */ uint8_t udi_hubport; /* parent HUB port */ uint8_t udi_power_mode; /* see "USB_POWER_MODE_XXX" */ uint8_t udi_suspended; /* set if device is suspended */ uint8_t udi_reserved[16]; /* leave space for the future */ char udi_product[128]; char udi_vendor[128]; char udi_serial[64]; char udi_release[8]; } USB_IOCTL_STRUCT_ALIGN(2); #define USB_DEVICE_PORT_PATH_MAX 32 struct usb_device_port_path { uint8_t udp_bus; /* which bus we are on */ uint8_t udp_index; /* which device index */ uint8_t udp_port_level; /* how many levels: 0, 1, 2 ... */ uint8_t udp_port_no[USB_DEVICE_PORT_PATH_MAX]; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_device_stats { uint32_t uds_requests_ok[4]; /* Indexed by transfer type UE_XXX */ uint32_t uds_requests_fail[4]; /* Indexed by transfer type UE_XXX */ } USB_IOCTL_STRUCT_ALIGN(4); struct usb_fs_start { uint8_t ep_index; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_fs_stop { uint8_t ep_index; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_fs_complete { uint8_t ep_index; } USB_IOCTL_STRUCT_ALIGN(1); /* This structure is used for all endpoint types */ struct usb_fs_endpoint { /* * NOTE: isochronous USB transfer only use one buffer, but can have * multiple frame lengths ! */ #ifdef COMPAT_32BIT uint64_t ppBuffer; uint64_t pLength; #else void **ppBuffer; /* pointer to userland buffers */ uint32_t *pLength; /* pointer to frame lengths, updated * to actual length */ #endif uint32_t nFrames; /* number of frames */ uint32_t aFrames; /* actual number of frames */ uint16_t flags; /* a single short frame will terminate */ #define USB_FS_FLAG_SINGLE_SHORT_OK 0x0001 /* multiple short frames are allowed */ #define USB_FS_FLAG_MULTI_SHORT_OK 0x0002 /* all frame(s) transmitted are short terminated */ #define USB_FS_FLAG_FORCE_SHORT 0x0004 /* will do a clear-stall before xfer */ #define USB_FS_FLAG_CLEAR_STALL 0x0008 uint16_t timeout; /* in milliseconds */ /* isocronous completion time in milliseconds - used for echo cancel */ uint16_t isoc_time_complete; /* timeout value for no timeout */ #define USB_FS_TIMEOUT_NONE 0 int status; /* see USB_ERR_XXX */ } USB_IOCTL_STRUCT_ALIGN(8); struct usb_fs_init { /* userland pointer to endpoints structure */ #ifdef COMPAT_32BIT uint64_t pEndpoints; #else struct usb_fs_endpoint *pEndpoints; #endif /* maximum number of endpoints */ uint8_t ep_index_max; } USB_IOCTL_STRUCT_ALIGN(8); struct usb_fs_uninit { uint8_t dummy; /* zero */ } USB_IOCTL_STRUCT_ALIGN(1); struct usb_fs_open { -#define USB_FS_MAX_BUFSIZE (1 << 18) +#define USB_FS_MAX_BUFSIZE (1 << 25) /* 32 MBytes */ uint32_t max_bufsize; #define USB_FS_MAX_FRAMES (1U << 12) #define USB_FS_MAX_FRAMES_PRE_SCALE (1U << 31) /* for ISOCHRONOUS transfers */ uint32_t max_frames; /* read and write */ uint16_t max_packet_length; /* read only */ uint8_t dev_index; /* currently unused */ uint8_t ep_index; uint8_t ep_no; /* bEndpointNumber */ } USB_IOCTL_STRUCT_ALIGN(4); struct usb_fs_open_stream { struct usb_fs_open fs_open; uint16_t stream_id; /* stream ID */ } USB_IOCTL_STRUCT_ALIGN(4); struct usb_fs_close { uint8_t ep_index; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_fs_clear_stall_sync { uint8_t ep_index; } USB_IOCTL_STRUCT_ALIGN(1); struct usb_gen_quirk { uint16_t index; /* Quirk Index */ uint16_t vid; /* Vendor ID */ uint16_t pid; /* Product ID */ uint16_t bcdDeviceLow; /* Low Device Revision */ uint16_t bcdDeviceHigh; /* High Device Revision */ uint16_t reserved[2]; /* * String version of quirk including terminating zero. 
See * UQ_XXX in "usb_quirk.h". */ char quirkname[64 - 14]; } USB_IOCTL_STRUCT_ALIGN(2); /* USB controller */ #define USB_REQUEST _IOWR('U', 1, struct usb_ctl_request) #define USB_SETDEBUG _IOW ('U', 2, int) #define USB_DISCOVER _IO ('U', 3) #define USB_DEVICEINFO _IOWR('U', 4, struct usb_device_info) #define USB_DEVICESTATS _IOR ('U', 5, struct usb_device_stats) #define USB_DEVICEENUMERATE _IOW ('U', 6, int) /* Generic HID device */ #define USB_GET_REPORT_DESC _IOWR('U', 21, struct usb_gen_descriptor) #define USB_SET_IMMED _IOW ('U', 22, int) #define USB_GET_REPORT _IOWR('U', 23, struct usb_gen_descriptor) #define USB_SET_REPORT _IOW ('U', 24, struct usb_gen_descriptor) #define USB_GET_REPORT_ID _IOR ('U', 25, int) /* Generic USB device */ #define USB_GET_CONFIG _IOR ('U', 100, int) #define USB_SET_CONFIG _IOW ('U', 101, int) #define USB_GET_ALTINTERFACE _IOWR('U', 102, struct usb_alt_interface) #define USB_SET_ALTINTERFACE _IOWR('U', 103, struct usb_alt_interface) #define USB_GET_DEVICE_DESC _IOR ('U', 105, struct usb_device_descriptor) #define USB_GET_CONFIG_DESC _IOR ('U', 106, struct usb_config_descriptor) #define USB_GET_RX_INTERFACE_DESC _IOR ('U', 107, struct usb_interface_descriptor) #define USB_GET_RX_ENDPOINT_DESC _IOR ('U', 108, struct usb_endpoint_descriptor) #define USB_GET_FULL_DESC _IOWR('U', 109, struct usb_gen_descriptor) #define USB_GET_STRING_DESC _IOWR('U', 110, struct usb_gen_descriptor) #define USB_DO_REQUEST _IOWR('U', 111, struct usb_ctl_request) #define USB_GET_DEVICEINFO _IOR ('U', 112, struct usb_device_info) #define USB_SET_RX_SHORT_XFER _IOW ('U', 113, int) #define USB_SET_RX_TIMEOUT _IOW ('U', 114, int) #define USB_GET_RX_FRAME_SIZE _IOR ('U', 115, int) #define USB_GET_RX_BUFFER_SIZE _IOR ('U', 117, int) #define USB_SET_RX_BUFFER_SIZE _IOW ('U', 118, int) #define USB_SET_RX_STALL_FLAG _IOW ('U', 119, int) #define USB_SET_TX_STALL_FLAG _IOW ('U', 120, int) #define USB_GET_IFACE_DRIVER _IOWR('U', 121, struct usb_gen_descriptor) #define USB_CLAIM_INTERFACE _IOW ('U', 122, int) #define USB_RELEASE_INTERFACE _IOW ('U', 123, int) #define USB_IFACE_DRIVER_ACTIVE _IOW ('U', 124, int) #define USB_IFACE_DRIVER_DETACH _IOW ('U', 125, int) #define USB_GET_PLUGTIME _IOR ('U', 126, uint32_t) #define USB_READ_DIR _IOW ('U', 127, struct usb_read_dir) /* 128 - 133 unused */ #define USB_GET_DEV_PORT_PATH _IOR ('U', 134, struct usb_device_port_path) #define USB_GET_POWER_USAGE _IOR ('U', 135, int) #define USB_SET_TX_FORCE_SHORT _IOW ('U', 136, int) #define USB_SET_TX_TIMEOUT _IOW ('U', 137, int) #define USB_GET_TX_FRAME_SIZE _IOR ('U', 138, int) #define USB_GET_TX_BUFFER_SIZE _IOR ('U', 139, int) #define USB_SET_TX_BUFFER_SIZE _IOW ('U', 140, int) #define USB_GET_TX_INTERFACE_DESC _IOR ('U', 141, struct usb_interface_descriptor) #define USB_GET_TX_ENDPOINT_DESC _IOR ('U', 142, struct usb_endpoint_descriptor) #define USB_SET_PORT_ENABLE _IOW ('U', 143, int) #define USB_SET_PORT_DISABLE _IOW ('U', 144, int) #define USB_SET_POWER_MODE _IOW ('U', 145, int) #define USB_GET_POWER_MODE _IOR ('U', 146, int) #define USB_SET_TEMPLATE _IOW ('U', 147, int) #define USB_GET_TEMPLATE _IOR ('U', 148, int) /* Modem device */ #define USB_GET_CM_OVER_DATA _IOR ('U', 180, int) #define USB_SET_CM_OVER_DATA _IOW ('U', 181, int) /* GPIO control */ #define USB_GET_GPIO _IOR ('U', 182, int) #define USB_SET_GPIO _IOW ('U', 183, int) /* USB file system interface */ #define USB_FS_START _IOW ('U', 192, struct usb_fs_start) #define USB_FS_STOP _IOW ('U', 193, struct usb_fs_stop) #define USB_FS_COMPLETE _IOR 
('U', 194, struct usb_fs_complete) #define USB_FS_INIT _IOW ('U', 195, struct usb_fs_init) #define USB_FS_UNINIT _IOW ('U', 196, struct usb_fs_uninit) #define USB_FS_OPEN _IOWR('U', 197, struct usb_fs_open) #define USB_FS_CLOSE _IOW ('U', 198, struct usb_fs_close) #define USB_FS_CLEAR_STALL_SYNC _IOW ('U', 199, struct usb_fs_clear_stall_sync) #define USB_FS_OPEN_STREAM _IOWR('U', 200, struct usb_fs_open_stream) /* USB quirk system interface */ #define USB_DEV_QUIRK_GET _IOWR('Q', 0, struct usb_gen_quirk) #define USB_QUIRK_NAME_GET _IOWR('Q', 1, struct usb_gen_quirk) #define USB_DEV_QUIRK_ADD _IOW ('Q', 2, struct usb_gen_quirk) #define USB_DEV_QUIRK_REMOVE _IOW ('Q', 3, struct usb_gen_quirk) #endif /* _USB_IOCTL_H_ */ Index: projects/clang900-import/sys/dev/usb/usb_transfer.c =================================================================== --- projects/clang900-import/sys/dev/usb/usb_transfer.c (revision 352586) +++ projects/clang900-import/sys/dev/usb/usb_transfer.c (revision 352587) @@ -1,3556 +1,3585 @@ /* $FreeBSD$ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifdef USB_GLOBAL_INCLUDE_FILE #include USB_GLOBAL_INCLUDE_FILE #else #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define USB_DEBUG_VAR usb_debug #include #include #include #include #include #include #include #include #include #include #endif /* USB_GLOBAL_INCLUDE_FILE */ struct usb_std_packet_size { struct { uint16_t min; /* inclusive */ uint16_t max; /* inclusive */ } range; uint16_t fixed[4]; }; static usb_callback_t usb_request_callback; static const struct usb_config usb_control_ep_cfg[USB_CTRL_XFER_MAX] = { /* This transfer is used for generic control endpoint transfers */ [0] = { .type = UE_CONTROL, .endpoint = 0x00, /* Control endpoint */ .direction = UE_DIR_ANY, .bufsize = USB_EP0_BUFSIZE, /* bytes */ .flags = {.proxy_buffer = 1,}, .callback = &usb_request_callback, .usb_mode = USB_MODE_DUAL, /* both modes */ }, /* This transfer is used for generic clear stall only */ [1] = { .type = UE_CONTROL, .endpoint = 0x00, /* Control pipe */ .direction = UE_DIR_ANY, .bufsize = sizeof(struct usb_device_request), .callback = &usb_do_clear_stall_callback, .timeout = 1000, /* 1 second */ .interval = 50, /* 50ms */ .usb_mode = USB_MODE_HOST, }, }; +static const struct usb_config usb_control_ep_quirk_cfg[USB_CTRL_XFER_MAX] = { + + /* This transfer is used for generic control endpoint transfers */ + + [0] = { + .type = UE_CONTROL, + .endpoint = 0x00, /* Control endpoint */ + .direction = UE_DIR_ANY, + .bufsize = 65535, /* bytes */ + .callback = &usb_request_callback, + .usb_mode = USB_MODE_DUAL, /* both modes */ + }, + + /* This transfer is used for generic clear stall only */ + + [1] = { + .type = UE_CONTROL, + .endpoint = 0x00, /* Control pipe */ + .direction = UE_DIR_ANY, + .bufsize = sizeof(struct usb_device_request), + .callback = &usb_do_clear_stall_callback, + .timeout = 1000, /* 1 second */ + .interval = 50, /* 50ms */ + .usb_mode = USB_MODE_HOST, + }, +}; + /* function prototypes */ static void usbd_update_max_frame_size(struct usb_xfer *); static void usbd_transfer_unsetup_sub(struct usb_xfer_root *, uint8_t); static void usbd_control_transfer_init(struct usb_xfer *); static int usbd_setup_ctrl_transfer(struct usb_xfer *); static void usb_callback_proc(struct usb_proc_msg *); static void usbd_callback_ss_done_defer(struct usb_xfer *); static void usbd_callback_wrapper(struct usb_xfer_queue *); static void usbd_transfer_start_cb(void *); static uint8_t usbd_callback_wrapper_sub(struct usb_xfer *); static void usbd_get_std_packet_size(struct usb_std_packet_size *ptr, uint8_t type, enum usb_dev_speed speed); /*------------------------------------------------------------------------* * usb_request_callback *------------------------------------------------------------------------*/ static void usb_request_callback(struct usb_xfer *xfer, usb_error_t error) { if (xfer->flags_int.usb_mode == USB_MODE_DEVICE) usb_handle_request_callback(xfer, error); else usbd_do_request_callback(xfer, error); } /*------------------------------------------------------------------------* * usbd_update_max_frame_size * * This function updates the maximum frame size, hence high speed USB * can transfer multiple consecutive packets. 
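 *
 * The value computed below is simply "max_packet_size" multiplied by
 * "max_packet_count"; a high speed, high bandwidth endpoint can for
 * example transfer up to 3 * 1024 = 3072 bytes per microframe.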
*------------------------------------------------------------------------*/ static void usbd_update_max_frame_size(struct usb_xfer *xfer) { /* compute maximum frame size */ /* this computation should not overflow 16-bit */ /* max = 15 * 1024 */ xfer->max_frame_size = xfer->max_packet_size * xfer->max_packet_count; } /*------------------------------------------------------------------------* * usbd_get_dma_delay * * The following function is called when we need to * synchronize with DMA hardware. * * Returns: * 0: no DMA delay required * Else: milliseconds of DMA delay *------------------------------------------------------------------------*/ usb_timeout_t usbd_get_dma_delay(struct usb_device *udev) { const struct usb_bus_methods *mtod; uint32_t temp; mtod = udev->bus->methods; temp = 0; if (mtod->get_dma_delay) { (mtod->get_dma_delay) (udev, &temp); /* * Round up and convert to milliseconds. Note that we use * 1024 milliseconds per second. to save a division. */ temp += 0x3FF; temp /= 0x400; } return (temp); } /*------------------------------------------------------------------------* * usbd_transfer_setup_sub_malloc * * This function will allocate one or more DMA'able memory chunks * according to "size", "align" and "count" arguments. "ppc" is * pointed to a linear array of USB page caches afterwards. * * If the "align" argument is equal to "1" a non-contiguous allocation * can happen. Else if the "align" argument is greater than "1", the * allocation will always be contiguous in memory. * * Returns: * 0: Success * Else: Failure *------------------------------------------------------------------------*/ #if USB_HAVE_BUSDMA uint8_t usbd_transfer_setup_sub_malloc(struct usb_setup_params *parm, struct usb_page_cache **ppc, usb_size_t size, usb_size_t align, usb_size_t count) { struct usb_page_cache *pc; struct usb_page *pg; void *buf; usb_size_t n_dma_pc; usb_size_t n_dma_pg; usb_size_t n_obj; usb_size_t x; usb_size_t y; usb_size_t r; usb_size_t z; USB_ASSERT(align > 0, ("Invalid alignment, 0x%08x\n", align)); USB_ASSERT(size > 0, ("Invalid size = 0\n")); if (count == 0) { return (0); /* nothing to allocate */ } /* * Make sure that the size is aligned properly. */ size = -((-size) & (-align)); /* * Try multi-allocation chunks to reduce the number of DMA * allocations, hence DMA allocations are slow. */ if (align == 1) { /* special case - non-cached multi page DMA memory */ n_dma_pc = count; n_dma_pg = (2 + (size / USB_PAGE_SIZE)); n_obj = 1; } else if (size >= USB_PAGE_SIZE) { n_dma_pc = count; n_dma_pg = 1; n_obj = 1; } else { /* compute number of objects per page */ #ifdef USB_DMA_SINGLE_ALLOC n_obj = 1; #else n_obj = (USB_PAGE_SIZE / size); #endif /* * Compute number of DMA chunks, rounded up * to nearest one: */ n_dma_pc = howmany(count, n_obj); n_dma_pg = 1; } /* * DMA memory is allocated once, but mapped twice. That's why * there is one list for auto-free and another list for * non-auto-free which only holds the mapping and not the * allocation. 
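 *
 * As an illustration of the chunking above (assuming USB_PAGE_SIZE
 * is 4096 and USB_DMA_SINGLE_ALLOC is not defined): a request for
 * count = 32 objects of size = 512 with align = 512 yields
 * n_obj = 8 objects per chunk, hence n_dma_pc = 4 DMA chunks of one
 * page each (n_dma_pg = 1).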
*/ if (parm->buf == NULL) { /* reserve memory (auto-free) */ parm->dma_page_ptr += n_dma_pc * n_dma_pg; parm->dma_page_cache_ptr += n_dma_pc; /* reserve memory (no-auto-free) */ parm->dma_page_ptr += count * n_dma_pg; parm->xfer_page_cache_ptr += count; return (0); } for (x = 0; x != n_dma_pc; x++) { /* need to initialize the page cache */ parm->dma_page_cache_ptr[x].tag_parent = &parm->curr_xfer->xroot->dma_parent_tag; } for (x = 0; x != count; x++) { /* need to initialize the page cache */ parm->xfer_page_cache_ptr[x].tag_parent = &parm->curr_xfer->xroot->dma_parent_tag; } if (ppc != NULL) { if (n_obj != 1) *ppc = parm->xfer_page_cache_ptr; else *ppc = parm->dma_page_cache_ptr; } r = count; /* set remainder count */ z = n_obj * size; /* set allocation size */ pc = parm->xfer_page_cache_ptr; pg = parm->dma_page_ptr; if (n_obj == 1) { /* * Avoid mapping memory twice if only a single object * should be allocated per page cache: */ for (x = 0; x != n_dma_pc; x++) { if (usb_pc_alloc_mem(parm->dma_page_cache_ptr, pg, z, align)) { return (1); /* failure */ } /* Make room for one DMA page cache and "n_dma_pg" pages */ parm->dma_page_cache_ptr++; pg += n_dma_pg; } } else { for (x = 0; x != n_dma_pc; x++) { if (r < n_obj) { /* compute last remainder */ z = r * size; n_obj = r; } if (usb_pc_alloc_mem(parm->dma_page_cache_ptr, pg, z, align)) { return (1); /* failure */ } /* Set beginning of current buffer */ buf = parm->dma_page_cache_ptr->buffer; /* Make room for one DMA page cache and "n_dma_pg" pages */ parm->dma_page_cache_ptr++; pg += n_dma_pg; for (y = 0; (y != n_obj); y++, r--, pc++, pg += n_dma_pg) { /* Load sub-chunk into DMA */ if (usb_pc_dmamap_create(pc, size)) { return (1); /* failure */ } pc->buffer = USB_ADD_BYTES(buf, y * size); pc->page_start = pg; USB_MTX_LOCK(pc->tag_parent->mtx); if (usb_pc_load_mem(pc, size, 1 /* synchronous */ )) { USB_MTX_UNLOCK(pc->tag_parent->mtx); return (1); /* failure */ } USB_MTX_UNLOCK(pc->tag_parent->mtx); } } } parm->xfer_page_cache_ptr = pc; parm->dma_page_ptr = pg; return (0); } #endif /*------------------------------------------------------------------------* * usbd_transfer_setup_sub - transfer setup subroutine * * This function must be called from the "xfer_setup" callback of the * USB Host or Device controller driver when setting up an USB * transfer. This function will setup correct packet sizes, buffer * sizes, flags and more, that are stored in the "usb_xfer" * structure. *------------------------------------------------------------------------*/ void usbd_transfer_setup_sub(struct usb_setup_params *parm) { enum { REQ_SIZE = 8, MIN_PKT = 8, }; struct usb_xfer *xfer = parm->curr_xfer; const struct usb_config *setup = parm->curr_setup; struct usb_endpoint_ss_comp_descriptor *ecomp; struct usb_endpoint_descriptor *edesc; struct usb_std_packet_size std_size; usb_frcount_t n_frlengths; usb_frcount_t n_frbuffers; usb_frcount_t x; uint16_t maxp_old; uint8_t type; uint8_t zmps; /* * Sanity check. The following parameters must be initialized before * calling this function. 
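 * The required fields are "hc_max_packet_size",
 * "hc_max_packet_count" and "hc_max_frame_size", which the host
 * controller driver fills in from its "xfer_setup" method; all three
 * are checked for zero right below.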
*/ if ((parm->hc_max_packet_size == 0) || (parm->hc_max_packet_count == 0) || (parm->hc_max_frame_size == 0)) { parm->err = USB_ERR_INVAL; goto done; } edesc = xfer->endpoint->edesc; ecomp = xfer->endpoint->ecomp; type = (edesc->bmAttributes & UE_XFERTYPE); xfer->flags = setup->flags; xfer->nframes = setup->frames; xfer->timeout = setup->timeout; xfer->callback = setup->callback; xfer->interval = setup->interval; xfer->endpointno = edesc->bEndpointAddress; xfer->max_packet_size = UGETW(edesc->wMaxPacketSize); xfer->max_packet_count = 1; /* make a shadow copy: */ xfer->flags_int.usb_mode = parm->udev->flags.usb_mode; parm->bufsize = setup->bufsize; switch (parm->speed) { case USB_SPEED_HIGH: switch (type) { case UE_ISOCHRONOUS: case UE_INTERRUPT: xfer->max_packet_count += (xfer->max_packet_size >> 11) & 3; /* check for invalid max packet count */ if (xfer->max_packet_count > 3) xfer->max_packet_count = 3; break; default: break; } xfer->max_packet_size &= 0x7FF; break; case USB_SPEED_SUPER: xfer->max_packet_count += (xfer->max_packet_size >> 11) & 3; if (ecomp != NULL) xfer->max_packet_count += ecomp->bMaxBurst; if ((xfer->max_packet_count == 0) || (xfer->max_packet_count > 16)) xfer->max_packet_count = 16; switch (type) { case UE_CONTROL: xfer->max_packet_count = 1; break; case UE_ISOCHRONOUS: if (ecomp != NULL) { uint8_t mult; mult = UE_GET_SS_ISO_MULT( ecomp->bmAttributes) + 1; if (mult > 3) mult = 3; xfer->max_packet_count *= mult; } break; default: break; } xfer->max_packet_size &= 0x7FF; break; default: break; } /* range check "max_packet_count" */ if (xfer->max_packet_count > parm->hc_max_packet_count) { xfer->max_packet_count = parm->hc_max_packet_count; } /* store max packet size value before filtering */ maxp_old = xfer->max_packet_size; /* filter "wMaxPacketSize" according to HC capabilities */ if ((xfer->max_packet_size > parm->hc_max_packet_size) || (xfer->max_packet_size == 0)) { xfer->max_packet_size = parm->hc_max_packet_size; } /* filter "wMaxPacketSize" according to standard sizes */ usbd_get_std_packet_size(&std_size, type, parm->speed); if (std_size.range.min || std_size.range.max) { if (xfer->max_packet_size < std_size.range.min) { xfer->max_packet_size = std_size.range.min; } if (xfer->max_packet_size > std_size.range.max) { xfer->max_packet_size = std_size.range.max; } } else { if (xfer->max_packet_size >= std_size.fixed[3]) { xfer->max_packet_size = std_size.fixed[3]; } else if (xfer->max_packet_size >= std_size.fixed[2]) { xfer->max_packet_size = std_size.fixed[2]; } else if (xfer->max_packet_size >= std_size.fixed[1]) { xfer->max_packet_size = std_size.fixed[1]; } else { /* only one possibility left */ xfer->max_packet_size = std_size.fixed[0]; } } /* * Check if the max packet size was outside its allowed range * and clamped to a valid value: */ if (maxp_old != xfer->max_packet_size) xfer->flags_int.maxp_was_clamped = 1; /* compute "max_frame_size" */ usbd_update_max_frame_size(xfer); /* check interrupt interval and transfer pre-delay */ if (type == UE_ISOCHRONOUS) { uint16_t frame_limit; xfer->interval = 0; /* not used, must be zero */ xfer->flags_int.isochronous_xfr = 1; /* set flag */ if (xfer->timeout == 0) { /* * set a default timeout in * case something goes wrong! 
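				 * (1000 / 4 evaluates to a 250
				 * millisecond default.)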
*/ xfer->timeout = 1000 / 4; } switch (parm->speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: frame_limit = USB_MAX_FS_ISOC_FRAMES_PER_XFER; xfer->fps_shift = 0; break; default: frame_limit = USB_MAX_HS_ISOC_FRAMES_PER_XFER; xfer->fps_shift = edesc->bInterval; if (xfer->fps_shift > 0) xfer->fps_shift--; if (xfer->fps_shift > 3) xfer->fps_shift = 3; if (xfer->flags.pre_scale_frames != 0) xfer->nframes <<= (3 - xfer->fps_shift); break; } if (xfer->nframes > frame_limit) { /* * this is not going to work * cross hardware */ parm->err = USB_ERR_INVAL; goto done; } if (xfer->nframes == 0) { /* * this is not a valid value */ parm->err = USB_ERR_ZERO_NFRAMES; goto done; } } else { /* * If a value is specified use that else check the * endpoint descriptor! */ if (type == UE_INTERRUPT) { uint32_t temp; if (xfer->interval == 0) { xfer->interval = edesc->bInterval; switch (parm->speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: break; default: /* 125us -> 1ms */ if (xfer->interval < 4) xfer->interval = 1; else if (xfer->interval > 16) xfer->interval = (1 << (16 - 4)); else xfer->interval = (1 << (xfer->interval - 4)); break; } } if (xfer->interval == 0) { /* * One millisecond is the smallest * interval we support: */ xfer->interval = 1; } xfer->fps_shift = 0; temp = 1; while ((temp != 0) && (temp < xfer->interval)) { xfer->fps_shift++; temp *= 2; } switch (parm->speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: break; default: xfer->fps_shift += 3; break; } } } /* * NOTE: we do not allow "max_packet_size" or "max_frame_size" * to be equal to zero when setting up USB transfers, hence * this leads to a lot of extra code in the USB kernel. */ if ((xfer->max_frame_size == 0) || (xfer->max_packet_size == 0)) { zmps = 1; if ((parm->bufsize <= MIN_PKT) && (type != UE_CONTROL) && (type != UE_BULK)) { /* workaround */ xfer->max_packet_size = MIN_PKT; xfer->max_packet_count = 1; parm->bufsize = 0; /* automatic setup length */ usbd_update_max_frame_size(xfer); } else { parm->err = USB_ERR_ZERO_MAXP; goto done; } } else { zmps = 0; } /* * check if we should setup a default * length: */ if (parm->bufsize == 0) { parm->bufsize = xfer->max_frame_size; if (type == UE_ISOCHRONOUS) { parm->bufsize *= xfer->nframes; } } /* * check if we are about to setup a proxy * type of buffer: */ if (xfer->flags.proxy_buffer) { /* round bufsize up */ parm->bufsize += (xfer->max_frame_size - 1); if (parm->bufsize < xfer->max_frame_size) { /* length wrapped around */ parm->err = USB_ERR_INVAL; goto done; } /* subtract remainder */ parm->bufsize -= (parm->bufsize % xfer->max_frame_size); /* add length of USB device request structure, if any */ if (type == UE_CONTROL) { parm->bufsize += REQ_SIZE; /* SETUP message */ } } xfer->max_data_length = parm->bufsize; /* Setup "n_frlengths" and "n_frbuffers" */ if (type == UE_ISOCHRONOUS) { n_frlengths = xfer->nframes; n_frbuffers = 1; } else { if (type == UE_CONTROL) { xfer->flags_int.control_xfr = 1; if (xfer->nframes == 0) { if (parm->bufsize <= REQ_SIZE) { /* * there will never be any data * stage */ xfer->nframes = 1; } else { xfer->nframes = 2; } } } else { if (xfer->nframes == 0) { xfer->nframes = 1; } } n_frlengths = xfer->nframes; n_frbuffers = xfer->nframes; } /* * check if we have room for the * USB device request structure: */ if (type == UE_CONTROL) { if (xfer->max_data_length < REQ_SIZE) { /* length wrapped around or too small bufsize */ parm->err = USB_ERR_INVAL; goto done; } xfer->max_data_length -= REQ_SIZE; } /* * Setup "frlengths" and shadow "frlengths" for keeping the * 
initial frame lengths when a USB transfer is complete. This * information is useful when computing isochronous offsets. */ xfer->frlengths = parm->xfer_length_ptr; parm->xfer_length_ptr += 2 * n_frlengths; /* setup "frbuffers" */ xfer->frbuffers = parm->xfer_page_cache_ptr; parm->xfer_page_cache_ptr += n_frbuffers; /* initialize max frame count */ xfer->max_frame_count = xfer->nframes; /* * check if we need to setup * a local buffer: */ if (!xfer->flags.ext_buffer) { #if USB_HAVE_BUSDMA struct usb_page_search page_info; struct usb_page_cache *pc; if (usbd_transfer_setup_sub_malloc(parm, &pc, parm->bufsize, 1, 1)) { parm->err = USB_ERR_NOMEM; } else if (parm->buf != NULL) { usbd_get_page(pc, 0, &page_info); xfer->local_buffer = page_info.buffer; usbd_xfer_set_frame_offset(xfer, 0, 0); if ((type == UE_CONTROL) && (n_frbuffers > 1)) { usbd_xfer_set_frame_offset(xfer, REQ_SIZE, 1); } } #else /* align data */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); if (parm->buf != NULL) { xfer->local_buffer = USB_ADD_BYTES(parm->buf, parm->size[0]); usbd_xfer_set_frame_offset(xfer, 0, 0); if ((type == UE_CONTROL) && (n_frbuffers > 1)) { usbd_xfer_set_frame_offset(xfer, REQ_SIZE, 1); } } parm->size[0] += parm->bufsize; /* align data again */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); #endif } /* * Compute maximum buffer size */ if (parm->bufsize_max < parm->bufsize) { parm->bufsize_max = parm->bufsize; } #if USB_HAVE_BUSDMA if (xfer->flags_int.bdma_enable) { /* * Setup "dma_page_ptr". * * Proof for formula below: * * Assume there are three USB frames having length "a", "b" and * "c". These USB frames will at maximum need "z" * "usb_page" structures. "z" is given by: * * z = ((a / USB_PAGE_SIZE) + 2) + ((b / USB_PAGE_SIZE) + 2) + * ((c / USB_PAGE_SIZE) + 2); * * Constraining "a", "b" and "c" like this: * * (a + b + c) <= parm->bufsize * * We know that: * * z <= ((parm->bufsize / USB_PAGE_SIZE) + (3*2)); * * Here is the general formula: */ xfer->dma_page_ptr = parm->dma_page_ptr; parm->dma_page_ptr += (2 * n_frbuffers); parm->dma_page_ptr += (parm->bufsize / USB_PAGE_SIZE); } #endif if (zmps) { /* correct maximum data length */ xfer->max_data_length = 0; } /* subtract USB frame remainder from "hc_max_frame_size" */ xfer->max_hc_frame_size = (parm->hc_max_frame_size - (parm->hc_max_frame_size % xfer->max_frame_size)); if (xfer->max_hc_frame_size == 0) { parm->err = USB_ERR_INVAL; goto done; } /* initialize frame buffers */ if (parm->buf) { for (x = 0; x != n_frbuffers; x++) { xfer->frbuffers[x].tag_parent = &xfer->xroot->dma_parent_tag; #if USB_HAVE_BUSDMA if (xfer->flags_int.bdma_enable && (parm->bufsize_max > 0)) { if (usb_pc_dmamap_create( xfer->frbuffers + x, parm->bufsize_max)) { parm->err = USB_ERR_NOMEM; goto done; } } #endif } } done: if (parm->err) { /* * Set some dummy values so that we avoid division by zero: */ xfer->max_hc_frame_size = 1; xfer->max_frame_size = 1; xfer->max_packet_size = 1; xfer->max_data_length = 0; xfer->nframes = 0; xfer->max_frame_count = 0; } } static uint8_t usbd_transfer_setup_has_bulk(const struct usb_config *setup_start, uint16_t n_setup) { while (n_setup--) { uint8_t type = setup_start[n_setup].type; if (type == UE_BULK || type == UE_BULK_INTR || type == UE_TYPE_ANY) return (1); } return (0); } /*------------------------------------------------------------------------* * usbd_transfer_setup - setup an array of USB transfers * * NOTE: You must always call "usbd_transfer_unsetup" after calling * "usbd_transfer_setup" if success was 
returned. * * The idea is that the USB device driver should pre-allocate all its * transfers by one call to this function. * * Return values: * 0: Success * Else: Failure *------------------------------------------------------------------------*/ usb_error_t usbd_transfer_setup(struct usb_device *udev, const uint8_t *ifaces, struct usb_xfer **ppxfer, const struct usb_config *setup_start, uint16_t n_setup, void *priv_sc, struct mtx *xfer_mtx) { const struct usb_config *setup_end = setup_start + n_setup; const struct usb_config *setup; struct usb_setup_params *parm; struct usb_endpoint *ep; struct usb_xfer_root *info; struct usb_xfer *xfer; void *buf = NULL; usb_error_t error = 0; uint16_t n; uint16_t refcount; uint8_t do_unlock; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "usbd_transfer_setup can sleep!"); /* do some checking first */ if (n_setup == 0) { DPRINTFN(6, "setup array has zero length!\n"); return (USB_ERR_INVAL); } if (ifaces == NULL) { DPRINTFN(6, "ifaces array is NULL!\n"); return (USB_ERR_INVAL); } if (xfer_mtx == NULL) { DPRINTFN(6, "using global lock\n"); xfer_mtx = &Giant; } /* more sanity checks */ for (setup = setup_start, n = 0; setup != setup_end; setup++, n++) { if (setup->bufsize == (usb_frlength_t)-1) { error = USB_ERR_BAD_BUFSIZE; DPRINTF("invalid bufsize\n"); } if (setup->callback == NULL) { error = USB_ERR_NO_CALLBACK; DPRINTF("no callback\n"); } ppxfer[n] = NULL; } if (error) return (error); /* Protect scratch area */ do_unlock = usbd_ctrl_lock(udev); refcount = 0; info = NULL; parm = &udev->scratch.xfer_setup[0].parm; memset(parm, 0, sizeof(*parm)); parm->udev = udev; parm->speed = usbd_get_speed(udev); parm->hc_max_packet_count = 1; if (parm->speed >= USB_SPEED_MAX) { parm->err = USB_ERR_INVAL; goto done; } /* setup all transfers */ while (1) { if (buf) { /* * Initialize the "usb_xfer_root" structure, * which is common for all our USB transfers. */ info = USB_ADD_BYTES(buf, 0); info->memory_base = buf; info->memory_size = parm->size[0]; #if USB_HAVE_BUSDMA info->dma_page_cache_start = USB_ADD_BYTES(buf, parm->size[4]); info->dma_page_cache_end = USB_ADD_BYTES(buf, parm->size[5]); #endif info->xfer_page_cache_start = USB_ADD_BYTES(buf, parm->size[5]); info->xfer_page_cache_end = USB_ADD_BYTES(buf, parm->size[2]); cv_init(&info->cv_drain, "WDRAIN"); info->xfer_mtx = xfer_mtx; #if USB_HAVE_BUSDMA usb_dma_tag_setup(&info->dma_parent_tag, parm->dma_tag_p, udev->bus->dma_parent_tag[0].tag, xfer_mtx, &usb_bdma_done_event, udev->bus->dma_bits, parm->dma_tag_max); #endif info->bus = udev->bus; info->udev = udev; TAILQ_INIT(&info->done_q.head); info->done_q.command = &usbd_callback_wrapper; #if USB_HAVE_BUSDMA TAILQ_INIT(&info->dma_q.head); info->dma_q.command = &usb_bdma_work_loop; #endif info->done_m[0].hdr.pm_callback = &usb_callback_proc; info->done_m[0].xroot = info; info->done_m[1].hdr.pm_callback = &usb_callback_proc; info->done_m[1].xroot = info; /* * In device side mode control endpoint * requests need to run from a separate * context, else there is a chance of * deadlock! 
*/ - if (setup_start == usb_control_ep_cfg) + if (setup_start == usb_control_ep_cfg || + setup_start == usb_control_ep_quirk_cfg) info->done_p = USB_BUS_CONTROL_XFER_PROC(udev->bus); else if (xfer_mtx == &Giant) info->done_p = USB_BUS_GIANT_PROC(udev->bus); else if (usbd_transfer_setup_has_bulk(setup_start, n_setup)) info->done_p = USB_BUS_NON_GIANT_BULK_PROC(udev->bus); else info->done_p = USB_BUS_NON_GIANT_ISOC_PROC(udev->bus); } /* reset sizes */ parm->size[0] = 0; parm->buf = buf; parm->size[0] += sizeof(info[0]); for (setup = setup_start, n = 0; setup != setup_end; setup++, n++) { /* skip USB transfers without callbacks: */ if (setup->callback == NULL) { continue; } /* see if there is a matching endpoint */ ep = usbd_get_endpoint(udev, ifaces[setup->if_index], setup); /* * Check that the USB PIPE is valid and that * the endpoint mode is proper. * * Make sure we don't allocate a streams * transfer when such a combination is not * valid. */ if ((ep == NULL) || (ep->methods == NULL) || ((ep->ep_mode != USB_EP_MODE_STREAMS) && (ep->ep_mode != USB_EP_MODE_DEFAULT)) || (setup->stream_id != 0 && (setup->stream_id >= USB_MAX_EP_STREAMS || (ep->ep_mode != USB_EP_MODE_STREAMS)))) { if (setup->flags.no_pipe_ok) continue; if ((setup->usb_mode != USB_MODE_DUAL) && (setup->usb_mode != udev->flags.usb_mode)) continue; parm->err = USB_ERR_NO_PIPE; goto done; } /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); /* store current setup pointer */ parm->curr_setup = setup; if (buf) { /* * Common initialization of the * "usb_xfer" structure. */ xfer = USB_ADD_BYTES(buf, parm->size[0]); xfer->address = udev->address; xfer->priv_sc = priv_sc; xfer->xroot = info; usb_callout_init_mtx(&xfer->timeout_handle, &udev->bus->bus_mtx, 0); } else { /* * Setup a dummy xfer, hence we are * writing to the "usb_xfer" * structure pointed to by "xfer" * before we have allocated any * memory: */ xfer = &udev->scratch.xfer_setup[0].dummy; memset(xfer, 0, sizeof(*xfer)); refcount++; } /* set transfer endpoint pointer */ xfer->endpoint = ep; /* set transfer stream ID */ xfer->stream_id = setup->stream_id; parm->size[0] += sizeof(xfer[0]); parm->methods = xfer->endpoint->methods; parm->curr_xfer = xfer; /* * Call the Host or Device controller transfer * setup routine: */ (udev->bus->methods->xfer_setup) (parm); /* check for error */ if (parm->err) goto done; if (buf) { /* * Increment the endpoint refcount. This * basically prevents setting a new * configuration and alternate setting * when USB transfers are in use on * the given interface. Search the USB * code for "endpoint->refcount_alloc" if you * want more information. */ USB_BUS_LOCK(info->bus); if (xfer->endpoint->refcount_alloc >= USB_EP_REF_MAX) parm->err = USB_ERR_INVAL; xfer->endpoint->refcount_alloc++; if (xfer->endpoint->refcount_alloc == 0) panic("usbd_transfer_setup(): Refcount wrapped to zero\n"); USB_BUS_UNLOCK(info->bus); /* * Whenever we set ppxfer[] then we * also need to increment the * "setup_refcount": */ info->setup_refcount++; /* * Transfer is successfully setup and * can be used: */ ppxfer[n] = xfer; } /* check for error */ if (parm->err) goto done; } if (buf != NULL || parm->err != 0) goto done; /* if no transfers, nothing to do */ if (refcount == 0) goto done; /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); /* store offset temporarily */ parm->size[1] = parm->size[0]; /* * The number of DMA tags required depends on * the number of endpoints. 
The current estimate * for maximum number of DMA tags per endpoint * is three: * 1) for loading memory * 2) for allocating memory * 3) for fixing memory [UHCI] */ parm->dma_tag_max += 3 * MIN(n_setup, USB_EP_MAX); /* * DMA tags for QH, TD, Data and more. */ parm->dma_tag_max += 8; parm->dma_tag_p += parm->dma_tag_max; parm->size[0] += ((uint8_t *)parm->dma_tag_p) - ((uint8_t *)0); /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); /* store offset temporarily */ parm->size[3] = parm->size[0]; parm->size[0] += ((uint8_t *)parm->dma_page_ptr) - ((uint8_t *)0); /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); /* store offset temporarily */ parm->size[4] = parm->size[0]; parm->size[0] += ((uint8_t *)parm->dma_page_cache_ptr) - ((uint8_t *)0); /* store end offset temporarily */ parm->size[5] = parm->size[0]; parm->size[0] += ((uint8_t *)parm->xfer_page_cache_ptr) - ((uint8_t *)0); /* store end offset temporarily */ parm->size[2] = parm->size[0]; /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); parm->size[6] = parm->size[0]; parm->size[0] += ((uint8_t *)parm->xfer_length_ptr) - ((uint8_t *)0); /* align data properly */ parm->size[0] += ((-parm->size[0]) & (USB_HOST_ALIGN - 1)); /* allocate zeroed memory */ buf = malloc(parm->size[0], M_USB, M_WAITOK | M_ZERO); if (buf == NULL) { parm->err = USB_ERR_NOMEM; DPRINTFN(0, "cannot allocate memory block for " "configuration (%d bytes)\n", parm->size[0]); goto done; } parm->dma_tag_p = USB_ADD_BYTES(buf, parm->size[1]); parm->dma_page_ptr = USB_ADD_BYTES(buf, parm->size[3]); parm->dma_page_cache_ptr = USB_ADD_BYTES(buf, parm->size[4]); parm->xfer_page_cache_ptr = USB_ADD_BYTES(buf, parm->size[5]); parm->xfer_length_ptr = USB_ADD_BYTES(buf, parm->size[6]); } done: if (buf) { if (info->setup_refcount == 0) { /* * "usbd_transfer_unsetup_sub" will unlock * the bus mutex before returning ! */ USB_BUS_LOCK(info->bus); /* something went wrong */ usbd_transfer_unsetup_sub(info, 0); } } /* check if any errors happened */ if (parm->err) usbd_transfer_unsetup(ppxfer, n_setup); error = parm->err; if (do_unlock) usbd_ctrl_unlock(udev); return (error); } /*------------------------------------------------------------------------* * usbd_transfer_unsetup_sub - factored out code *------------------------------------------------------------------------*/ static void usbd_transfer_unsetup_sub(struct usb_xfer_root *info, uint8_t needs_delay) { #if USB_HAVE_BUSDMA struct usb_page_cache *pc; #endif USB_BUS_LOCK_ASSERT(info->bus, MA_OWNED); /* wait for any outstanding DMA operations */ if (needs_delay) { usb_timeout_t temp; temp = usbd_get_dma_delay(info->udev); if (temp != 0) { usb_pause_mtx(&info->bus->bus_mtx, USB_MS_TO_TICKS(temp)); } } /* make sure that our done messages are not queued anywhere */ usb_proc_mwait(info->done_p, &info->done_m[0], &info->done_m[1]); USB_BUS_UNLOCK(info->bus); #if USB_HAVE_BUSDMA /* free DMA'able memory, if any */ pc = info->dma_page_cache_start; while (pc != info->dma_page_cache_end) { usb_pc_free_mem(pc); pc++; } /* free DMA maps in all "xfer->frbuffers" */ pc = info->xfer_page_cache_start; while (pc != info->xfer_page_cache_end) { usb_pc_dmamap_destroy(pc); pc++; } /* free all DMA tags */ usb_dma_tag_unsetup(&info->dma_parent_tag); #endif cv_destroy(&info->cv_drain); /* * free the "memory_base" last, hence the "info" structure is * contained within the "memory_base"! 
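	 *
	 * The layout of that single allocation, as computed through
	 * parm->size[] in usbd_transfer_setup(), is roughly:
	 *
	 *	[usb_xfer_root][usb_xfer ...][DMA tags][usb_page ...]
	 *	[DMA page caches][xfer page caches][frame lengths]
	 *
	 * so freeing "memory_base" releases everything at once.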
*/ free(info->memory_base, M_USB); } /*------------------------------------------------------------------------* * usbd_transfer_unsetup - unsetup/free an array of USB transfers * * NOTE: All USB transfers in progress will get called back passing * the error code "USB_ERR_CANCELLED" before this function * returns. *------------------------------------------------------------------------*/ void usbd_transfer_unsetup(struct usb_xfer **pxfer, uint16_t n_setup) { struct usb_xfer *xfer; struct usb_xfer_root *info; uint8_t needs_delay = 0; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "usbd_transfer_unsetup can sleep!"); while (n_setup--) { xfer = pxfer[n_setup]; if (xfer == NULL) continue; info = xfer->xroot; USB_XFER_LOCK(xfer); USB_BUS_LOCK(info->bus); /* * HINT: when you start/stop a transfer, it might be a * good idea to directly use the "pxfer[]" structure: * * usbd_transfer_start(sc->pxfer[0]); * usbd_transfer_stop(sc->pxfer[0]); * * That way, if your code has many parts that will not * stop running under the same lock, in other words * "xfer_mtx", the usbd_transfer_start and * usbd_transfer_stop functions will simply return * when they detect a NULL pointer argument. * * To avoid any races we clear the "pxfer[]" pointer * while holding the private mutex of the driver: */ pxfer[n_setup] = NULL; USB_BUS_UNLOCK(info->bus); USB_XFER_UNLOCK(xfer); usbd_transfer_drain(xfer); #if USB_HAVE_BUSDMA if (xfer->flags_int.bdma_enable) needs_delay = 1; #endif /* * NOTE: default endpoint does not have an * interface, even if endpoint->iface_index == 0 */ USB_BUS_LOCK(info->bus); xfer->endpoint->refcount_alloc--; USB_BUS_UNLOCK(info->bus); usb_callout_drain(&xfer->timeout_handle); USB_BUS_LOCK(info->bus); USB_ASSERT(info->setup_refcount != 0, ("Invalid setup " "reference count\n")); info->setup_refcount--; if (info->setup_refcount == 0) { usbd_transfer_unsetup_sub(info, needs_delay); } else { USB_BUS_UNLOCK(info->bus); } } } /*------------------------------------------------------------------------* * usbd_control_transfer_init - factored out code * * In USB Device Mode we have to wait for the SETUP packet which * containst the "struct usb_device_request" structure, before we can * transfer any data. In USB Host Mode we already have the SETUP * packet at the moment the USB transfer is started. This leads us to * having to setup the USB transfer at two different places in * time. This function just contains factored out control transfer * initialisation code, so that we don't duplicate the code. *------------------------------------------------------------------------*/ static void usbd_control_transfer_init(struct usb_xfer *xfer) { struct usb_device_request req; /* copy out the USB request header */ usbd_copy_out(xfer->frbuffers, 0, &req, sizeof(req)); /* setup remainder */ xfer->flags_int.control_rem = UGETW(req.wLength); /* copy direction to endpoint variable */ xfer->endpointno &= ~(UE_DIR_IN | UE_DIR_OUT); xfer->endpointno |= (req.bmRequestType & UT_READ) ? UE_DIR_IN : UE_DIR_OUT; } /*------------------------------------------------------------------------* * usbd_control_transfer_did_data * * This function returns non-zero if a control endpoint has * transferred the first DATA packet after the SETUP packet. * Else it returns zero. 
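 *
 * The check relies on "control_rem": it starts out equal to the
 * wLength field of the SETUP packet and is reduced as DATA stage
 * bytes are scheduled, so any difference between the two means at
 * least one DATA packet has already passed.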
*------------------------------------------------------------------------*/ static uint8_t usbd_control_transfer_did_data(struct usb_xfer *xfer) { struct usb_device_request req; /* SETUP packet is not yet sent */ if (xfer->flags_int.control_hdr != 0) return (0); /* copy out the USB request header */ usbd_copy_out(xfer->frbuffers, 0, &req, sizeof(req)); /* compare remainder to the initial value */ return (xfer->flags_int.control_rem != UGETW(req.wLength)); } /*------------------------------------------------------------------------* * usbd_setup_ctrl_transfer * * This function handles initialisation of control transfers. Control * transfers are special in that regard that they can both transmit * and receive data. * * Return values: * 0: Success * Else: Failure *------------------------------------------------------------------------*/ static int usbd_setup_ctrl_transfer(struct usb_xfer *xfer) { usb_frlength_t len; /* Check for control endpoint stall */ if (xfer->flags.stall_pipe && xfer->flags_int.control_act) { /* the control transfer is no longer active */ xfer->flags_int.control_stall = 1; xfer->flags_int.control_act = 0; } else { /* don't stall control transfer by default */ xfer->flags_int.control_stall = 0; } /* Check for invalid number of frames */ if (xfer->nframes > 2) { /* * If you need to split a control transfer, you * have to do one part at a time. Only with * non-control transfers you can do multiple * parts a time. */ DPRINTFN(0, "Too many frames: %u\n", (unsigned int)xfer->nframes); goto error; } /* * Check if there is a control * transfer in progress: */ if (xfer->flags_int.control_act) { if (xfer->flags_int.control_hdr) { /* clear send header flag */ xfer->flags_int.control_hdr = 0; /* setup control transfer */ if (xfer->flags_int.usb_mode == USB_MODE_DEVICE) { usbd_control_transfer_init(xfer); } } /* get data length */ len = xfer->sumlen; } else { /* the size of the SETUP structure is hardcoded ! */ if (xfer->frlengths[0] != sizeof(struct usb_device_request)) { DPRINTFN(0, "Wrong framelength %u != %zu\n", xfer->frlengths[0], sizeof(struct usb_device_request)); goto error; } /* check USB mode */ if (xfer->flags_int.usb_mode == USB_MODE_DEVICE) { /* check number of frames */ if (xfer->nframes != 1) { /* * We need to receive the setup * message first so that we know the * data direction! */ DPRINTF("Misconfigured transfer\n"); goto error; } /* * Set a dummy "control_rem" value. This * variable will be overwritten later by a * call to "usbd_control_transfer_init()" ! 
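			 * (0xFFFF is the largest value a 16-bit
			 * wLength can carry, so the dummy acts as an
			 * upper bound until the SETUP packet arrives.)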
*/ xfer->flags_int.control_rem = 0xFFFF; } else { /* setup "endpoint" and "control_rem" */ usbd_control_transfer_init(xfer); } /* set transfer-header flag */ xfer->flags_int.control_hdr = 1; /* get data length */ len = (xfer->sumlen - sizeof(struct usb_device_request)); } /* update did data flag */ xfer->flags_int.control_did_data = usbd_control_transfer_did_data(xfer); /* check if there is a length mismatch */ if (len > xfer->flags_int.control_rem) { DPRINTFN(0, "Length (%d) greater than " "remaining length (%d)\n", len, xfer->flags_int.control_rem); goto error; } /* check if we are doing a short transfer */ if (xfer->flags.force_short_xfer) { xfer->flags_int.control_rem = 0; } else { if ((len != xfer->max_data_length) && (len != xfer->flags_int.control_rem) && (xfer->nframes != 1)) { DPRINTFN(0, "Short control transfer without " "force_short_xfer set\n"); goto error; } xfer->flags_int.control_rem -= len; } /* the status part is executed when "control_act" is 0 */ if ((xfer->flags_int.control_rem > 0) || (xfer->flags.manual_status)) { /* don't execute the STATUS stage yet */ xfer->flags_int.control_act = 1; /* sanity check */ if ((!xfer->flags_int.control_hdr) && (xfer->nframes == 1)) { /* * This is not a valid operation! */ DPRINTFN(0, "Invalid parameter " "combination\n"); goto error; } } else { /* time to execute the STATUS stage */ xfer->flags_int.control_act = 0; } return (0); /* success */ error: return (1); /* failure */ } /*------------------------------------------------------------------------* * usbd_transfer_submit - start USB hardware for the given transfer * * This function should only be called from the USB callback. *------------------------------------------------------------------------*/ void usbd_transfer_submit(struct usb_xfer *xfer) { struct usb_xfer_root *info; struct usb_bus *bus; usb_frcount_t x; info = xfer->xroot; bus = info->bus; DPRINTF("xfer=%p, endpoint=%p, nframes=%d, dir=%s\n", xfer, xfer->endpoint, xfer->nframes, USB_GET_DATA_ISREAD(xfer) ? "read" : "write"); #ifdef USB_DEBUG if (USB_DEBUG_VAR > 0) { USB_BUS_LOCK(bus); usb_dump_endpoint(xfer->endpoint); USB_BUS_UNLOCK(bus); } #endif USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); USB_BUS_LOCK_ASSERT(bus, MA_NOTOWNED); /* Only open the USB transfer once! */ if (!xfer->flags_int.open) { xfer->flags_int.open = 1; DPRINTF("open\n"); USB_BUS_LOCK(bus); (xfer->endpoint->methods->open) (xfer); USB_BUS_UNLOCK(bus); } /* set "transferring" flag */ xfer->flags_int.transferring = 1; #if USB_HAVE_POWERD /* increment power reference */ usbd_transfer_power_ref(xfer, 1); #endif /* * Check if the transfer is waiting on a queue, most * frequently the "done_q": */ if (xfer->wait_queue) { USB_BUS_LOCK(bus); usbd_transfer_dequeue(xfer); USB_BUS_UNLOCK(bus); } /* clear "did_dma_delay" flag */ xfer->flags_int.did_dma_delay = 0; /* clear "did_close" flag */ xfer->flags_int.did_close = 0; #if USB_HAVE_BUSDMA /* clear "bdma_setup" flag */ xfer->flags_int.bdma_setup = 0; #endif /* by default we cannot cancel any USB transfer immediately */ xfer->flags_int.can_cancel_immed = 0; /* clear lengths and frame counts by default */ xfer->sumlen = 0; xfer->actlen = 0; xfer->aframes = 0; /* clear any previous errors */ xfer->error = 0; /* Check if the device is still alive */ if (info->udev->state < USB_STATE_POWERED) { USB_BUS_LOCK(bus); /* * Must return cancelled error code else * device drivers can hang. 
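		 * Drivers conventionally treat USB_ERR_CANCELLED as
		 * final and do not resubmit, while most other error
		 * codes are retried from the callback.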
*/ usbd_transfer_done(xfer, USB_ERR_CANCELLED); USB_BUS_UNLOCK(bus); return; } /* sanity check */ if (xfer->nframes == 0) { if (xfer->flags.stall_pipe) { /* * Special case - want to stall without transferring * any data: */ DPRINTF("xfer=%p nframes=0: stall " "or clear stall!\n", xfer); USB_BUS_LOCK(bus); xfer->flags_int.can_cancel_immed = 1; /* start the transfer */ usb_command_wrapper(&xfer->endpoint-> endpoint_q[xfer->stream_id], xfer); USB_BUS_UNLOCK(bus); return; } USB_BUS_LOCK(bus); usbd_transfer_done(xfer, USB_ERR_INVAL); USB_BUS_UNLOCK(bus); return; } /* compute some variables */ for (x = 0; x != xfer->nframes; x++) { /* make a copy of the frlenghts[] */ xfer->frlengths[x + xfer->max_frame_count] = xfer->frlengths[x]; /* compute total transfer length */ xfer->sumlen += xfer->frlengths[x]; if (xfer->sumlen < xfer->frlengths[x]) { /* length wrapped around */ USB_BUS_LOCK(bus); usbd_transfer_done(xfer, USB_ERR_INVAL); USB_BUS_UNLOCK(bus); return; } } /* clear some internal flags */ xfer->flags_int.short_xfer_ok = 0; xfer->flags_int.short_frames_ok = 0; /* check if this is a control transfer */ if (xfer->flags_int.control_xfr) { if (usbd_setup_ctrl_transfer(xfer)) { USB_BUS_LOCK(bus); usbd_transfer_done(xfer, USB_ERR_STALLED); USB_BUS_UNLOCK(bus); return; } } /* * Setup filtered version of some transfer flags, * in case of data read direction */ if (USB_GET_DATA_ISREAD(xfer)) { if (xfer->flags.short_frames_ok) { xfer->flags_int.short_xfer_ok = 1; xfer->flags_int.short_frames_ok = 1; } else if (xfer->flags.short_xfer_ok) { xfer->flags_int.short_xfer_ok = 1; /* check for control transfer */ if (xfer->flags_int.control_xfr) { /* * 1) Control transfers do not support * reception of multiple short USB * frames in host mode and device side * mode, with exception of: * * 2) Due to sometimes buggy device * side firmware we need to do a * STATUS stage in case of short * control transfers in USB host mode. * The STATUS stage then becomes the * "alt_next" to the DATA stage. */ xfer->flags_int.short_frames_ok = 1; } } } /* * Check if BUS-DMA support is enabled and try to load virtual * buffers into DMA, if any: */ #if USB_HAVE_BUSDMA if (xfer->flags_int.bdma_enable) { /* insert the USB transfer last in the BUS-DMA queue */ usb_command_wrapper(&xfer->xroot->dma_q, xfer); return; } #endif /* * Enter the USB transfer into the Host Controller or * Device Controller schedule: */ usbd_pipe_enter(xfer); } /*------------------------------------------------------------------------* * usbd_pipe_enter - factored out code *------------------------------------------------------------------------*/ void usbd_pipe_enter(struct usb_xfer *xfer) { struct usb_endpoint *ep; USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); USB_BUS_LOCK(xfer->xroot->bus); ep = xfer->endpoint; DPRINTF("enter\n"); /* the transfer can now be cancelled */ xfer->flags_int.can_cancel_immed = 1; /* enter the transfer */ (ep->methods->enter) (xfer); /* check for transfer error */ if (xfer->error) { /* some error has happened */ usbd_transfer_done(xfer, 0); USB_BUS_UNLOCK(xfer->xroot->bus); return; } /* start the transfer */ usb_command_wrapper(&ep->endpoint_q[xfer->stream_id], xfer); USB_BUS_UNLOCK(xfer->xroot->bus); } /*------------------------------------------------------------------------* * usbd_transfer_start - start an USB transfer * * NOTE: Calling this function more than one time will only * result in a single transfer start, until the USB transfer * completes. 
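 *
 * Minimal usage sketch (the softc layout and transfer index are
 * hypothetical); the mutex given to usbd_transfer_setup() must be
 * held, as asserted below:
 *
 *	mtx_lock(&sc->sc_mtx);
 *	usbd_transfer_start(sc->sc_xfer[0]);
 *	mtx_unlock(&sc->sc_mtx);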
*------------------------------------------------------------------------*/ void usbd_transfer_start(struct usb_xfer *xfer) { if (xfer == NULL) { /* transfer is gone */ return; } USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); /* mark the USB transfer started */ if (!xfer->flags_int.started) { /* lock the BUS lock to avoid races updating flags_int */ USB_BUS_LOCK(xfer->xroot->bus); xfer->flags_int.started = 1; USB_BUS_UNLOCK(xfer->xroot->bus); } /* check if the USB transfer callback is already transferring */ if (xfer->flags_int.transferring) { return; } USB_BUS_LOCK(xfer->xroot->bus); /* call the USB transfer callback */ usbd_callback_ss_done_defer(xfer); USB_BUS_UNLOCK(xfer->xroot->bus); } /*------------------------------------------------------------------------* * usbd_transfer_stop - stop an USB transfer * * NOTE: Calling this function more than one time will only * result in a single transfer stop. * NOTE: When this function returns it is not safe to free nor * reuse any DMA buffers. See "usbd_transfer_drain()". *------------------------------------------------------------------------*/ void usbd_transfer_stop(struct usb_xfer *xfer) { struct usb_endpoint *ep; if (xfer == NULL) { /* transfer is gone */ return; } USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); /* check if the USB transfer was ever opened */ if (!xfer->flags_int.open) { if (xfer->flags_int.started) { /* nothing to do except clearing the "started" flag */ /* lock the BUS lock to avoid races updating flags_int */ USB_BUS_LOCK(xfer->xroot->bus); xfer->flags_int.started = 0; USB_BUS_UNLOCK(xfer->xroot->bus); } return; } /* try to stop the current USB transfer */ USB_BUS_LOCK(xfer->xroot->bus); /* override any previous error */ xfer->error = USB_ERR_CANCELLED; /* * Clear "open" and "started" when both private and USB lock * is locked so that we don't get a race updating "flags_int" */ xfer->flags_int.open = 0; xfer->flags_int.started = 0; /* * Check if we can cancel the USB transfer immediately. */ if (xfer->flags_int.transferring) { if (xfer->flags_int.can_cancel_immed && (!xfer->flags_int.did_close)) { DPRINTF("close\n"); /* * The following will lead to an USB_ERR_CANCELLED * error code being passed to the USB callback. */ (xfer->endpoint->methods->close) (xfer); /* only close once */ xfer->flags_int.did_close = 1; } else { /* need to wait for the next done callback */ } } else { DPRINTF("close\n"); /* close here and now */ (xfer->endpoint->methods->close) (xfer); /* * Any additional DMA delay is done by * "usbd_transfer_unsetup()". */ /* * Special case. Check if we need to restart a blocked * endpoint. */ ep = xfer->endpoint; /* * If the current USB transfer is completing we need * to start the next one: */ if (ep->endpoint_q[xfer->stream_id].curr == xfer) { usb_command_wrapper( &ep->endpoint_q[xfer->stream_id], NULL); } } USB_BUS_UNLOCK(xfer->xroot->bus); } /*------------------------------------------------------------------------* * usbd_transfer_pending * * This function will check if an USB transfer is pending which is a * little bit complicated! * Return values: * 0: Not pending * 1: Pending: The USB transfer will receive a callback in the future. 
*------------------------------------------------------------------------*/ uint8_t usbd_transfer_pending(struct usb_xfer *xfer) { struct usb_xfer_root *info; struct usb_xfer_queue *pq; if (xfer == NULL) { /* transfer is gone */ return (0); } USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); if (xfer->flags_int.transferring) { /* trivial case */ return (1); } USB_BUS_LOCK(xfer->xroot->bus); if (xfer->wait_queue) { /* we are waiting on a queue somewhere */ USB_BUS_UNLOCK(xfer->xroot->bus); return (1); } info = xfer->xroot; pq = &info->done_q; if (pq->curr == xfer) { /* we are currently scheduled for callback */ USB_BUS_UNLOCK(xfer->xroot->bus); return (1); } /* we are not pending */ USB_BUS_UNLOCK(xfer->xroot->bus); return (0); } /*------------------------------------------------------------------------* * usbd_transfer_drain * * This function will stop the USB transfer and wait for any * additional BUS-DMA and HW-DMA operations to complete. Buffers that * are loaded into DMA can safely be freed or reused after that this * function has returned. *------------------------------------------------------------------------*/ void usbd_transfer_drain(struct usb_xfer *xfer) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "usbd_transfer_drain can sleep!"); if (xfer == NULL) { /* transfer is gone */ return; } if (xfer->xroot->xfer_mtx != &Giant) { USB_XFER_LOCK_ASSERT(xfer, MA_NOTOWNED); } USB_XFER_LOCK(xfer); usbd_transfer_stop(xfer); while (usbd_transfer_pending(xfer) || xfer->flags_int.doing_callback) { /* * It is allowed that the callback can drop its * transfer mutex. In that case checking only * "usbd_transfer_pending()" is not enough to tell if * the USB transfer is fully drained. We also need to * check the internal "doing_callback" flag. */ xfer->flags_int.draining = 1; /* * Wait until the current outstanding USB * transfer is complete ! */ cv_wait(&xfer->xroot->cv_drain, xfer->xroot->xfer_mtx); } USB_XFER_UNLOCK(xfer); } struct usb_page_cache * usbd_xfer_get_frame(struct usb_xfer *xfer, usb_frcount_t frindex) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); return (&xfer->frbuffers[frindex]); } void * usbd_xfer_get_frame_buffer(struct usb_xfer *xfer, usb_frcount_t frindex) { struct usb_page_search page_info; KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); usbd_get_page(&xfer->frbuffers[frindex], 0, &page_info); return (page_info.buffer); } /*------------------------------------------------------------------------* * usbd_xfer_get_fps_shift * * The following function is only useful for isochronous transfers. It * returns how many times the frame execution rate has been shifted * down. * * Return value: * Success: 0..3 * Failure: 0 *------------------------------------------------------------------------*/ uint8_t usbd_xfer_get_fps_shift(struct usb_xfer *xfer) { return (xfer->fps_shift); } usb_frlength_t usbd_xfer_frame_len(struct usb_xfer *xfer, usb_frcount_t frindex) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); return (xfer->frlengths[frindex]); } /*------------------------------------------------------------------------* * usbd_xfer_set_frame_data * * This function sets the pointer of the buffer that should * loaded directly into DMA for the given USB frame. Passing "ptr" * equal to NULL while the corresponding "frlength" is greater * than zero gives undefined results! 
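 *
 * Illustrative use from a transfer callback (the buffer name is
 * assumed and the transfer is expected to use an external buffer):
 *
 *	usbd_xfer_set_frame_data(xfer, 0, sc->sc_rx_buf,
 *	    usbd_xfer_max_len(xfer));
 *	usbd_transfer_submit(xfer);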
*------------------------------------------------------------------------*/ void usbd_xfer_set_frame_data(struct usb_xfer *xfer, usb_frcount_t frindex, void *ptr, usb_frlength_t len) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); /* set virtual address to load and length */ xfer->frbuffers[frindex].buffer = ptr; usbd_xfer_set_frame_len(xfer, frindex, len); } void usbd_xfer_frame_data(struct usb_xfer *xfer, usb_frcount_t frindex, void **ptr, int *len) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); if (ptr != NULL) *ptr = xfer->frbuffers[frindex].buffer; if (len != NULL) *len = xfer->frlengths[frindex]; } /*------------------------------------------------------------------------* * usbd_xfer_old_frame_length * * This function returns the framelength of the given frame at the * time the transfer was submitted. This function can be used to * compute the starting data pointer of the next isochronous frame * when an isochronous transfer has completed. *------------------------------------------------------------------------*/ usb_frlength_t usbd_xfer_old_frame_length(struct usb_xfer *xfer, usb_frcount_t frindex) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); return (xfer->frlengths[frindex + xfer->max_frame_count]); } void usbd_xfer_status(struct usb_xfer *xfer, int *actlen, int *sumlen, int *aframes, int *nframes) { if (actlen != NULL) *actlen = xfer->actlen; if (sumlen != NULL) *sumlen = xfer->sumlen; if (aframes != NULL) *aframes = xfer->aframes; if (nframes != NULL) *nframes = xfer->nframes; } /*------------------------------------------------------------------------* * usbd_xfer_set_frame_offset * * This function sets the frame data buffer offset relative to the beginning * of the USB DMA buffer allocated for this USB transfer. *------------------------------------------------------------------------*/ void usbd_xfer_set_frame_offset(struct usb_xfer *xfer, usb_frlength_t offset, usb_frcount_t frindex) { KASSERT(!xfer->flags.ext_buffer, ("Cannot offset data frame " "when the USB buffer is external\n")); KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); /* set virtual address to load */ xfer->frbuffers[frindex].buffer = USB_ADD_BYTES(xfer->local_buffer, offset); } void usbd_xfer_set_interval(struct usb_xfer *xfer, int i) { xfer->interval = i; } void usbd_xfer_set_timeout(struct usb_xfer *xfer, int t) { xfer->timeout = t; } void usbd_xfer_set_frames(struct usb_xfer *xfer, usb_frcount_t n) { xfer->nframes = n; } usb_frcount_t usbd_xfer_max_frames(struct usb_xfer *xfer) { return (xfer->max_frame_count); } usb_frlength_t usbd_xfer_max_len(struct usb_xfer *xfer) { return (xfer->max_data_length); } usb_frlength_t usbd_xfer_max_framelen(struct usb_xfer *xfer) { return (xfer->max_frame_size); } void usbd_xfer_set_frame_len(struct usb_xfer *xfer, usb_frcount_t frindex, usb_frlength_t len) { KASSERT(frindex < xfer->max_frame_count, ("frame index overflow")); xfer->frlengths[frindex] = len; } /*------------------------------------------------------------------------* * usb_callback_proc - factored out code * * This function performs USB callbacks. 
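 *
 * The message is delivered with the bus mutex held; the code below
 * drops that mutex and re-acquires it after taking the per-transfer
 * mutex, so the locks end up in the usual xfer-mutex-before-bus-mutex
 * order.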
*------------------------------------------------------------------------*/ static void usb_callback_proc(struct usb_proc_msg *_pm) { struct usb_done_msg *pm = (void *)_pm; struct usb_xfer_root *info = pm->xroot; /* Change locking order */ USB_BUS_UNLOCK(info->bus); /* * We exploit the fact that the mutex is the same for all * callbacks that will be called from this thread: */ USB_MTX_LOCK(info->xfer_mtx); USB_BUS_LOCK(info->bus); /* Continue where we lost track */ usb_command_wrapper(&info->done_q, info->done_q.curr); USB_MTX_UNLOCK(info->xfer_mtx); } /*------------------------------------------------------------------------* * usbd_callback_ss_done_defer * * This function will defer the start, stop and done callback to the * correct thread. *------------------------------------------------------------------------*/ static void usbd_callback_ss_done_defer(struct usb_xfer *xfer) { struct usb_xfer_root *info = xfer->xroot; struct usb_xfer_queue *pq = &info->done_q; USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); if (pq->curr != xfer) { usbd_transfer_enqueue(pq, xfer); } if (!pq->recurse_1) { /* * We have to postpone the callback due to the fact we * will have a Lock Order Reversal, LOR, if we try to * proceed ! */ (void) usb_proc_msignal(info->done_p, &info->done_m[0], &info->done_m[1]); } else { /* clear second recurse flag */ pq->recurse_2 = 0; } return; } /*------------------------------------------------------------------------* * usbd_callback_wrapper * * This is a wrapper for USB callbacks. This wrapper does some * auto-magic things like figuring out if we can call the callback * directly from the current context or if we need to wakeup the * interrupt process. *------------------------------------------------------------------------*/ static void usbd_callback_wrapper(struct usb_xfer_queue *pq) { struct usb_xfer *xfer = pq->curr; struct usb_xfer_root *info = xfer->xroot; USB_BUS_LOCK_ASSERT(info->bus, MA_OWNED); if ((pq->recurse_3 != 0 || mtx_owned(info->xfer_mtx) == 0) && USB_IN_POLLING_MODE_FUNC() == 0) { /* * Cases that end up here: * * 5) HW interrupt done callback or other source. * 6) HW completed transfer during callback */ DPRINTFN(3, "case 5 and 6\n"); /* * We have to postpone the callback due to the fact we * will have a Lock Order Reversal, LOR, if we try to * proceed! * * Postponing the callback also ensures that other USB * transfer queues get a chance. 
*/ (void) usb_proc_msignal(info->done_p, &info->done_m[0], &info->done_m[1]); return; } /* * Cases that end up here: * * 1) We are starting a transfer * 2) We are prematurely calling back a transfer * 3) We are stopping a transfer * 4) We are doing an ordinary callback */ DPRINTFN(3, "case 1-4\n"); /* get next USB transfer in the queue */ info->done_q.curr = NULL; /* set flag in case of drain */ xfer->flags_int.doing_callback = 1; USB_BUS_UNLOCK(info->bus); USB_BUS_LOCK_ASSERT(info->bus, MA_NOTOWNED); /* set correct USB state for callback */ if (!xfer->flags_int.transferring) { xfer->usb_state = USB_ST_SETUP; if (!xfer->flags_int.started) { /* we got stopped before we even got started */ USB_BUS_LOCK(info->bus); goto done; } } else { if (usbd_callback_wrapper_sub(xfer)) { /* the callback has been deferred */ USB_BUS_LOCK(info->bus); goto done; } #if USB_HAVE_POWERD /* decrement power reference */ usbd_transfer_power_ref(xfer, -1); #endif xfer->flags_int.transferring = 0; if (xfer->error) { xfer->usb_state = USB_ST_ERROR; } else { /* set transferred state */ xfer->usb_state = USB_ST_TRANSFERRED; #if USB_HAVE_BUSDMA /* sync DMA memory, if any */ if (xfer->flags_int.bdma_enable && (!xfer->flags_int.bdma_no_post_sync)) { usb_bdma_post_sync(xfer); } #endif } } #if USB_HAVE_PF if (xfer->usb_state != USB_ST_SETUP) { USB_BUS_LOCK(info->bus); usbpf_xfertap(xfer, USBPF_XFERTAP_DONE); USB_BUS_UNLOCK(info->bus); } #endif /* call processing routine */ (xfer->callback) (xfer, xfer->error); /* pickup the USB mutex again */ USB_BUS_LOCK(info->bus); /* * Check if we got started after that we got cancelled, but * before we managed to do the callback. */ if ((!xfer->flags_int.open) && (xfer->flags_int.started) && (xfer->usb_state == USB_ST_ERROR)) { /* clear flag in case of drain */ xfer->flags_int.doing_callback = 0; /* try to loop, but not recursivly */ usb_command_wrapper(&info->done_q, xfer); return; } done: /* clear flag in case of drain */ xfer->flags_int.doing_callback = 0; /* * Check if we are draining. */ if (xfer->flags_int.draining && (!xfer->flags_int.transferring)) { /* "usbd_transfer_drain()" is waiting for end of transfer */ xfer->flags_int.draining = 0; cv_broadcast(&info->cv_drain); } /* do the next callback, if any */ usb_command_wrapper(&info->done_q, info->done_q.curr); } /*------------------------------------------------------------------------* * usb_dma_delay_done_cb * * This function is called when the DMA delay has been exectuded, and * will make sure that the callback is called to complete the USB * transfer. This code path is usually only used when there is an USB * error like USB_ERR_CANCELLED. *------------------------------------------------------------------------*/ void usb_dma_delay_done_cb(struct usb_xfer *xfer) { USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); DPRINTFN(3, "Completed %p\n", xfer); /* queue callback for execution, again */ usbd_transfer_done(xfer, 0); } /*------------------------------------------------------------------------* * usbd_transfer_dequeue * * - This function is used to remove an USB transfer from a USB * transfer queue. * * - This function can be called multiple times in a row. 
*------------------------------------------------------------------------*/ void usbd_transfer_dequeue(struct usb_xfer *xfer) { struct usb_xfer_queue *pq; pq = xfer->wait_queue; if (pq) { TAILQ_REMOVE(&pq->head, xfer, wait_entry); xfer->wait_queue = NULL; } } /*------------------------------------------------------------------------* * usbd_transfer_enqueue * * - This function is used to insert an USB transfer into a USB * * transfer queue. * * - This function can be called multiple times in a row. *------------------------------------------------------------------------*/ void usbd_transfer_enqueue(struct usb_xfer_queue *pq, struct usb_xfer *xfer) { /* * Insert the USB transfer into the queue, if it is not * already on a USB transfer queue: */ if (xfer->wait_queue == NULL) { xfer->wait_queue = pq; TAILQ_INSERT_TAIL(&pq->head, xfer, wait_entry); } } /*------------------------------------------------------------------------* * usbd_transfer_done * * - This function is used to remove an USB transfer from the busdma, * pipe or interrupt queue. * * - This function is used to queue the USB transfer on the done * queue. * * - This function is used to stop any USB transfer timeouts. *------------------------------------------------------------------------*/ void usbd_transfer_done(struct usb_xfer *xfer, usb_error_t error) { struct usb_xfer_root *info = xfer->xroot; USB_BUS_LOCK_ASSERT(info->bus, MA_OWNED); DPRINTF("err=%s\n", usbd_errstr(error)); /* * If we are not transferring then just return. * This can happen during transfer cancel. */ if (!xfer->flags_int.transferring) { DPRINTF("not transferring\n"); /* end of control transfer, if any */ xfer->flags_int.control_act = 0; return; } /* only set transfer error, if not already set */ if (xfer->error == USB_ERR_NORMAL_COMPLETION) xfer->error = error; /* stop any callouts */ usb_callout_stop(&xfer->timeout_handle); /* * If we are waiting on a queue, just remove the USB transfer * from the queue, if any. We should have the required locks * locked to do the remove when this function is called. */ usbd_transfer_dequeue(xfer); #if USB_HAVE_BUSDMA if (mtx_owned(info->xfer_mtx)) { struct usb_xfer_queue *pq; /* * If the private USB lock is not locked, then we assume * that the BUS-DMA load stage has been passed: */ pq = &info->dma_q; if (pq->curr == xfer) { /* start the next BUS-DMA load, if any */ usb_command_wrapper(pq, NULL); } } #endif /* keep some statistics */ if (xfer->error) { info->bus->stats_err.uds_requests [xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE]++; } else { info->bus->stats_ok.uds_requests [xfer->endpoint->edesc->bmAttributes & UE_XFERTYPE]++; } /* call the USB transfer callback */ usbd_callback_ss_done_defer(xfer); } /*------------------------------------------------------------------------* * usbd_transfer_start_cb * * This function is called to start the USB transfer when * "xfer->interval" is greater than zero, and and the endpoint type is * BULK or CONTROL. 
*------------------------------------------------------------------------*/ static void usbd_transfer_start_cb(void *arg) { struct usb_xfer *xfer = arg; struct usb_endpoint *ep = xfer->endpoint; USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); DPRINTF("start\n"); #if USB_HAVE_PF usbpf_xfertap(xfer, USBPF_XFERTAP_SUBMIT); #endif /* the transfer can now be cancelled */ xfer->flags_int.can_cancel_immed = 1; /* start USB transfer, if no error */ if (xfer->error == 0) (ep->methods->start) (xfer); /* check for transfer error */ if (xfer->error) { /* some error has happened */ usbd_transfer_done(xfer, 0); } } /*------------------------------------------------------------------------* * usbd_xfer_set_stall * * This function is used to set the stall flag outside the * callback. This function is NULL safe. *------------------------------------------------------------------------*/ void usbd_xfer_set_stall(struct usb_xfer *xfer) { if (xfer == NULL) { /* tearing down */ return; } USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); /* avoid any races by locking the USB mutex */ USB_BUS_LOCK(xfer->xroot->bus); xfer->flags.stall_pipe = 1; USB_BUS_UNLOCK(xfer->xroot->bus); } int usbd_xfer_is_stalled(struct usb_xfer *xfer) { return (xfer->endpoint->is_stalled); } /*------------------------------------------------------------------------* * usbd_transfer_clear_stall * * This function is used to clear the stall flag outside the * callback. This function is NULL safe. *------------------------------------------------------------------------*/ void usbd_transfer_clear_stall(struct usb_xfer *xfer) { if (xfer == NULL) { /* tearing down */ return; } USB_XFER_LOCK_ASSERT(xfer, MA_OWNED); /* avoid any races by locking the USB mutex */ USB_BUS_LOCK(xfer->xroot->bus); xfer->flags.stall_pipe = 0; USB_BUS_UNLOCK(xfer->xroot->bus); } /*------------------------------------------------------------------------* * usbd_pipe_start * * This function is used to add an USB transfer to the pipe transfer list. *------------------------------------------------------------------------*/ void usbd_pipe_start(struct usb_xfer_queue *pq) { struct usb_endpoint *ep; struct usb_xfer *xfer; uint8_t type; xfer = pq->curr; ep = xfer->endpoint; USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); /* * If the endpoint is already stalled we do nothing ! */ if (ep->is_stalled) { return; } /* * Check if we are supposed to stall the endpoint: */ if (xfer->flags.stall_pipe) { struct usb_device *udev; struct usb_xfer_root *info; /* clear stall command */ xfer->flags.stall_pipe = 0; /* get pointer to USB device */ info = xfer->xroot; udev = info->udev; /* * Only stall BULK and INTERRUPT endpoints. */ type = (ep->edesc->bmAttributes & UE_XFERTYPE); if ((type == UE_BULK) || (type == UE_INTERRUPT)) { uint8_t did_stall; did_stall = 1; if (udev->flags.usb_mode == USB_MODE_DEVICE) { (udev->bus->methods->set_stall) ( udev, ep, &did_stall); } else if (udev->ctrl_xfer[1]) { info = udev->ctrl_xfer[1]->xroot; usb_proc_msignal( USB_BUS_CS_PROC(info->bus), &udev->cs_msg[0], &udev->cs_msg[1]); } else { /* should not happen */ DPRINTFN(0, "No stall handler\n"); } /* * Check if we should stall. Some USB hardware * handles set- and clear-stall in hardware. */ if (did_stall) { /* * The transfer will be continued when * the clear-stall control endpoint * message is received. */ ep->is_stalled = 1; return; } } else if (type == UE_ISOCHRONOUS) { /* * Make sure any FIFO overflow or other FIFO * error conditions go away by resetting the * endpoint FIFO through the clear stall * method. 
*/ if (udev->flags.usb_mode == USB_MODE_DEVICE) { (udev->bus->methods->clear_stall) (udev, ep); } } } /* Set or clear stall complete - special case */ if (xfer->nframes == 0) { /* we are complete */ xfer->aframes = 0; usbd_transfer_done(xfer, 0); return; } /* * Handled cases: * * 1) Start the first transfer queued. * * 2) Re-start the current USB transfer. */ /* * Check if there should be any * pre transfer start delay: */ if (xfer->interval > 0) { type = (ep->edesc->bmAttributes & UE_XFERTYPE); if ((type == UE_BULK) || (type == UE_CONTROL)) { usbd_transfer_timeout_ms(xfer, &usbd_transfer_start_cb, xfer->interval); return; } } DPRINTF("start\n"); #if USB_HAVE_PF usbpf_xfertap(xfer, USBPF_XFERTAP_SUBMIT); #endif /* the transfer can now be cancelled */ xfer->flags_int.can_cancel_immed = 1; /* start USB transfer, if no error */ if (xfer->error == 0) (ep->methods->start) (xfer); /* check for transfer error */ if (xfer->error) { /* some error has happened */ usbd_transfer_done(xfer, 0); } } /*------------------------------------------------------------------------* * usbd_transfer_timeout_ms * * This function is used to setup a timeout on the given USB * transfer. If the timeout has been deferred the callback given by * "cb" will get called after "ms" milliseconds. *------------------------------------------------------------------------*/ void usbd_transfer_timeout_ms(struct usb_xfer *xfer, void (*cb) (void *arg), usb_timeout_t ms) { USB_BUS_LOCK_ASSERT(xfer->xroot->bus, MA_OWNED); /* defer delay */ usb_callout_reset(&xfer->timeout_handle, USB_MS_TO_TICKS(ms) + USB_CALLOUT_ZERO_TICKS, cb, xfer); } /*------------------------------------------------------------------------* * usbd_callback_wrapper_sub * * - This function will update variables in an USB transfer after * that the USB transfer is complete. * * - This function is used to start the next USB transfer on the * ep transfer queue, if any. * * NOTE: In some special cases the USB transfer will not be removed from * the pipe queue, but remain first. To enforce USB transfer removal call * this function passing the error code "USB_ERR_CANCELLED". * * Return values: * 0: Success. * Else: The callback has been deferred. *------------------------------------------------------------------------*/ static uint8_t usbd_callback_wrapper_sub(struct usb_xfer *xfer) { struct usb_endpoint *ep; struct usb_bus *bus; usb_frcount_t x; bus = xfer->xroot->bus; if ((!xfer->flags_int.open) && (!xfer->flags_int.did_close)) { DPRINTF("close\n"); USB_BUS_LOCK(bus); (xfer->endpoint->methods->close) (xfer); USB_BUS_UNLOCK(bus); /* only close once */ xfer->flags_int.did_close = 1; return (1); /* wait for new callback */ } /* * If we have a non-hardware induced error we * need to do the DMA delay! */ if (xfer->error != 0 && !xfer->flags_int.did_dma_delay && (xfer->error == USB_ERR_CANCELLED || xfer->error == USB_ERR_TIMEOUT || bus->methods->start_dma_delay != NULL)) { usb_timeout_t temp; /* only delay once */ xfer->flags_int.did_dma_delay = 1; /* we can not cancel this delay */ xfer->flags_int.can_cancel_immed = 0; temp = usbd_get_dma_delay(xfer->xroot->udev); DPRINTFN(3, "DMA delay, %u ms, " "on %p\n", temp, xfer); if (temp != 0) { USB_BUS_LOCK(bus); /* * Some hardware solutions have dedicated * events when it is safe to free DMA'ed * memory. For the other hardware platforms we * use a static delay. 
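			 * The static delay used here is the value
			 * obtained from usbd_get_dma_delay() above,
			 * already rounded up to whole milliseconds.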
*/ if (bus->methods->start_dma_delay != NULL) { (bus->methods->start_dma_delay) (xfer); } else { usbd_transfer_timeout_ms(xfer, (void (*)(void *))&usb_dma_delay_done_cb, temp); } USB_BUS_UNLOCK(bus); return (1); /* wait for new callback */ } } /* check actual number of frames */ if (xfer->aframes > xfer->nframes) { if (xfer->error == 0) { panic("%s: actual number of frames, %d, is " "greater than initial number of frames, %d\n", __FUNCTION__, xfer->aframes, xfer->nframes); } else { /* just set some valid value */ xfer->aframes = xfer->nframes; } } /* compute actual length */ xfer->actlen = 0; for (x = 0; x != xfer->aframes; x++) { xfer->actlen += xfer->frlengths[x]; } /* * Frames that were not transferred get zero actual length in * case the USB device driver does not check the actual number * of frames transferred, "xfer->aframes": */ for (; x < xfer->nframes; x++) { usbd_xfer_set_frame_len(xfer, x, 0); } /* check actual length */ if (xfer->actlen > xfer->sumlen) { if (xfer->error == 0) { panic("%s: actual length, %d, is greater than " "initial length, %d\n", __FUNCTION__, xfer->actlen, xfer->sumlen); } else { /* just set some valid value */ xfer->actlen = xfer->sumlen; } } DPRINTFN(1, "xfer=%p endpoint=%p sts=%d alen=%d, slen=%d, afrm=%d, nfrm=%d\n", xfer, xfer->endpoint, xfer->error, xfer->actlen, xfer->sumlen, xfer->aframes, xfer->nframes); if (xfer->error) { /* end of control transfer, if any */ xfer->flags_int.control_act = 0; #if USB_HAVE_TT_SUPPORT switch (xfer->error) { case USB_ERR_NORMAL_COMPLETION: case USB_ERR_SHORT_XFER: case USB_ERR_STALLED: case USB_ERR_CANCELLED: /* nothing to do */ break; default: /* try to reset the TT, if any */ USB_BUS_LOCK(bus); uhub_tt_buffer_reset_async_locked(xfer->xroot->udev, xfer->endpoint); USB_BUS_UNLOCK(bus); break; } #endif /* check if we should block the execution queue */ if ((xfer->error != USB_ERR_CANCELLED) && (xfer->flags.pipe_bof)) { DPRINTFN(2, "xfer=%p: Block On Failure " "on endpoint=%p\n", xfer, xfer->endpoint); goto done; } } else { /* check for short transfers */ if (xfer->actlen < xfer->sumlen) { /* end of control transfer, if any */ xfer->flags_int.control_act = 0; if (!xfer->flags_int.short_xfer_ok) { xfer->error = USB_ERR_SHORT_XFER; if (xfer->flags.pipe_bof) { DPRINTFN(2, "xfer=%p: Block On Failure on " "Short Transfer on endpoint %p.\n", xfer, xfer->endpoint); goto done; } } } else { /* * Check if we are in the middle of a * control transfer: */ if (xfer->flags_int.control_act) { DPRINTFN(5, "xfer=%p: Control transfer " "active on endpoint=%p\n", xfer, xfer->endpoint); goto done; } } } ep = xfer->endpoint; /* * If the current USB transfer is completing we need to start the * next one: */ USB_BUS_LOCK(bus); if (ep->endpoint_q[xfer->stream_id].curr == xfer) { usb_command_wrapper(&ep->endpoint_q[xfer->stream_id], NULL); if (ep->endpoint_q[xfer->stream_id].curr != NULL || TAILQ_FIRST(&ep->endpoint_q[xfer->stream_id].head) != NULL) { /* there is another USB transfer waiting */ } else { /* this is the last USB transfer */ /* clear isochronous sync flag */ xfer->endpoint->is_synced = 0; } } USB_BUS_UNLOCK(bus); done: return (0); } /*------------------------------------------------------------------------* * usb_command_wrapper * * This function is used to execute commands non-recursivly on an USB * transfer. *------------------------------------------------------------------------*/ void usb_command_wrapper(struct usb_xfer_queue *pq, struct usb_xfer *xfer) { if (xfer) { /* * If the transfer is not already processing, * queue it! 
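 * If another transfer is already being processed on this queue the
 * new one is left on the list and we return.  Otherwise the loop
 * below runs commands with "recurse_1" set; a nested call into this
 * function merely clears "recurse_2", which makes the outer loop
 * iterate once more instead of recursing.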
*/ if (pq->curr != xfer) { usbd_transfer_enqueue(pq, xfer); if (pq->curr != NULL) { /* something is already processing */ DPRINTFN(6, "busy %p\n", pq->curr); return; } } } else { /* Get next element in queue */ pq->curr = NULL; } if (!pq->recurse_1) { /* clear third recurse flag */ pq->recurse_3 = 0; do { /* set two first recurse flags */ pq->recurse_1 = 1; pq->recurse_2 = 1; if (pq->curr == NULL) { xfer = TAILQ_FIRST(&pq->head); if (xfer) { TAILQ_REMOVE(&pq->head, xfer, wait_entry); xfer->wait_queue = NULL; pq->curr = xfer; } else { break; } } DPRINTFN(6, "cb %p (enter)\n", pq->curr); (pq->command) (pq); DPRINTFN(6, "cb %p (leave)\n", pq->curr); /* * Set third recurse flag to indicate * recursion happened: */ pq->recurse_3 = 1; } while (!pq->recurse_2); /* clear first recurse flag */ pq->recurse_1 = 0; } else { /* clear second recurse flag */ pq->recurse_2 = 0; } } /*------------------------------------------------------------------------* * usbd_ctrl_transfer_setup * * This function is used to setup the default USB control endpoint * transfer. *------------------------------------------------------------------------*/ void usbd_ctrl_transfer_setup(struct usb_device *udev) { struct usb_xfer *xfer; uint8_t no_resetup; uint8_t iface_index; /* check for root HUB */ if (udev->parent_hub == NULL) return; repeat: xfer = udev->ctrl_xfer[0]; if (xfer) { USB_XFER_LOCK(xfer); no_resetup = ((xfer->address == udev->address) && (udev->ctrl_ep_desc.wMaxPacketSize[0] == udev->ddesc.bMaxPacketSize)); if (udev->flags.usb_mode == USB_MODE_DEVICE) { if (no_resetup) { /* * NOTE: checking "xfer->address" and * starting the USB transfer must be * atomic! */ usbd_transfer_start(xfer); } } USB_XFER_UNLOCK(xfer); } else { no_resetup = 0; } if (no_resetup) { /* * All parameters are exactly the same like before. * Just return. */ return; } /* * Update wMaxPacketSize for the default control endpoint: */ udev->ctrl_ep_desc.wMaxPacketSize[0] = udev->ddesc.bMaxPacketSize; /* * Unsetup any existing USB transfer: */ usbd_transfer_unsetup(udev->ctrl_xfer, USB_CTRL_XFER_MAX); /* * Reset clear stall error counter. */ udev->clear_stall_errors = 0; /* * Try to setup a new USB transfer for the * default control endpoint: */ iface_index = 0; if (usbd_transfer_setup(udev, &iface_index, - udev->ctrl_xfer, usb_control_ep_cfg, USB_CTRL_XFER_MAX, NULL, + udev->ctrl_xfer, udev->bus->control_ep_quirk ? + usb_control_ep_quirk_cfg : usb_control_ep_cfg, USB_CTRL_XFER_MAX, NULL, &udev->device_mtx)) { DPRINTFN(0, "could not setup default " "USB transfer\n"); } else { goto repeat; } } /*------------------------------------------------------------------------* * usbd_clear_data_toggle - factored out code * * NOTE: the intention of this function is not to reset the hardware * data toggle. *------------------------------------------------------------------------*/ void usbd_clear_stall_locked(struct usb_device *udev, struct usb_endpoint *ep) { USB_BUS_LOCK_ASSERT(udev->bus, MA_OWNED); /* check that we have a valid case */ if (udev->flags.usb_mode == USB_MODE_HOST && udev->parent_hub != NULL && udev->bus->methods->clear_stall != NULL && ep->methods != NULL) { (udev->bus->methods->clear_stall) (udev, ep); } } /*------------------------------------------------------------------------* * usbd_clear_data_toggle - factored out code * * NOTE: the intention of this function is not to reset the hardware * data toggle on the USB device side. 
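 * It resets the host side's expected toggle value ("toggle_next")
 * and, for controllers that cache toggle state in hardware, asks the
 * bus driver to clear that cache via usbd_clear_stall_locked().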
*------------------------------------------------------------------------*/ void usbd_clear_data_toggle(struct usb_device *udev, struct usb_endpoint *ep) { DPRINTFN(5, "udev=%p endpoint=%p\n", udev, ep); USB_BUS_LOCK(udev->bus); ep->toggle_next = 0; /* some hardware needs a callback to clear the data toggle */ usbd_clear_stall_locked(udev, ep); USB_BUS_UNLOCK(udev->bus); } /*------------------------------------------------------------------------* * usbd_clear_stall_callback - factored out clear stall callback * * Input parameters: * xfer1: Clear Stall Control Transfer * xfer2: Stalled USB Transfer * * This function is NULL safe. * * Return values: * 0: In progress * Else: Finished * * Clear stall config example: * * static const struct usb_config my_clearstall = { * .type = UE_CONTROL, * .endpoint = 0, * .direction = UE_DIR_ANY, * .interval = 50, //50 milliseconds * .bufsize = sizeof(struct usb_device_request), * .timeout = 1000, //1.000 seconds * .callback = &my_clear_stall_callback, // ** * .usb_mode = USB_MODE_HOST, * }; * * ** "my_clear_stall_callback" calls "usbd_clear_stall_callback" * passing the correct parameters. *------------------------------------------------------------------------*/ uint8_t usbd_clear_stall_callback(struct usb_xfer *xfer1, struct usb_xfer *xfer2) { struct usb_device_request req; if (xfer2 == NULL) { /* looks like we are tearing down */ DPRINTF("NULL input parameter\n"); return (0); } USB_XFER_LOCK_ASSERT(xfer1, MA_OWNED); USB_XFER_LOCK_ASSERT(xfer2, MA_OWNED); switch (USB_GET_STATE(xfer1)) { case USB_ST_SETUP: /* * pre-clear the data toggle to DATA0 ("umass.c" and * "ata-usb.c" depends on this) */ usbd_clear_data_toggle(xfer2->xroot->udev, xfer2->endpoint); /* setup a clear-stall packet */ req.bmRequestType = UT_WRITE_ENDPOINT; req.bRequest = UR_CLEAR_FEATURE; USETW(req.wValue, UF_ENDPOINT_HALT); req.wIndex[0] = xfer2->endpoint->edesc->bEndpointAddress; req.wIndex[1] = 0; USETW(req.wLength, 0); /* * "usbd_transfer_setup_sub()" will ensure that * we have sufficient room in the buffer for * the request structure! */ /* copy in the transfer */ usbd_copy_in(xfer1->frbuffers, 0, &req, sizeof(req)); /* set length */ xfer1->frlengths[0] = sizeof(req); xfer1->nframes = 1; usbd_transfer_submit(xfer1); return (0); case USB_ST_TRANSFERRED: break; default: /* Error */ if (xfer1->error == USB_ERR_CANCELLED) { return (0); } break; } return (1); /* Clear Stall Finished */ } /*------------------------------------------------------------------------* * usbd_transfer_poll * * The following function gets called from the USB keyboard driver and * UMASS when the system has paniced. * * NOTE: It is currently not possible to resume normal operation on * the USB controller which has been polled, due to clearing of the * "up_dsleep" and "up_msleep" flags. 
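 * The loop below validates each transfer, forcibly drops any bus or
 * transfer mutexes still held, clears the per-process "up_msleep"
 * flags so that cv_signal()/cv_broadcast() are not used, and then
 * polls the controller and the completion callbacks by hand.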
*------------------------------------------------------------------------*/ void usbd_transfer_poll(struct usb_xfer **ppxfer, uint16_t max) { struct usb_xfer *xfer; struct usb_xfer_root *xroot; struct usb_device *udev; struct usb_proc_msg *pm; struct usb_bus *bus; uint16_t n; uint16_t drop_bus_spin; uint16_t drop_bus; uint16_t drop_xfer; for (n = 0; n != max; n++) { /* Extra checks to avoid panic */ xfer = ppxfer[n]; if (xfer == NULL) continue; /* no USB transfer */ xroot = xfer->xroot; if (xroot == NULL) continue; /* no USB root */ udev = xroot->udev; if (udev == NULL) continue; /* no USB device */ bus = udev->bus; if (bus == NULL) continue; /* no BUS structure */ if (bus->methods == NULL) continue; /* no BUS methods */ if (bus->methods->xfer_poll == NULL) continue; /* no poll method */ drop_bus_spin = 0; drop_bus = 0; drop_xfer = 0; if (USB_IN_POLLING_MODE_FUNC() == 0) { /* make sure that the BUS spin mutex is not locked */ while (mtx_owned(&bus->bus_spin_lock)) { mtx_unlock_spin(&bus->bus_spin_lock); drop_bus_spin++; } /* make sure that the BUS mutex is not locked */ while (mtx_owned(&bus->bus_mtx)) { mtx_unlock(&bus->bus_mtx); drop_bus++; } /* make sure that the transfer mutex is not locked */ while (mtx_owned(xroot->xfer_mtx)) { mtx_unlock(xroot->xfer_mtx); drop_xfer++; } } /* Make sure cv_signal() and cv_broadcast() is not called */ USB_BUS_CONTROL_XFER_PROC(bus)->up_msleep = 0; USB_BUS_EXPLORE_PROC(bus)->up_msleep = 0; USB_BUS_GIANT_PROC(bus)->up_msleep = 0; USB_BUS_NON_GIANT_ISOC_PROC(bus)->up_msleep = 0; USB_BUS_NON_GIANT_BULK_PROC(bus)->up_msleep = 0; /* poll USB hardware */ (bus->methods->xfer_poll) (bus); USB_BUS_LOCK(xroot->bus); /* check for clear stall */ if (udev->ctrl_xfer[1] != NULL) { /* poll clear stall start */ pm = &udev->cs_msg[0].hdr; (pm->pm_callback) (pm); /* poll clear stall done thread */ pm = &udev->ctrl_xfer[1]-> xroot->done_m[0].hdr; (pm->pm_callback) (pm); } /* poll done thread */ pm = &xroot->done_m[0].hdr; (pm->pm_callback) (pm); USB_BUS_UNLOCK(xroot->bus); /* restore transfer mutex */ while (drop_xfer--) mtx_lock(xroot->xfer_mtx); /* restore BUS mutex */ while (drop_bus--) mtx_lock(&bus->bus_mtx); /* restore BUS spin mutex */ while (drop_bus_spin--) mtx_lock_spin(&bus->bus_spin_lock); } } static void usbd_get_std_packet_size(struct usb_std_packet_size *ptr, uint8_t type, enum usb_dev_speed speed) { static const uint16_t intr_range_max[USB_SPEED_MAX] = { [USB_SPEED_LOW] = 8, [USB_SPEED_FULL] = 64, [USB_SPEED_HIGH] = 1024, [USB_SPEED_VARIABLE] = 1024, [USB_SPEED_SUPER] = 1024, }; static const uint16_t isoc_range_max[USB_SPEED_MAX] = { [USB_SPEED_LOW] = 0, /* invalid */ [USB_SPEED_FULL] = 1023, [USB_SPEED_HIGH] = 1024, [USB_SPEED_VARIABLE] = 3584, [USB_SPEED_SUPER] = 1024, }; static const uint16_t control_min[USB_SPEED_MAX] = { [USB_SPEED_LOW] = 8, [USB_SPEED_FULL] = 8, [USB_SPEED_HIGH] = 64, [USB_SPEED_VARIABLE] = 512, [USB_SPEED_SUPER] = 512, }; static const uint16_t bulk_min[USB_SPEED_MAX] = { [USB_SPEED_LOW] = 8, [USB_SPEED_FULL] = 8, [USB_SPEED_HIGH] = 512, [USB_SPEED_VARIABLE] = 512, [USB_SPEED_SUPER] = 1024, }; uint16_t temp; memset(ptr, 0, sizeof(*ptr)); switch (type) { case UE_INTERRUPT: ptr->range.max = intr_range_max[speed]; break; case UE_ISOCHRONOUS: ptr->range.max = isoc_range_max[speed]; break; default: if (type == UE_BULK) temp = bulk_min[speed]; else /* UE_CONTROL */ temp = control_min[speed]; /* default is fixed */ ptr->fixed[0] = temp; ptr->fixed[1] = temp; ptr->fixed[2] = temp; ptr->fixed[3] = temp; if (speed == USB_SPEED_FULL) { /* 
multiple sizes */ ptr->fixed[1] = 16; ptr->fixed[2] = 32; ptr->fixed[3] = 64; } if ((speed == USB_SPEED_VARIABLE) && (type == UE_BULK)) { /* multiple sizes */ ptr->fixed[2] = 1024; ptr->fixed[3] = 1536; } break; } } void * usbd_xfer_softc(struct usb_xfer *xfer) { return (xfer->priv_sc); } void * usbd_xfer_get_priv(struct usb_xfer *xfer) { return (xfer->priv_fifo); } void usbd_xfer_set_priv(struct usb_xfer *xfer, void *ptr) { xfer->priv_fifo = ptr; } uint8_t usbd_xfer_state(struct usb_xfer *xfer) { return (xfer->usb_state); } void usbd_xfer_set_flag(struct usb_xfer *xfer, int flag) { switch (flag) { case USB_FORCE_SHORT_XFER: xfer->flags.force_short_xfer = 1; break; case USB_SHORT_XFER_OK: xfer->flags.short_xfer_ok = 1; break; case USB_MULTI_SHORT_OK: xfer->flags.short_frames_ok = 1; break; case USB_MANUAL_STATUS: xfer->flags.manual_status = 1; break; } } void usbd_xfer_clr_flag(struct usb_xfer *xfer, int flag) { switch (flag) { case USB_FORCE_SHORT_XFER: xfer->flags.force_short_xfer = 0; break; case USB_SHORT_XFER_OK: xfer->flags.short_xfer_ok = 0; break; case USB_MULTI_SHORT_OK: xfer->flags.short_frames_ok = 0; break; case USB_MANUAL_STATUS: xfer->flags.manual_status = 0; break; } } /* * The following function returns in milliseconds when the isochronous * transfer was completed by the hardware. The returned value wraps * around 65536 milliseconds. */ uint16_t usbd_xfer_get_timestamp(struct usb_xfer *xfer) { return (xfer->isoc_time_complete); } /* * The following function returns non-zero if the max packet size * field was clamped to a valid value. Else it returns zero. */ uint8_t usbd_xfer_maxp_was_clamped(struct usb_xfer *xfer) { return (xfer->flags_int.maxp_was_clamped); } Index: projects/clang900-import/sys/fs/msdosfs/msdosfs_denode.c =================================================================== --- projects/clang900-import/sys/fs/msdosfs/msdosfs_denode.c (revision 352586) +++ projects/clang900-import/sys/fs/msdosfs/msdosfs_denode.c (revision 352587) @@ -1,616 +1,617 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_denode.c,v 1.28 1998/02/10 14:10:00 mrg Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MSDOSFSNODE, "msdosfs_node", "MSDOSFS vnode private part"); static int de_vncmpf(struct vnode *vp, void *arg) { struct denode *de; uint64_t *a; a = arg; de = VTODE(vp); - return (de->de_inode != *a); + return (de->de_inode != *a) || (de->de_refcnt <= 0); } /* * If deget() succeeds it returns with the gotten denode locked(). * * pmp - address of msdosfsmount structure of the filesystem containing * the denode of interest. The address of * the msdosfsmount structure are used. * dirclust - which cluster bp contains, if dirclust is 0 (root directory) * diroffset is relative to the beginning of the root directory, * otherwise it is cluster relative. * diroffset - offset past begin of cluster of denode we want * depp - returns the address of the gotten denode. */ int deget(struct msdosfsmount *pmp, u_long dirclust, u_long diroffset, struct denode **depp) { int error; uint64_t inode; struct mount *mntp = pmp->pm_mountp; struct direntry *direntptr; struct denode *ldep; struct vnode *nvp, *xvp; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("deget(pmp %p, dirclust %lu, diroffset %lx, depp %p)\n", pmp, dirclust, diroffset, depp); #endif /* * On FAT32 filesystems, root is a (more or less) normal * directory */ if (FAT32(pmp) && dirclust == MSDOSFSROOT) dirclust = pmp->pm_rootdirblk; /* * See if the denode is in the denode cache. Use the location of * the directory entry to compute the hash value. For subdir use * address of "." entry. For root dir (if not FAT32) use cluster * MSDOSFSROOT, offset MSDOSFSROOT_OFS * - * NOTE: The check for de_refcnt > 0 below insures the denode being - * examined does not represent an unlinked but still open file. + * NOTE: de_vncmpf will explicitly skip any denodes that do not have + * a de_refcnt > 0. This insures that that we do not attempt to use + * a denode that represents an unlinked but still open file. * These files are not to be accessible even when the directory * entry that represented the file happens to be reused while the * deleted file is still open. 
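 * The key used below is derived from the entry's location: for
 * example, assuming pm_bpcluster is 4096, an entry at offset 0x40 of
 * cluster 5 hashes as 5 * 4096 + 64 = 20544.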
*/ inode = (uint64_t)pmp->pm_bpcluster * dirclust + diroffset; error = vfs_hash_get(mntp, inode, LK_EXCLUSIVE, curthread, &nvp, de_vncmpf, &inode); if (error) return (error); if (nvp != NULL) { *depp = VTODE(nvp); KASSERT((*depp)->de_dirclust == dirclust, ("wrong dirclust")); KASSERT((*depp)->de_diroffset == diroffset, ("wrong diroffset")); return (0); } ldep = malloc(sizeof(struct denode), M_MSDOSFSNODE, M_WAITOK | M_ZERO); /* * Directory entry was not in cache, have to create a vnode and * copy it from the passed disk buffer. */ /* getnewvnode() does a VREF() on the vnode */ error = getnewvnode("msdosfs", mntp, &msdosfs_vnodeops, &nvp); if (error) { *depp = NULL; free(ldep, M_MSDOSFSNODE); return error; } nvp->v_data = ldep; ldep->de_vnode = nvp; ldep->de_flag = 0; ldep->de_dirclust = dirclust; ldep->de_diroffset = diroffset; ldep->de_inode = inode; lockmgr(nvp->v_vnlock, LK_EXCLUSIVE, NULL); fc_purge(ldep, 0); /* init the FAT cache for this denode */ error = insmntque(nvp, mntp); if (error != 0) { free(ldep, M_MSDOSFSNODE); *depp = NULL; return (error); } error = vfs_hash_insert(nvp, inode, LK_EXCLUSIVE, curthread, &xvp, de_vncmpf, &inode); if (error) { *depp = NULL; return (error); } if (xvp != NULL) { *depp = xvp->v_data; return (0); } ldep->de_pmp = pmp; ldep->de_refcnt = 1; /* * Copy the directory entry into the denode area of the vnode. */ if ((dirclust == MSDOSFSROOT || (FAT32(pmp) && dirclust == pmp->pm_rootdirblk)) && diroffset == MSDOSFSROOT_OFS) { /* * Directory entry for the root directory. There isn't one, * so we manufacture one. We should probably rummage * through the root directory and find a label entry (if it * exists), and then use the time and date from that entry * as the time and date for the root denode. */ nvp->v_vflag |= VV_ROOT; /* should be further down XXX */ ldep->de_Attributes = ATTR_DIRECTORY; ldep->de_LowerCase = 0; if (FAT32(pmp)) ldep->de_StartCluster = pmp->pm_rootdirblk; /* de_FileSize will be filled in further down */ else { ldep->de_StartCluster = MSDOSFSROOT; ldep->de_FileSize = pmp->pm_rootdirsize * DEV_BSIZE; } /* * fill in time and date so that fattime2timespec() doesn't * spit up when called from msdosfs_getattr() with root * denode */ ldep->de_CHun = 0; ldep->de_CTime = 0x0000; /* 00:00:00 */ ldep->de_CDate = (0 << DD_YEAR_SHIFT) | (1 << DD_MONTH_SHIFT) | (1 << DD_DAY_SHIFT); /* Jan 1, 1980 */ ldep->de_ADate = ldep->de_CDate; ldep->de_MTime = ldep->de_CTime; ldep->de_MDate = ldep->de_CDate; /* leave the other fields as garbage */ } else { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) { /* * The denode does not contain anything useful, so * it would be wrong to leave it on its hash chain. * Arrange for vput() to just forget about it. */ ldep->de_Name[0] = SLOT_DELETED; vput(nvp); *depp = NULL; return (error); } (void)DE_INTERNALIZE(ldep, direntptr); brelse(bp); } /* * Fill in a few fields of the vnode and finish filling in the * denode. Then return the address of the found denode. */ if (ldep->de_Attributes & ATTR_DIRECTORY) { /* * Since DOS directory entries that describe directories * have 0 in the filesize field, we take this opportunity * to find out the length of the directory and plug it into * the denode structure. */ u_long size; /* * XXX it sometimes happens that the "." entry has cluster * number 0 when it shouldn't. Use the actual cluster number * instead of what is written in directory entry. 
*/ if (diroffset == 0 && ldep->de_StartCluster != dirclust) { #ifdef MSDOSFS_DEBUG printf("deget(): \".\" entry at clust %lu != %lu\n", dirclust, ldep->de_StartCluster); #endif ldep->de_StartCluster = dirclust; } nvp->v_type = VDIR; if (ldep->de_StartCluster != MSDOSFSROOT) { error = pcbmap(ldep, 0xffff, 0, &size, 0); if (error == E2BIG) { ldep->de_FileSize = de_cn2off(pmp, size); error = 0; } else { #ifdef MSDOSFS_DEBUG printf("deget(): pcbmap returned %d\n", error); #endif } } } else nvp->v_type = VREG; ldep->de_modrev = init_va_filerev(); *depp = ldep; return (0); } int deupdat(struct denode *dep, int waitfor) { struct direntry dir; struct timespec ts; struct buf *bp; struct direntry *dirp; int error; if (DETOV(dep)->v_mount->mnt_flag & MNT_RDONLY) { dep->de_flag &= ~(DE_UPDATE | DE_CREATE | DE_ACCESS | DE_MODIFIED); return (0); } vfs_timestamp(&ts); DETIMES(dep, &ts, &ts, &ts); if ((dep->de_flag & DE_MODIFIED) == 0 && waitfor == 0) return (0); dep->de_flag &= ~DE_MODIFIED; if (DETOV(dep)->v_vflag & VV_ROOT) return (EINVAL); if (dep->de_refcnt <= 0) return (0); error = readde(dep, &bp, &dirp); if (error) return (error); DE_EXTERNALIZE(&dir, dep); if (bcmp(dirp, &dir, sizeof(dir)) == 0) { if (waitfor == 0 || (bp->b_flags & B_DELWRI) == 0) { brelse(bp); return (0); } } else *dirp = dir; if ((DETOV(dep)->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) bp->b_flags |= B_CLUSTEROK; if (waitfor) error = bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) bawrite(bp); else bdwrite(bp); return (error); } /* * Truncate the file described by dep to the length specified by length. */ int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred) { int error; int allerror; u_long eofentry; u_long chaintofree; daddr_t bn; int boff; int isadir = dep->de_Attributes & ATTR_DIRECTORY; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; #ifdef MSDOSFS_DEBUG printf("detrunc(): file %s, length %lu, flags %x\n", dep->de_Name, length, flags); #endif /* * Disallow attempts to truncate the root directory since it is of * fixed size. That's just the way dos filesystems are. We use * the VROOT bit in the vnode because checking for the directory * bit and a startcluster of 0 in the denode is not adequate to * recognize the root directory at this point in a file or * directory's life. */ if ((DETOV(dep)->v_vflag & VV_ROOT) && !FAT32(pmp)) { #ifdef MSDOSFS_DEBUG printf("detrunc(): can't truncate root directory, clust %ld, offset %ld\n", dep->de_dirclust, dep->de_diroffset); #endif return (EINVAL); } if (dep->de_FileSize < length) { vnode_pager_setsize(DETOV(dep), length); return deextend(dep, length, cred); } /* * If the desired length is 0 then remember the starting cluster of * the file and set the StartCluster field in the directory entry * to 0. If the desired length is not zero, then get the number of * the last cluster in the shortened file. Then get the number of * the first cluster in the part of the file that is to be freed. * Then set the next cluster pointer in the last cluster of the * file to CLUST_EOFE. 
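 * For example, truncating a 10000 byte file to 5000 bytes on a
 * filesystem with 4096 byte clusters keeps de_clcount() == 2
 * clusters: pcbmap() returns the second cluster as "eofentry",
 * fatentry() marks it CLUST_EOFE while handing back the old third
 * cluster in "chaintofree", and freeclusterchain() releases the rest
 * of the chain.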
*/ if (length == 0) { chaintofree = dep->de_StartCluster; dep->de_StartCluster = 0; eofentry = ~0; } else { error = pcbmap(dep, de_clcount(pmp, length) - 1, 0, &eofentry, 0); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): pcbmap fails %d\n", error); #endif return (error); } } fc_purge(dep, de_clcount(pmp, length)); /* * If the new length is not a multiple of the cluster size then we * must zero the tail end of the new last cluster in case it * becomes part of the file again because of a seek. */ if ((boff = length & pmp->pm_crbomask) != 0) { if (isadir) { bn = cntobn(pmp, eofentry); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); } else { error = bread(DETOV(dep), de_cluster(pmp, length), pmp->pm_bpcluster, cred, &bp); } if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): bread fails %d\n", error); #endif return (error); } memset(bp->b_data + boff, 0, pmp->pm_bpcluster - boff); if ((flags & IO_SYNC) != 0) bwrite(bp); else bdwrite(bp); } /* * Write out the updated directory entry. Even if the update fails * we free the trailing clusters. */ dep->de_FileSize = length; if (!isadir) dep->de_flag |= DE_UPDATE | DE_MODIFIED; allerror = vtruncbuf(DETOV(dep), length, pmp->pm_bpcluster); #ifdef MSDOSFS_DEBUG if (allerror) printf("detrunc(): vtruncbuf error %d\n", allerror); #endif error = deupdat(dep, !DOINGASYNC((DETOV(dep)))); if (error != 0 && allerror == 0) allerror = error; #ifdef MSDOSFS_DEBUG printf("detrunc(): allerror %d, eofentry %lu\n", allerror, eofentry); #endif /* * If we need to break the cluster chain for the file then do it * now. */ if (eofentry != ~0) { error = fatentry(FAT_GET_AND_SET, pmp, eofentry, &chaintofree, CLUST_EOFE); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): fatentry errors %d\n", error); #endif return (error); } fc_setcache(dep, FC_LASTFC, de_cluster(pmp, length - 1), eofentry); } /* * Now free the clusters removed from the file because of the * truncation. */ if (chaintofree != 0 && !MSDOSFSEOF(pmp, chaintofree)) freeclusterchain(pmp, chaintofree); return (allerror); } /* * Extend the file described by dep to length specified by length. */ int deextend(struct denode *dep, u_long length, struct ucred *cred) { struct msdosfsmount *pmp = dep->de_pmp; u_long count; int error; /* * The root of a DOS filesystem cannot be extended. */ if ((DETOV(dep)->v_vflag & VV_ROOT) && !FAT32(pmp)) return (EINVAL); /* * Directories cannot be extended. */ if (dep->de_Attributes & ATTR_DIRECTORY) return (EISDIR); if (length <= dep->de_FileSize) panic("deextend: file too large"); /* * Compute the number of clusters to allocate. */ count = de_clcount(pmp, length) - de_clcount(pmp, dep->de_FileSize); if (count > 0) { if (count > pmp->pm_freeclustercount) return (ENOSPC); error = extendfile(dep, count, NULL, NULL, DE_CLEAR); if (error) { /* truncate the added clusters away again */ (void) detrunc(dep, dep->de_FileSize, 0, cred); return (error); } } dep->de_FileSize = length; dep->de_flag |= DE_UPDATE | DE_MODIFIED; return (deupdat(dep, !DOINGASYNC(DETOV(dep)))); } /* * Move a denode to its correct hash queue after the file it represents has * been moved to a new directory. */ void reinsert(struct denode *dep) { struct vnode *vp; /* * Fix up the denode cache. If the denode is for a directory, * there is nothing to do since the hash is based on the starting * cluster of the directory file and that hasn't changed. 
If for a * file the hash is based on the location of the directory entry, * so we must remove it from the cache and re-enter it with the * hash based on the new location of the directory entry. */ #if 0 if (dep->de_Attributes & ATTR_DIRECTORY) return; #endif vp = DETOV(dep); dep->de_inode = (uint64_t)dep->de_pmp->pm_bpcluster * dep->de_dirclust + dep->de_diroffset; vfs_hash_rehash(vp, dep->de_inode); } int msdosfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); #ifdef MSDOSFS_DEBUG printf("msdosfs_reclaim(): dep %p, file %s, refcnt %ld\n", dep, dep->de_Name, dep->de_refcnt); #endif /* * Remove the denode from its hash chain. */ vfs_hash_remove(vp); /* * Purge old data structures associated with the denode. */ #if 0 /* XXX */ dep->de_flag = 0; #endif free(dep, M_MSDOSFSNODE); vp->v_data = NULL; return (0); } int msdosfs_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); int error = 0; #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, de_Name[0] %x\n", dep, dep->de_Name[0]); #endif /* * Ignore denodes related to stale file handles. */ if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY) goto out; /* * If the file has been deleted and it is on a read/write * filesystem, then truncate the file, and mark the directory slot * as empty. (This may not be necessary for the dos filesystem.) */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, refcnt %ld, mntflag %llx, MNT_RDONLY %llx\n", dep, dep->de_refcnt, (unsigned long long)vp->v_mount->mnt_flag, (unsigned long long)MNT_RDONLY); #endif if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { error = detrunc(dep, (u_long) 0, 0, NOCRED); dep->de_flag |= DE_UPDATE; dep->de_Name[0] = SLOT_DELETED; } deupdat(dep, 0); out: /* * If we are done with the denode, reclaim it * so that it can be reused immediately. */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): v_usecount %d, de_Name[0] %x\n", vrefcnt(vp), dep->de_Name[0]); #endif if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY) vrecycle(vp); return (error); } Index: projects/clang900-import/sys/kern/imgact_elf.c =================================================================== --- projects/clang900-import/sys/kern/imgact_elf.c (revision 352586) +++ projects/clang900-import/sys/kern/imgact_elf.c (revision 352587) @@ -1,2744 +1,2765 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2017 Dell EMC * Copyright (c) 2000-2001, 2003 David O'Brien * Copyright (c) 1995-1996 Søren Schmidt * Copyright (c) 1996 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ELF_NOTE_ROUNDSIZE 4 #define OLD_EI_BRAND 8 static int __elfN(check_header)(const Elf_Ehdr *hdr); static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry); static int __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot); static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp); static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel); static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel); static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel, uint32_t *fctl0); static vm_prot_t __elfN(trans_prot)(Elf_Word); static Elf_Word __elfN(untrans_prot)(vm_prot_t); SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0, ""); #define CORE_BUF_SIZE (16 * 1024) int __elfN(fallback_brand) = -1; SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, fallback_brand, CTLFLAG_RWTUN, &__elfN(fallback_brand), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort"); static int elf_legacy_coredump = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, &elf_legacy_coredump, 0, "include all and only RW pages in core dumps"); int __elfN(nxstack) = #if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */ || \ (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__) || \ defined(__riscv) 1; #else 0; #endif SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, nxstack, CTLFLAG_RW, &__elfN(nxstack), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack"); #if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__)) int i386_read_exec = 0; SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0, "enable execution from readable segments"); #endif +static u_long __elfN(pie_base) = ET_DYN_LOAD_ADDR; +static int +sysctl_pie_base(SYSCTL_HANDLER_ARGS) +{ + u_long val; + int error; + + val = __elfN(pie_base); + error = sysctl_handle_long(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if ((val & PAGE_MASK) != 0) + return (EINVAL); + __elfN(pie_base) = val; + return (0); +} +SYSCTL_PROC(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, pie_base, + CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, + sysctl_pie_base, "LU", + "PIE load base without randomization"); + 
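/*
 * The handler above exposes the PIE load base as kern.elf64.pie_base
 * (or kern.elf32.pie_base for the 32-bit image activator); values
 * that are not page aligned are rejected with EINVAL.  For example,
 * "sysctl kern.elf64.pie_base=0x10000" (an arbitrary page-aligned
 * value) moves non-randomized PIE executables to that base.
 */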
SYSCTL_NODE(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, aslr, CTLFLAG_RW, 0, ""); #define ASLR_NODE_OID __CONCAT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), _aslr) static int __elfN(aslr_enabled) = 0; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, enable, CTLFLAG_RWTUN, &__elfN(aslr_enabled), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable address map randomization"); static int __elfN(pie_aslr_enabled) = 0; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, pie_enable, CTLFLAG_RWTUN, &__elfN(pie_aslr_enabled), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable address map randomization for PIE binaries"); static int __elfN(aslr_honor_sbrk) = 1; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, honor_sbrk, CTLFLAG_RW, &__elfN(aslr_honor_sbrk), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": assume sbrk is used"); static int __elfN(aslr_stack_gap) = 3; SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, stack_gap, CTLFLAG_RW, &__elfN(aslr_stack_gap), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": maximum percentage of main stack to waste on a random gap"); static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; #define aligned(a, t) (rounddown2((u_long)(a), sizeof(t)) == (u_long)(a)) static const char FREEBSD_ABI_VENDOR[] = "FreeBSD"; Elf_Brandnote __elfN(freebsd_brandnote) = { .hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR), .hdr.n_descsz = sizeof(int32_t), .hdr.n_type = NT_FREEBSD_ABI_TAG, .vendor = FREEBSD_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = __elfN(freebsd_trans_osrel) }; static bool __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel) { uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE); *osrel = *(const int32_t *)(p); return (true); } static const char GNU_ABI_VENDOR[] = "GNU"; static int GNU_KFREEBSD_ABI_DESC = 3; Elf_Brandnote __elfN(kfreebsd_brandnote) = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = kfreebsd_trans_osrel }; static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE); desc = (const Elf32_Word *)p; if (desc[0] != GNU_KFREEBSD_ABI_DESC) return (false); /* * Debian GNU/kFreeBSD embed the earliest compatible kernel version * (__FreeBSD_version: Rxx) in the LSB way. */ *osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3]; return (true); } int __elfN(insert_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == NULL) { elf_brand_list[i] = entry; break; } } if (i == MAX_BRANDS) { printf("WARNING: %s: could not insert brandinfo entry: %p\n", __func__, entry); return (-1); } return (0); } int __elfN(remove_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == entry) { elf_brand_list[i] = NULL; break; } } if (i == MAX_BRANDS) return (-1); return (0); } int __elfN(brand_inuse)(Elf_Brandinfo *entry) { struct proc *p; int rval = FALSE; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_sysent == entry->sysvec) { rval = TRUE; break; } } sx_sunlock(&allproc_lock); return (rval); } static Elf_Brandinfo * __elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel, uint32_t *fctl0) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; Elf_Brandinfo *bi, *bi_m; boolean_t ret; int i, interp_name_len; interp_name_len = interp != NULL ? 
strlen(interp) + 1 : 0; /* * We support four types of branding -- (1) the ELF EI_OSABI field * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string * branding w/in the ELF header, (3) path of the `interp_path' * field, and (4) the ".note.ABI-tag" ELF section. */ /* Look for an ".note.ABI-tag" ELF section */ bi_m = NULL; for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL) continue; if (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0) continue; if (hdr->e_machine == bi->machine && (bi->flags & (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) { ret = __elfN(check_note)(imgp, bi->brand_note, osrel, fctl0); /* Give brand a chance to veto check_note's guess */ if (ret && bi->header_supported) ret = bi->header_supported(imgp); /* * If note checker claimed the binary, but the * interpreter path in the image does not * match default one for the brand, try to * search for other brands with the same * interpreter. Either there is better brand * with the right interpreter, or, failing * this, we return first brand which accepted * our note and, optionally, header. */ if (ret && bi_m == NULL && interp != NULL && (bi->interp_path == NULL || (strlen(bi->interp_path) + 1 != interp_name_len || strncmp(interp, bi->interp_path, interp_name_len) != 0))) { bi_m = bi; ret = 0; } if (ret) return (bi); } } if (bi_m != NULL) return (bi_m); /* If the executable has a brand, search for it in the brand list. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 || (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0)) continue; if (hdr->e_machine == bi->machine && (hdr->e_ident[EI_OSABI] == bi->brand || (bi->compat_3_brand != NULL && strcmp((const char *)&hdr->e_ident[OLD_EI_BRAND], bi->compat_3_brand) == 0))) { /* Looks good, but give brand a chance to veto */ if (bi->header_supported == NULL || bi->header_supported(imgp)) { /* * Again, prefer strictly matching * interpreter path. */ if (interp_name_len == 0 && bi->interp_path == NULL) return (bi); if (bi->interp_path != NULL && strlen(bi->interp_path) + 1 == interp_name_len && strncmp(interp, bi->interp_path, interp_name_len) == 0) return (bi); if (bi_m == NULL) bi_m = bi; } } } if (bi_m != NULL) return (bi_m); /* No known brand, see if the header is recognized by any brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY || bi->header_supported == NULL) continue; if (hdr->e_machine == bi->machine) { ret = bi->header_supported(imgp); if (ret) return (bi); } } /* Lacking a known brand, search for a recognized interpreter. 
*/ if (interp != NULL) { for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || (bi->flags & (BI_BRAND_NOTE_MANDATORY | BI_BRAND_ONLY_STATIC)) != 0) continue; if (hdr->e_machine == bi->machine && bi->interp_path != NULL && /* ELF image p_filesz includes terminating zero */ strlen(bi->interp_path) + 1 == interp_name_len && strncmp(interp, bi->interp_path, interp_name_len) == 0 && (bi->header_supported == NULL || bi->header_supported(imgp))) return (bi); } } /* Lacking a recognized interpreter, try the default brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || (bi->flags & BI_BRAND_NOTE_MANDATORY) != 0 || (interp != NULL && (bi->flags & BI_BRAND_ONLY_STATIC) != 0)) continue; if (hdr->e_machine == bi->machine && __elfN(fallback_brand) == bi->brand && (bi->header_supported == NULL || bi->header_supported(imgp))) return (bi); } return (NULL); } static int __elfN(check_header)(const Elf_Ehdr *hdr) { Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_phentsize != sizeof(Elf_Phdr) || hdr->e_version != ELF_TARG_VER) return (ENOEXEC); /* * Make sure we have at least one brand for this machine. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && bi->machine == hdr->e_machine) break; } if (i == MAX_BRANDS) return (ENOEXEC); return (0); } static int __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot) { struct sf_buf *sf; int error; vm_offset_t off; /* * Create the page if it doesn't exist yet. Ignore errors. */ vm_map_fixed(map, NULL, 0, trunc_page(start), round_page(end) - trunc_page(start), VM_PROT_ALL, VM_PROT_ALL, MAP_CHECK_EXCL); /* * Find the page from the underlying object. */ if (object != NULL) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, end - start); vm_imgact_unmap_page(sf); if (error != 0) return (KERN_FAILURE); } return (KERN_SUCCESS); } static int __elfN(map_insert)(struct image_params *imgp, vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow) { struct sf_buf *sf; vm_offset_t off; vm_size_t sz; int error, locked, rv; if (start != trunc_page(start)) { rv = __elfN(map_partial)(map, object, offset, start, round_page(start), prot); if (rv != KERN_SUCCESS) return (rv); offset += round_page(start) - start; start = round_page(start); } if (end != round_page(end)) { rv = __elfN(map_partial)(map, object, offset + trunc_page(end) - start, trunc_page(end), end, prot); if (rv != KERN_SUCCESS) return (rv); end = trunc_page(end); } if (start >= end) return (KERN_SUCCESS); if ((offset & PAGE_MASK) != 0) { /* * The mapping is not page aligned. This means that we have * to copy the data. 
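 * The copy path below maps anonymous, writable pages at the target
 * range and then copies the file contents out page by page through
 * temporary sf_buf mappings of the backing object.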
*/ rv = vm_map_fixed(map, NULL, 0, start, end - start, prot | VM_PROT_WRITE, VM_PROT_ALL, MAP_CHECK_EXCL); if (rv != KERN_SUCCESS) return (rv); if (object == NULL) return (KERN_SUCCESS); for (; start < end; start += sz) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); sz = end - start; if (sz > PAGE_SIZE - off) sz = PAGE_SIZE - off; error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, sz); vm_imgact_unmap_page(sf); if (error != 0) return (KERN_FAILURE); offset += sz; } } else { vm_object_reference(object); rv = vm_map_fixed(map, object, offset, start, end - start, prot, VM_PROT_ALL, cow | MAP_CHECK_EXCL | (object != NULL ? MAP_VN_EXEC : 0)); if (rv != KERN_SUCCESS) { locked = VOP_ISLOCKED(imgp->vp); VOP_UNLOCK(imgp->vp, 0); vm_object_deallocate(object); vn_lock(imgp->vp, locked | LK_RETRY); return (rv); } else if (object != NULL) { MPASS(imgp->vp->v_object == object); VOP_SET_TEXT_CHECKED(imgp->vp); } } return (KERN_SUCCESS); } static int __elfN(load_section)(struct image_params *imgp, vm_ooffset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) { struct sf_buf *sf; size_t map_len; vm_map_t map; vm_object_t object; vm_offset_t map_addr; int error, rv, cow; size_t copy_len; vm_ooffset_t file_addr; /* * It's necessary to fail if the filsz + offset taken from the * header is greater than the actual file pager object's size. * If we were to allow this, then the vm_map_find() below would * walk right off the end of the file object and into the ether. * * While I'm here, might as well check for something else that * is invalid: filsz cannot be greater than memsz. */ if ((filsz != 0 && (off_t)filsz + offset > imgp->attr->va_size) || filsz > memsz) { uprintf("elf_load_section: truncated ELF file\n"); return (ENOEXEC); } object = imgp->object; map = &imgp->proc->p_vmspace->vm_map; map_addr = trunc_page((vm_offset_t)vmaddr); file_addr = trunc_page(offset); /* * We have two choices. We can either clear the data in the last page * of an oversized mapping, or we can start the anon mapping a page * early and copy the initialized data into that first page. We * choose the second. */ if (filsz == 0) map_len = 0; else if (memsz > filsz) map_len = trunc_page(offset + filsz) - file_addr; else map_len = round_page(offset + filsz) - file_addr; if (map_len != 0) { /* cow flags: don't dump readonly sections in core */ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT | (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP); rv = __elfN(map_insert)(imgp, map, object, file_addr, map_addr, map_addr + map_len, prot, cow); if (rv != KERN_SUCCESS) return (EINVAL); /* we can stop now if we've covered it all */ if (memsz == filsz) return (0); } /* * We have to get the remaining bit of the file into the first part * of the oversized map segment. This is normally because the .data * segment in the file is extended to provide bss. It's a neat idea * to try and save a page, but it's a pain in the behind to implement. */ copy_len = filsz == 0 ? 0 : (offset + filsz) - trunc_page(offset + filsz); map_addr = trunc_page((vm_offset_t)vmaddr + filsz); map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr; /* This had damn well better be true! 
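 * map_addr/map_len now cover the page aligned anonymous region that
 * provides the bss beyond the end of the file backed data.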
*/ if (map_len != 0) { rv = __elfN(map_insert)(imgp, map, NULL, 0, map_addr, map_addr + map_len, prot, 0); if (rv != KERN_SUCCESS) return (EINVAL); } if (copy_len != 0) { sf = vm_imgact_map_page(object, offset + filsz); if (sf == NULL) return (EIO); /* send the page fragment to user space */ error = copyout((caddr_t)sf_buf_kva(sf), (caddr_t)map_addr, copy_len); vm_imgact_unmap_page(sf); if (error != 0) return (error); } /* * Remove write access to the page if it was only granted by map_insert * to allow copyout. */ if ((prot & VM_PROT_WRITE) == 0) vm_map_protect(map, trunc_page(map_addr), round_page(map_addr + map_len), prot, FALSE); return (0); } static int __elfN(load_sections)(struct image_params *imgp, const Elf_Ehdr *hdr, const Elf_Phdr *phdr, u_long rbase, u_long *base_addrp) { vm_prot_t prot; u_long base_addr; bool first; int error, i; ASSERT_VOP_LOCKED(imgp->vp, __func__); base_addr = 0; first = true; for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type != PT_LOAD || phdr[i].p_memsz == 0) continue; /* Loadable segment */ prot = __elfN(trans_prot)(phdr[i].p_flags); error = __elfN(load_section)(imgp, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase, phdr[i].p_memsz, phdr[i].p_filesz, prot); if (error != 0) return (error); /* * Establish the base address if this is the first segment. */ if (first) { base_addr = trunc_page(phdr[i].p_vaddr + rbase); first = false; } } if (base_addrp != NULL) *base_addrp = base_addr; return (0); } /* * Load the file "file" into memory. It may be either a shared object * or an executable. * * The "addr" reference parameter is in/out. On entry, it specifies * the address where a shared object should be loaded. If the file is * an executable, this value is ignored. On exit, "addr" specifies * where the file was actually loaded. * * The "entry" reference parameter is out only. On exit, it specifies * the entry point for the loaded file. */ static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry) { struct { struct nameidata nd; struct vattr attr; struct image_params image_params; } *tempdata; const Elf_Ehdr *hdr = NULL; const Elf_Phdr *phdr = NULL; struct nameidata *nd; struct vattr *attr; struct image_params *imgp; u_long rbase; u_long base_addr = 0; int error; #ifdef CAPABILITY_MODE /* * XXXJA: This check can go away once we are sufficiently confident * that the checks in namei() are correct. */ if (IN_CAPABILITY_MODE(curthread)) return (ECAPMODE); #endif tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK | M_ZERO); nd = &tempdata->nd; attr = &tempdata->attr; imgp = &tempdata->image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->attr = attr; NDINIT(nd, LOOKUP, ISOPEN | FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, file, curthread); if ((error = namei(nd)) != 0) { nd->ni_vp = NULL; goto fail; } NDFREE(nd, NDF_ONLY_PNBUF); imgp->vp = nd->ni_vp; /* * Check permissions, modes, uid, etc on the file, and "open" it. 
*/ error = exec_check_permissions(imgp); if (error) goto fail; error = exec_map_first_page(imgp); if (error) goto fail; imgp->object = nd->ni_vp->v_object; hdr = (const Elf_Ehdr *)imgp->image_header; if ((error = __elfN(check_header)(hdr)) != 0) goto fail; if (hdr->e_type == ET_DYN) rbase = *addr; else if (hdr->e_type == ET_EXEC) rbase = 0; else { error = ENOEXEC; goto fail; } /* Only support headers that fit within first page for now */ if ((hdr->e_phoff > PAGE_SIZE) || (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) { error = ENOEXEC; goto fail; } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) { error = ENOEXEC; goto fail; } error = __elfN(load_sections)(imgp, hdr, phdr, rbase, &base_addr); if (error != 0) goto fail; *addr = base_addr; *entry = (unsigned long)hdr->e_entry + rbase; fail: if (imgp->firstpage) exec_unmap_first_page(imgp); if (nd->ni_vp) { if (imgp->textset) VOP_UNSET_TEXT_CHECKED(nd->ni_vp); vput(nd->ni_vp); } free(tempdata, M_TEMP); return (error); } static u_long __CONCAT(rnd_, __elfN(base))(vm_map_t map __unused, u_long minv, u_long maxv, u_int align) { u_long rbase, res; MPASS(vm_map_min(map) <= minv); MPASS(maxv <= vm_map_max(map)); MPASS(minv < maxv); MPASS(minv + align < maxv); arc4rand(&rbase, sizeof(rbase), 0); res = roundup(minv, (u_long)align) + rbase % (maxv - minv); res &= ~((u_long)align - 1); if (res >= maxv) res -= align; KASSERT(res >= minv, ("res %#lx < minv %#lx, maxv %#lx rbase %#lx", res, minv, maxv, rbase)); KASSERT(res < maxv, ("res %#lx > maxv %#lx, minv %#lx rbase %#lx", res, maxv, minv, rbase)); return (res); } static int __elfN(enforce_limits)(struct image_params *imgp, const Elf_Ehdr *hdr, const Elf_Phdr *phdr, u_long et_dyn_addr) { struct vmspace *vmspace; const char *err_str; u_long text_size, data_size, total_size, text_addr, data_addr; u_long seg_size, seg_addr; int i; err_str = NULL; text_size = data_size = total_size = text_addr = data_addr = 0; for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type != PT_LOAD || phdr[i].p_memsz == 0) continue; seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr); seg_size = round_page(phdr[i].p_memsz + phdr[i].p_vaddr + et_dyn_addr - seg_addr); /* * Make the largest executable segment the official * text segment and all others data. * * Note that obreak() assumes that data_addr + data_size == end * of data load area, and the ELF file format expects segments * to be sorted by address. If multiple data segments exist, * the last one will be used. */ if ((phdr[i].p_flags & PF_X) != 0 && text_size < seg_size) { text_size = seg_size; text_addr = seg_addr; } else { data_size = seg_size; data_addr = seg_addr; } total_size += seg_size; } if (data_addr == 0 && data_size == 0) { data_addr = text_addr; data_size = text_size; } /* * Check limits. It should be safe to check the * limits after loading the segments since we do * not actually fault in all the segments pages. 
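 * The data size is checked against RLIMIT_DATA, the text size
 * against the system-wide maxtsiz, and the total size against
 * RLIMIT_VMEM as well as the corresponding racct resources.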
*/ PROC_LOCK(imgp->proc); if (data_size > lim_cur_proc(imgp->proc, RLIMIT_DATA)) err_str = "Data segment size exceeds process limit"; else if (text_size > maxtsiz) err_str = "Text segment size exceeds system limit"; else if (total_size > lim_cur_proc(imgp->proc, RLIMIT_VMEM)) err_str = "Total segment size exceeds process limit"; else if (racct_set(imgp->proc, RACCT_DATA, data_size) != 0) err_str = "Data segment size exceeds resource limit"; else if (racct_set(imgp->proc, RACCT_VMEM, total_size) != 0) err_str = "Total segment size exceeds resource limit"; PROC_UNLOCK(imgp->proc); if (err_str != NULL) { uprintf("%s\n", err_str); return (ENOMEM); } vmspace = imgp->proc->p_vmspace; vmspace->vm_tsize = text_size >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; vmspace->vm_dsize = data_size >> PAGE_SHIFT; vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; return (0); } static int __elfN(get_interp)(struct image_params *imgp, const Elf_Phdr *phdr, char **interpp, bool *free_interpp) { struct thread *td; char *interp; int error, interp_name_len; KASSERT(phdr->p_type == PT_INTERP, ("%s: p_type %u != PT_INTERP", __func__, phdr->p_type)); ASSERT_VOP_LOCKED(imgp->vp, __func__); td = curthread; /* Path to interpreter */ if (phdr->p_filesz < 2 || phdr->p_filesz > MAXPATHLEN) { uprintf("Invalid PT_INTERP\n"); return (ENOEXEC); } interp_name_len = phdr->p_filesz; if (phdr->p_offset > PAGE_SIZE || interp_name_len > PAGE_SIZE - phdr->p_offset) { /* * The vnode lock might be needed by the pagedaemon to * clean pages owned by the vnode. Do not allow sleep * waiting for memory with the vnode locked, instead * try non-sleepable allocation first, and if it * fails, go to the slow path were we drop the lock * and do M_WAITOK. A text reference prevents * modifications to the vnode content. 
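 * Concretely: try a non-sleeping allocation while the vnode stays
 * locked; only if that fails, unlock, allocate with M_WAITOK and
 * re-lock the vnode shared before reading the interpreter path.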
*/ interp = malloc(interp_name_len + 1, M_TEMP, M_NOWAIT); if (interp == NULL) { VOP_UNLOCK(imgp->vp, 0); interp = malloc(interp_name_len + 1, M_TEMP, M_WAITOK); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } error = vn_rdwr(UIO_READ, imgp->vp, interp, interp_name_len, phdr->p_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td); if (error != 0) { free(interp, M_TEMP); uprintf("i/o error PT_INTERP %d\n", error); return (error); } interp[interp_name_len] = '\0'; *interpp = interp; *free_interpp = true; return (0); } interp = __DECONST(char *, imgp->image_header) + phdr->p_offset; if (interp[interp_name_len - 1] != '\0') { uprintf("Invalid PT_INTERP\n"); return (ENOEXEC); } *interpp = interp; *free_interpp = false; return (0); } static int __elfN(load_interp)(struct image_params *imgp, const Elf_Brandinfo *brand_info, const char *interp, u_long *addr, u_long *entry) { char *path; int error; if (brand_info->emul_path != NULL && brand_info->emul_path[0] != '\0') { path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path, interp); error = __elfN(load_file)(imgp->proc, path, addr, entry); free(path, M_TEMP); if (error == 0) return (0); } if (brand_info->interp_newpath != NULL && (brand_info->interp_path == NULL || strcmp(interp, brand_info->interp_path) == 0)) { error = __elfN(load_file)(imgp->proc, brand_info->interp_newpath, addr, entry); if (error == 0) return (0); } error = __elfN(load_file)(imgp->proc, interp, addr, entry); if (error == 0) return (0); uprintf("ELF interpreter %s not found, error %d\n", interp, error); return (error); } /* * Impossible et_dyn_addr initial value indicating that the real base * must be calculated later with some randomization applied. */ #define ET_DYN_ADDR_RAND 1 static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) { struct thread *td; const Elf_Ehdr *hdr; const Elf_Phdr *phdr; Elf_Auxargs *elf_auxargs; struct vmspace *vmspace; vm_map_t map; char *interp; Elf_Brandinfo *brand_info; struct sysentvec *sv; u_long addr, baddr, et_dyn_addr, entry, proghdr; u_long maxalign, mapsz, maxv, maxv1; uint32_t fctl0; int32_t osrel; bool free_interp; int error, i, n; hdr = (const Elf_Ehdr *)imgp->image_header; /* * Do we have a valid ELF header ? * * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later * if particular brand doesn't support it. */ if (__elfN(check_header)(hdr) != 0 || (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)) return (-1); /* * From here on down, we return an errno, not -1, as we've * detected an ELF file. */ if ((hdr->e_phoff > PAGE_SIZE) || (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) { /* Only support headers in first page for now */ uprintf("Program headers not in the first page\n"); return (ENOEXEC); } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) { uprintf("Unaligned program headers\n"); return (ENOEXEC); } n = error = 0; baddr = 0; osrel = 0; fctl0 = 0; entry = proghdr = 0; interp = NULL; free_interp = false; td = curthread; maxalign = PAGE_SIZE; mapsz = 0; for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: if (n == 0) baddr = phdr[i].p_vaddr; if (phdr[i].p_align > maxalign) maxalign = phdr[i].p_align; mapsz += phdr[i].p_memsz; n++; /* * If this segment contains the program headers, * remember their virtual address for the AT_PHDR * aux entry. Static binaries don't usually include * a PT_PHDR entry. 
*/ if (phdr[i].p_offset == 0 && hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize <= phdr[i].p_filesz) proghdr = phdr[i].p_vaddr + hdr->e_phoff; break; case PT_INTERP: /* Path to interpreter */ if (interp != NULL) { uprintf("Multiple PT_INTERP headers\n"); error = ENOEXEC; goto ret; } error = __elfN(get_interp)(imgp, &phdr[i], &interp, &free_interp); if (error != 0) goto ret; break; case PT_GNU_STACK: if (__elfN(nxstack)) imgp->stack_prot = __elfN(trans_prot)(phdr[i].p_flags); imgp->stack_sz = phdr[i].p_memsz; break; case PT_PHDR: /* Program header table info */ proghdr = phdr[i].p_vaddr; break; } } brand_info = __elfN(get_brandinfo)(imgp, interp, &osrel, &fctl0); if (brand_info == NULL) { uprintf("ELF binary type \"%u\" not known.\n", hdr->e_ident[EI_OSABI]); error = ENOEXEC; goto ret; } sv = brand_info->sysvec; et_dyn_addr = 0; if (hdr->e_type == ET_DYN) { if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0) { uprintf("Cannot execute shared object\n"); error = ENOEXEC; goto ret; } /* * Honour the base load address from the dso if it is * non-zero for some reason. */ if (baddr == 0) { if ((sv->sv_flags & SV_ASLR) == 0 || (fctl0 & NT_FREEBSD_FCTL_ASLR_DISABLE) != 0) - et_dyn_addr = ET_DYN_LOAD_ADDR; + et_dyn_addr = __elfN(pie_base); else if ((__elfN(pie_aslr_enabled) && (imgp->proc->p_flag2 & P2_ASLR_DISABLE) == 0) || (imgp->proc->p_flag2 & P2_ASLR_ENABLE) != 0) et_dyn_addr = ET_DYN_ADDR_RAND; else - et_dyn_addr = ET_DYN_LOAD_ADDR; + et_dyn_addr = __elfN(pie_base); } } /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. * * The VV_TEXT flag prevents modifications to the executable while * the vnode is unlocked. */ VOP_UNLOCK(imgp->vp, 0); /* * Decide whether to enable randomization of user mappings. * First, reset user preferences for the setid binaries. * Then, account for the support of the randomization by the * ABI, by user preferences, and make special treatment for * PIE binaries. */ if (imgp->credential_setid) { PROC_LOCK(imgp->proc); imgp->proc->p_flag2 &= ~(P2_ASLR_ENABLE | P2_ASLR_DISABLE); PROC_UNLOCK(imgp->proc); } if ((sv->sv_flags & SV_ASLR) == 0 || (imgp->proc->p_flag2 & P2_ASLR_DISABLE) != 0 || (fctl0 & NT_FREEBSD_FCTL_ASLR_DISABLE) != 0) { KASSERT(et_dyn_addr != ET_DYN_ADDR_RAND, ("et_dyn_addr == RAND and !ASLR")); } else if ((imgp->proc->p_flag2 & P2_ASLR_ENABLE) != 0 || (__elfN(aslr_enabled) && hdr->e_type == ET_EXEC) || et_dyn_addr == ET_DYN_ADDR_RAND) { imgp->map_flags |= MAP_ASLR; /* * If user does not care about sbrk, utilize the bss * grow region for mappings as well. We can select * the base for the image anywere and still not suffer * from the fragmentation. 
*/ if (!__elfN(aslr_honor_sbrk) || (imgp->proc->p_flag2 & P2_ASLR_IGNSTART) != 0) imgp->map_flags |= MAP_ASLR_IGNSTART; } error = exec_new_vmspace(imgp, sv); vmspace = imgp->proc->p_vmspace; map = &vmspace->vm_map; imgp->proc->p_sysent = sv; maxv = vm_map_max(map) - lim_max(td, RLIMIT_STACK); if (et_dyn_addr == ET_DYN_ADDR_RAND) { KASSERT((map->flags & MAP_ASLR) != 0, ("ET_DYN_ADDR_RAND but !MAP_ASLR")); et_dyn_addr = __CONCAT(rnd_, __elfN(base))(map, vm_map_min(map) + mapsz + lim_max(td, RLIMIT_DATA), /* reserve half of the address space to interpreter */ maxv / 2, 1UL << flsl(maxalign)); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto ret; error = __elfN(load_sections)(imgp, hdr, phdr, et_dyn_addr, NULL); if (error != 0) goto ret; error = __elfN(enforce_limits)(imgp, hdr, phdr, et_dyn_addr); if (error != 0) goto ret; entry = (u_long)hdr->e_entry + et_dyn_addr; /* * We load the dynamic linker where a userland call * to mmap(0, ...) would put it. The rationale behind this * calculation is that it leaves room for the heap to grow to * its maximum allowed size. */ addr = round_page((vm_offset_t)vmspace->vm_daddr + lim_max(td, RLIMIT_DATA)); if ((map->flags & MAP_ASLR) != 0) { maxv1 = maxv / 2 + addr / 2; MPASS(maxv1 >= addr); /* No overflow */ map->anon_loc = __CONCAT(rnd_, __elfN(base))(map, addr, maxv1, MAXPAGESIZES > 1 ? pagesizes[1] : pagesizes[0]); } else { map->anon_loc = addr; } imgp->entry_addr = entry; if (interp != NULL) { VOP_UNLOCK(imgp->vp, 0); if ((map->flags & MAP_ASLR) != 0) { /* Assume that interpeter fits into 1/4 of AS */ maxv1 = maxv / 2 + addr / 2; MPASS(maxv1 >= addr); /* No overflow */ addr = __CONCAT(rnd_, __elfN(base))(map, addr, maxv1, PAGE_SIZE); } error = __elfN(load_interp)(imgp, brand_info, interp, &addr, &imgp->entry_addr); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto ret; } else addr = et_dyn_addr; /* * Construct auxargs table (used by the fixup routine) */ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_NOWAIT); if (elf_auxargs == NULL) { VOP_UNLOCK(imgp->vp, 0); elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } elf_auxargs->execfd = -1; elf_auxargs->phdr = proghdr + et_dyn_addr; elf_auxargs->phent = hdr->e_phentsize; elf_auxargs->phnum = hdr->e_phnum; elf_auxargs->pagesz = PAGE_SIZE; elf_auxargs->base = addr; elf_auxargs->flags = 0; elf_auxargs->entry = entry; elf_auxargs->hdr_eflags = hdr->e_flags; imgp->auxargs = elf_auxargs; imgp->interpreted = 0; imgp->reloc_base = addr; imgp->proc->p_osrel = osrel; imgp->proc->p_fctl0 = fctl0; imgp->proc->p_elf_machine = hdr->e_machine; imgp->proc->p_elf_flags = hdr->e_flags; ret: if (free_interp) free(interp, M_TEMP); return (error); } #define suword __CONCAT(suword, __ELF_WORD_SIZE) int __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp) { Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; Elf_Auxinfo *argarray, *pos; Elf_Addr *base, *auxbase; int error; base = (Elf_Addr *)*stack_base; auxbase = base + imgp->args->argc + 1 + imgp->args->envc + 1; argarray = pos = malloc(AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, 
AT_EHDRFLAGS, args->hdr_eflags); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp); AUXARGS_ENTRY(pos, AT_OSRELDATE, imgp->proc->p_ucred->cr_prison->pr_osreldate); if (imgp->canary != 0) { AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary); AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen); } AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus); if (imgp->pagesizes != 0) { AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes); AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen); } if (imgp->sysent->sv_timekeep_base != 0) { AUXARGS_ENTRY(pos, AT_TIMEKEEP, imgp->sysent->sv_timekeep_base); } AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : imgp->sysent->sv_stackprot); if (imgp->sysent->sv_hwcap != NULL) AUXARGS_ENTRY(pos, AT_HWCAP, *imgp->sysent->sv_hwcap); if (imgp->sysent->sv_hwcap2 != NULL) AUXARGS_ENTRY(pos, AT_HWCAP2, *imgp->sysent->sv_hwcap2); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= AT_COUNT, ("Too many auxargs")); error = copyout(argarray, auxbase, sizeof(*argarray) * AT_COUNT); free(argarray, M_TEMP); if (error != 0) return (error); base--; if (suword(base, imgp->args->argc) == -1) return (EFAULT); *stack_base = (register_t *)base; return (0); } /* * Code for generating ELF core dumps. */ typedef void (*segment_callback)(vm_map_entry_t, void *); /* Closure for cb_put_phdr(). */ struct phdr_closure { Elf_Phdr *phdr; /* Program header to fill in */ Elf_Off offset; /* Offset of segment in core file */ }; /* Closure for cb_size_segment(). */ struct sseg_closure { int count; /* Count of writable segments. */ size_t size; /* Total size of all writable segments. */ }; typedef void (*outfunc_t)(void *, struct sbuf *, size_t *); struct note_info { int type; /* Note type. */ outfunc_t outfunc; /* Output function. */ void *outarg; /* Argument for the output function. */ size_t outsize; /* Output size. */ TAILQ_ENTRY(note_info) link; /* Link to the next note info. */ }; TAILQ_HEAD(note_info_list, note_info); /* Coredump output parameters. 
*/ struct coredump_params { off_t offset; struct ucred *active_cred; struct ucred *file_cred; struct thread *td; struct vnode *vp; struct compressor *comp; }; extern int compress_user_cores; extern int compress_user_cores_level; static void cb_put_phdr(vm_map_entry_t, void *); static void cb_size_segment(vm_map_entry_t, void *); static int core_write(struct coredump_params *, const void *, size_t, off_t, enum uio_seg); static void each_dumpable_segment(struct thread *, segment_callback, void *); static int __elfN(corehdr)(struct coredump_params *, int, void *, size_t, struct note_info_list *, size_t); static void __elfN(prepare_notes)(struct thread *, struct note_info_list *, size_t *); static void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t); static void __elfN(putnote)(struct note_info *, struct sbuf *); static size_t register_note(struct note_info_list *, int, outfunc_t, void *); static int sbuf_drain_core_output(void *, const char *, int); static void __elfN(note_fpregset)(void *, struct sbuf *, size_t *); static void __elfN(note_prpsinfo)(void *, struct sbuf *, size_t *); static void __elfN(note_prstatus)(void *, struct sbuf *, size_t *); static void __elfN(note_threadmd)(void *, struct sbuf *, size_t *); static void __elfN(note_thrmisc)(void *, struct sbuf *, size_t *); static void __elfN(note_ptlwpinfo)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_auxv)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_proc)(void *, struct sbuf *, size_t *); static void __elfN(note_procstat_psstrings)(void *, struct sbuf *, size_t *); static void note_procstat_files(void *, struct sbuf *, size_t *); static void note_procstat_groups(void *, struct sbuf *, size_t *); static void note_procstat_osrel(void *, struct sbuf *, size_t *); static void note_procstat_rlimit(void *, struct sbuf *, size_t *); static void note_procstat_umask(void *, struct sbuf *, size_t *); static void note_procstat_vmmap(void *, struct sbuf *, size_t *); /* * Write out a core segment to the compression stream. */ static int compress_chunk(struct coredump_params *p, char *base, char *buf, u_int len) { u_int chunk_len; int error; while (len > 0) { chunk_len = MIN(len, CORE_BUF_SIZE); /* * We can get EFAULT error here. * In that case zero out the current chunk of the segment. */ error = copyin(base, buf, chunk_len); if (error != 0) bzero(buf, chunk_len); error = compressor_write(p->comp, buf, chunk_len); if (error != 0) break; base += chunk_len; len -= chunk_len; } return (error); } static int core_compressed_write(void *base, size_t len, off_t offset, void *arg) { return (core_write((struct coredump_params *)arg, base, len, offset, UIO_SYSSPACE)); } static int core_write(struct coredump_params *p, const void *base, size_t len, off_t offset, enum uio_seg seg) { return (vn_rdwr_inchunks(UIO_WRITE, p->vp, __DECONST(void *, base), len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED, p->active_cred, p->file_cred, NULL, p->td)); } static int core_output(void *base, size_t len, off_t offset, struct coredump_params *p, void *tmpbuf) { int error; if (p->comp != NULL) return (compress_chunk(p, base, tmpbuf, len)); /* * EFAULT is a non-fatal error that we can get, for example, * if the segment is backed by a file but extends beyond its * end. 
*/ error = core_write(p, base, len, offset, UIO_USERSPACE); if (error == EFAULT) { log(LOG_WARNING, "Failed to fully fault in a core file segment " "at VA %p with size 0x%zx to be written at offset 0x%jx " "for process %s\n", base, len, offset, curproc->p_comm); /* * Write a "real" zero byte at the end of the target region * in the case this is the last segment. * The intermediate space will be implicitly zero-filled. */ error = core_write(p, zero_region, 1, offset + len - 1, UIO_SYSSPACE); } return (error); } /* * Drain into a core file. */ static int sbuf_drain_core_output(void *arg, const char *data, int len) { struct coredump_params *p; int error, locked; p = (struct coredump_params *)arg; /* * Some kern_proc out routines that print to this sbuf may * call us with the process lock held. Draining with the * non-sleepable lock held is unsafe. The lock is needed for * those routines when dumping a live process. In our case we * can safely release the lock before draining and acquire * again after. */ locked = PROC_LOCKED(p->td->td_proc); if (locked) PROC_UNLOCK(p->td->td_proc); if (p->comp != NULL) error = compressor_write(p->comp, __DECONST(char *, data), len); else error = core_write(p, __DECONST(void *, data), len, p->offset, UIO_SYSSPACE); if (locked) PROC_LOCK(p->td->td_proc); if (error != 0) return (-error); p->offset += len; return (len); } int __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) { struct ucred *cred = td->td_ucred; int error = 0; struct sseg_closure seginfo; struct note_info_list notelst; struct coredump_params params; struct note_info *ninfo; void *hdr, *tmpbuf; size_t hdrsize, notesz, coresize; hdr = NULL; tmpbuf = NULL; TAILQ_INIT(¬elst); /* Size the program segments. */ seginfo.count = 0; seginfo.size = 0; each_dumpable_segment(td, cb_size_segment, &seginfo); /* * Collect info about the core file header area. */ hdrsize = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * (1 + seginfo.count); if (seginfo.count + 1 >= PN_XNUM) hdrsize += sizeof(Elf_Shdr); __elfN(prepare_notes)(td, ¬elst, ¬esz); coresize = round_page(hdrsize + notesz) + seginfo.size; /* Set up core dump parameters. */ params.offset = 0; params.active_cred = cred; params.file_cred = NOCRED; params.td = td; params.vp = vp; params.comp = NULL; #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_CORE, coresize); PROC_UNLOCK(td->td_proc); if (error != 0) { error = EFAULT; goto done; } } #endif if (coresize >= limit) { error = EFAULT; goto done; } /* Create a compression stream if necessary. */ if (compress_user_cores != 0) { params.comp = compressor_init(core_compressed_write, compress_user_cores, CORE_BUF_SIZE, compress_user_cores_level, ¶ms); if (params.comp == NULL) { error = EFAULT; goto done; } tmpbuf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO); } /* * Allocate memory for building the header, fill it up, * and write it out following the notes. */ hdr = malloc(hdrsize, M_TEMP, M_WAITOK); error = __elfN(corehdr)(¶ms, seginfo.count, hdr, hdrsize, ¬elst, notesz); /* Write the contents of all of the writable segments. 
*/ if (error == 0) { Elf_Phdr *php; off_t offset; int i; php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; offset = round_page(hdrsize + notesz); for (i = 0; i < seginfo.count; i++) { error = core_output((caddr_t)(uintptr_t)php->p_vaddr, php->p_filesz, offset, ¶ms, tmpbuf); if (error != 0) break; offset += php->p_filesz; php++; } if (error == 0 && params.comp != NULL) error = compressor_flush(params.comp); } if (error) { log(LOG_WARNING, "Failed to write core file for process %s (error %d)\n", curproc->p_comm, error); } done: free(tmpbuf, M_TEMP); if (params.comp != NULL) compressor_fini(params.comp); while ((ninfo = TAILQ_FIRST(¬elst)) != NULL) { TAILQ_REMOVE(¬elst, ninfo, link); free(ninfo, M_TEMP); } if (hdr != NULL) free(hdr, M_TEMP); return (error); } /* * A callback for each_dumpable_segment() to write out the segment's * program header entry. */ static void cb_put_phdr(vm_map_entry_t entry, void *closure) { struct phdr_closure *phc = (struct phdr_closure *)closure; Elf_Phdr *phdr = phc->phdr; phc->offset = round_page(phc->offset); phdr->p_type = PT_LOAD; phdr->p_offset = phc->offset; phdr->p_vaddr = entry->start; phdr->p_paddr = 0; phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; phdr->p_align = PAGE_SIZE; phdr->p_flags = __elfN(untrans_prot)(entry->protection); phc->offset += phdr->p_filesz; phc->phdr++; } /* * A callback for each_dumpable_segment() to gather information about * the number of segments and their total size. */ static void cb_size_segment(vm_map_entry_t entry, void *closure) { struct sseg_closure *ssc = (struct sseg_closure *)closure; ssc->count++; ssc->size += entry->end - entry->start; } /* * For each writable segment in the process's memory map, call the given * function with a pointer to the map entry and some arbitrary * caller-supplied data. */ static void each_dumpable_segment(struct thread *td, segment_callback func, void *closure) { struct proc *p = td->td_proc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; vm_object_t backing_object, object; boolean_t ignore_entry; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { /* * Don't dump inaccessible mappings, deal with legacy * coredump mode. * * Note that read-only segments related to the elf binary * are marked MAP_ENTRY_NOCOREDUMP now so we no longer * need to arbitrarily ignore such segments. */ if (elf_legacy_coredump) { if ((entry->protection & VM_PROT_RW) != VM_PROT_RW) continue; } else { if ((entry->protection & VM_PROT_ALL) == 0) continue; } /* * Dont include memory segment in the coredump if * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in * madvise(2). Do not dump submaps (i.e. parts of the * kernel map). */ if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP)) continue; if ((object = entry->object.vm_object) == NULL) continue; /* Ignore memory-mapped devices and such things. */ VM_OBJECT_RLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_RLOCK(backing_object); VM_OBJECT_RUNLOCK(object); object = backing_object; } ignore_entry = object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE && object->type != OBJT_PHYS; VM_OBJECT_RUNLOCK(object); if (ignore_entry) continue; (*func)(entry, closure); } vm_map_unlock_read(map); } /* * Write the core file header to the file, including padding up to * the page boundary. 
*/ static int __elfN(corehdr)(struct coredump_params *p, int numsegs, void *hdr, size_t hdrsize, struct note_info_list *notelst, size_t notesz) { struct note_info *ninfo; struct sbuf *sb; int error; /* Fill in the header. */ bzero(hdr, hdrsize); __elfN(puthdr)(p->td, hdr, hdrsize, numsegs, notesz); sb = sbuf_new(NULL, NULL, CORE_BUF_SIZE, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_drain_core_output, p); sbuf_start_section(sb, NULL); sbuf_bcat(sb, hdr, hdrsize); TAILQ_FOREACH(ninfo, notelst, link) __elfN(putnote)(ninfo, sb); /* Align up to a page boundary for the program segments. */ sbuf_end_section(sb, -1, PAGE_SIZE, 0); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static void __elfN(prepare_notes)(struct thread *td, struct note_info_list *list, size_t *sizep) { struct proc *p; struct thread *thr; size_t size; p = td->td_proc; size = 0; size += register_note(list, NT_PRPSINFO, __elfN(note_prpsinfo), p); /* * To have the debugger select the right thread (LWP) as the initial * thread, we dump the state of the thread passed to us in td first. * This is the thread that causes the core dump and thus likely to * be the right thread one wants to have selected in the debugger. */ thr = td; while (thr != NULL) { size += register_note(list, NT_PRSTATUS, __elfN(note_prstatus), thr); size += register_note(list, NT_FPREGSET, __elfN(note_fpregset), thr); size += register_note(list, NT_THRMISC, __elfN(note_thrmisc), thr); size += register_note(list, NT_PTLWPINFO, __elfN(note_ptlwpinfo), thr); size += register_note(list, -1, __elfN(note_threadmd), thr); thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) : TAILQ_NEXT(thr, td_plist); if (thr == td) thr = TAILQ_NEXT(thr, td_plist); } size += register_note(list, NT_PROCSTAT_PROC, __elfN(note_procstat_proc), p); size += register_note(list, NT_PROCSTAT_FILES, note_procstat_files, p); size += register_note(list, NT_PROCSTAT_VMMAP, note_procstat_vmmap, p); size += register_note(list, NT_PROCSTAT_GROUPS, note_procstat_groups, p); size += register_note(list, NT_PROCSTAT_UMASK, note_procstat_umask, p); size += register_note(list, NT_PROCSTAT_RLIMIT, note_procstat_rlimit, p); size += register_note(list, NT_PROCSTAT_OSREL, note_procstat_osrel, p); size += register_note(list, NT_PROCSTAT_PSSTRINGS, __elfN(note_procstat_psstrings), p); size += register_note(list, NT_PROCSTAT_AUXV, __elfN(note_procstat_auxv), p); *sizep = size; } static void __elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs, size_t notesz) { Elf_Ehdr *ehdr; Elf_Phdr *phdr; Elf_Shdr *shdr; struct phdr_closure phc; ehdr = (Elf_Ehdr *)hdr; ehdr->e_ident[EI_MAG0] = ELFMAG0; ehdr->e_ident[EI_MAG1] = ELFMAG1; ehdr->e_ident[EI_MAG2] = ELFMAG2; ehdr->e_ident[EI_MAG3] = ELFMAG3; ehdr->e_ident[EI_CLASS] = ELF_CLASS; ehdr->e_ident[EI_DATA] = ELF_DATA; ehdr->e_ident[EI_VERSION] = EV_CURRENT; ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD; ehdr->e_ident[EI_ABIVERSION] = 0; ehdr->e_ident[EI_PAD] = 0; ehdr->e_type = ET_CORE; ehdr->e_machine = td->td_proc->p_elf_machine; ehdr->e_version = EV_CURRENT; ehdr->e_entry = 0; ehdr->e_phoff = sizeof(Elf_Ehdr); ehdr->e_flags = td->td_proc->p_elf_flags; ehdr->e_ehsize = sizeof(Elf_Ehdr); ehdr->e_phentsize = sizeof(Elf_Phdr); ehdr->e_shentsize = sizeof(Elf_Shdr); ehdr->e_shstrndx = SHN_UNDEF; if (numsegs + 1 < PN_XNUM) { ehdr->e_phnum = numsegs + 1; ehdr->e_shnum = 0; } else { ehdr->e_phnum = PN_XNUM; ehdr->e_shnum = 1; ehdr->e_shoff = ehdr->e_phoff + (numsegs + 1) * ehdr->e_phentsize; KASSERT(ehdr->e_shoff == hdrsize - sizeof(Elf_Shdr), ("e_shoff: %zu, 
hdrsize - shdr: %zu", (size_t)ehdr->e_shoff, hdrsize - sizeof(Elf_Shdr))); shdr = (Elf_Shdr *)((char *)hdr + ehdr->e_shoff); memset(shdr, 0, sizeof(*shdr)); /* * A special first section is used to hold large segment and * section counts. This was proposed by Sun Microsystems in * Solaris and has been adopted by Linux; the standard ELF * tools are already familiar with the technique. * * See table 7-7 of the Solaris "Linker and Libraries Guide" * (or 12-7 depending on the version of the document) for more * details. */ shdr->sh_type = SHT_NULL; shdr->sh_size = ehdr->e_shnum; shdr->sh_link = ehdr->e_shstrndx; shdr->sh_info = numsegs + 1; } /* * Fill in the program header entries. */ phdr = (Elf_Phdr *)((char *)hdr + ehdr->e_phoff); /* The note segement. */ phdr->p_type = PT_NOTE; phdr->p_offset = hdrsize; phdr->p_vaddr = 0; phdr->p_paddr = 0; phdr->p_filesz = notesz; phdr->p_memsz = 0; phdr->p_flags = PF_R; phdr->p_align = ELF_NOTE_ROUNDSIZE; phdr++; /* All the writable segments from the program. */ phc.phdr = phdr; phc.offset = round_page(hdrsize + notesz); each_dumpable_segment(td, cb_put_phdr, &phc); } static size_t register_note(struct note_info_list *list, int type, outfunc_t out, void *arg) { struct note_info *ninfo; size_t size, notesize; size = 0; out(arg, NULL, &size); ninfo = malloc(sizeof(*ninfo), M_TEMP, M_ZERO | M_WAITOK); ninfo->type = type; ninfo->outfunc = out; ninfo->outarg = arg; ninfo->outsize = size; TAILQ_INSERT_TAIL(list, ninfo, link); if (type == -1) return (size); notesize = sizeof(Elf_Note) + /* note header */ roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) + /* note name */ roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */ return (notesize); } static size_t append_note_data(const void *src, void *dst, size_t len) { size_t padded_len; padded_len = roundup2(len, ELF_NOTE_ROUNDSIZE); if (dst != NULL) { bcopy(src, dst, len); bzero((char *)dst + len, padded_len - len); } return (padded_len); } size_t __elfN(populate_note)(int type, void *src, void *dst, size_t size, void **descp) { Elf_Note *note; char *buf; size_t notesize; buf = dst; if (buf != NULL) { note = (Elf_Note *)buf; note->n_namesz = sizeof(FREEBSD_ABI_VENDOR); note->n_descsz = size; note->n_type = type; buf += sizeof(*note); buf += append_note_data(FREEBSD_ABI_VENDOR, buf, sizeof(FREEBSD_ABI_VENDOR)); append_note_data(src, buf, size); if (descp != NULL) *descp = buf; } notesize = sizeof(Elf_Note) + /* note header */ roundup2(sizeof(FREEBSD_ABI_VENDOR), ELF_NOTE_ROUNDSIZE) + /* note name */ roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */ return (notesize); } static void __elfN(putnote)(struct note_info *ninfo, struct sbuf *sb) { Elf_Note note; ssize_t old_len, sect_len; size_t new_len, descsz, i; if (ninfo->type == -1) { ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize); return; } note.n_namesz = sizeof(FREEBSD_ABI_VENDOR); note.n_descsz = ninfo->outsize; note.n_type = ninfo->type; sbuf_bcat(sb, ¬e, sizeof(note)); sbuf_start_section(sb, &old_len); sbuf_bcat(sb, FREEBSD_ABI_VENDOR, sizeof(FREEBSD_ABI_VENDOR)); sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0); if (note.n_descsz == 0) return; sbuf_start_section(sb, &old_len); ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize); sect_len = sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0); if (sect_len < 0) return; new_len = (size_t)sect_len; descsz = roundup(note.n_descsz, ELF_NOTE_ROUNDSIZE); if (new_len < descsz) { /* * It is expected that individual note emitters will correctly * predict their expected output size and fill 
up to that size * themselves, padding in a format-specific way if needed. * However, in case they don't, just do it here with zeros. */ for (i = 0; i < descsz - new_len; i++) sbuf_putc(sb, 0); } else if (new_len > descsz) { /* * We can't always truncate sb -- we may have drained some * of it already. */ KASSERT(new_len == descsz, ("%s: Note type %u changed as we " "read it (%zu > %zu). Since it is longer than " "expected, this coredump's notes are corrupt. THIS " "IS A BUG in the note_procstat routine for type %u.\n", __func__, (unsigned)note.n_type, new_len, descsz, (unsigned)note.n_type)); } } /* * Miscellaneous note out functions. */ #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 #include #include typedef struct prstatus32 elf_prstatus_t; typedef struct prpsinfo32 elf_prpsinfo_t; typedef struct fpreg32 elf_prfpregset_t; typedef struct fpreg32 elf_fpregset_t; typedef struct reg32 elf_gregset_t; typedef struct thrmisc32 elf_thrmisc_t; #define ELF_KERN_PROC_MASK KERN_PROC_MASK32 typedef struct kinfo_proc32 elf_kinfo_proc_t; typedef uint32_t elf_ps_strings_t; #else typedef prstatus_t elf_prstatus_t; typedef prpsinfo_t elf_prpsinfo_t; typedef prfpregset_t elf_prfpregset_t; typedef prfpregset_t elf_fpregset_t; typedef gregset_t elf_gregset_t; typedef thrmisc_t elf_thrmisc_t; #define ELF_KERN_PROC_MASK 0 typedef struct kinfo_proc elf_kinfo_proc_t; typedef vm_offset_t elf_ps_strings_t; #endif static void __elfN(note_prpsinfo)(void *arg, struct sbuf *sb, size_t *sizep) { struct sbuf sbarg; size_t len; char *cp, *end; struct proc *p; elf_prpsinfo_t *psinfo; int error; p = (struct proc *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*psinfo), ("invalid size")); psinfo = malloc(sizeof(*psinfo), M_TEMP, M_ZERO | M_WAITOK); psinfo->pr_version = PRPSINFO_VERSION; psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t); strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname)); PROC_LOCK(p); if (p->p_args != NULL) { len = sizeof(psinfo->pr_psargs) - 1; if (len > p->p_args->ar_length) len = p->p_args->ar_length; memcpy(psinfo->pr_psargs, p->p_args->ar_args, len); PROC_UNLOCK(p); error = 0; } else { _PHOLD(p); PROC_UNLOCK(p); sbuf_new(&sbarg, psinfo->pr_psargs, sizeof(psinfo->pr_psargs), SBUF_FIXEDLEN); error = proc_getargv(curthread, p, &sbarg); PRELE(p); if (sbuf_finish(&sbarg) == 0) len = sbuf_len(&sbarg) - 1; else len = sizeof(psinfo->pr_psargs) - 1; sbuf_delete(&sbarg); } if (error || len == 0) strlcpy(psinfo->pr_psargs, p->p_comm, sizeof(psinfo->pr_psargs)); else { KASSERT(len < sizeof(psinfo->pr_psargs), ("len is too long: %zu vs %zu", len, sizeof(psinfo->pr_psargs))); cp = psinfo->pr_psargs; end = cp + len - 1; for (;;) { cp = memchr(cp, '\0', end - cp); if (cp == NULL) break; *cp = ' '; } } psinfo->pr_pid = p->p_pid; sbuf_bcat(sb, psinfo, sizeof(*psinfo)); free(psinfo, M_TEMP); } *sizep = sizeof(*psinfo); } static void __elfN(note_prstatus)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prstatus_t *status; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*status), ("invalid size")); status = malloc(sizeof(*status), M_TEMP, M_ZERO | M_WAITOK); status->pr_version = PRSTATUS_VERSION; status->pr_statussz = sizeof(elf_prstatus_t); status->pr_gregsetsz = sizeof(elf_gregset_t); status->pr_fpregsetsz = sizeof(elf_fpregset_t); status->pr_osreldate = osreldate; status->pr_cursig = td->td_proc->p_sig; status->pr_pid = td->td_tid; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_regs32(td, &status->pr_reg); #else fill_regs(td, &status->pr_reg); #endif 
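	/*
	 * Registers are now filled in; append the completed prstatus
	 * payload to the core file sbuf and release the temporary buffer.
	 * When called without an sbuf (the sizing pass), only the trailing
	 * *sizep assignment runs.
	 */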
sbuf_bcat(sb, status, sizeof(*status)); free(status, M_TEMP); } *sizep = sizeof(*status); } static void __elfN(note_fpregset)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_prfpregset_t *fpregset; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(*fpregset), ("invalid size")); fpregset = malloc(sizeof(*fpregset), M_TEMP, M_ZERO | M_WAITOK); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_fpregs32(td, fpregset); #else fill_fpregs(td, fpregset); #endif sbuf_bcat(sb, fpregset, sizeof(*fpregset)); free(fpregset, M_TEMP); } *sizep = sizeof(*fpregset); } static void __elfN(note_thrmisc)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; elf_thrmisc_t thrmisc; td = (struct thread *)arg; if (sb != NULL) { KASSERT(*sizep == sizeof(thrmisc), ("invalid size")); bzero(&thrmisc._pad, sizeof(thrmisc._pad)); strcpy(thrmisc.pr_tname, td->td_name); sbuf_bcat(sb, &thrmisc, sizeof(thrmisc)); } *sizep = sizeof(thrmisc); } static void __elfN(note_ptlwpinfo)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; size_t size; int structsize; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 struct ptrace_lwpinfo32 pl; #else struct ptrace_lwpinfo pl; #endif td = (struct thread *)arg; size = sizeof(structsize) + sizeof(pl); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(pl); sbuf_bcat(sb, &structsize, sizeof(structsize)); bzero(&pl, sizeof(pl)); pl.pl_lwpid = td->td_tid; pl.pl_event = PL_EVENT_NONE; pl.pl_sigmask = td->td_sigmask; pl.pl_siglist = td->td_siglist; if (td->td_si.si_signo != 0) { pl.pl_event = PL_EVENT_SIGNAL; pl.pl_flags |= PL_FLAG_SI; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 siginfo_to_siginfo32(&td->td_si, &pl.pl_siginfo); #else pl.pl_siginfo = td->td_si; #endif } strcpy(pl.pl_tdname, td->td_name); /* XXX TODO: supply more information in struct ptrace_lwpinfo*/ sbuf_bcat(sb, &pl, sizeof(pl)); } *sizep = size; } /* * Allow for MD specific notes, as well as any MD * specific preparations for writing MI notes. 
*/ static void __elfN(note_threadmd)(void *arg, struct sbuf *sb, size_t *sizep) { struct thread *td; void *buf; size_t size; td = (struct thread *)arg; size = *sizep; if (size != 0 && sb != NULL) buf = malloc(size, M_TEMP, M_ZERO | M_WAITOK); else buf = NULL; size = 0; __elfN(dump_thread)(td, buf, &size); KASSERT(sb == NULL || *sizep == size, ("invalid size")); if (size != 0 && sb != NULL) sbuf_bcat(sb, buf, size); free(buf, M_TEMP); *sizep = size; } #ifdef KINFO_PROC_SIZE CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #endif static void __elfN(note_procstat_proc)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_numthreads * sizeof(elf_kinfo_proc_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(elf_kinfo_proc_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_out(p, sb, ELF_KERN_PROC_MASK); } *sizep = size; } #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static void note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size, sect_sz, i; ssize_t start_len, sect_len; int structsize, filedesc_flags; if (coredump_pack_fileinfo) filedesc_flags = KERN_FILEDESC_PACK_KINFO; else filedesc_flags = 0; p = (struct proc *)arg; structsize = sizeof(struct kinfo_file); if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_count_drain, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, -1, filedesc_flags); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { sbuf_start_section(sb, &start_len); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_filedesc_out(p, sb, *sizep - sizeof(structsize), filedesc_flags); sect_len = sbuf_end_section(sb, start_len, 0, 0); if (sect_len < 0) return; sect_sz = sect_len; KASSERT(sect_sz <= *sizep, ("kern_proc_filedesc_out did not respect maxlen; " "requested %zu, got %zu", *sizep - sizeof(structsize), sect_sz - sizeof(structsize))); for (i = 0; i < *sizep - sect_sz && sb->s_error == 0; i++) sbuf_putc(sb, 0); } } #ifdef KINFO_VMENTRY_SIZE CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE); #endif static void note_procstat_vmmap(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize, vmmap_flags; if (coredump_pack_vmmapinfo) vmmap_flags = KERN_VMMAP_PACK_KINFO; else vmmap_flags = 0; p = (struct proc *)arg; structsize = sizeof(struct kinfo_vmentry); if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_count_drain, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb, -1, vmmap_flags); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); kern_proc_vmmap_out(p, sb, *sizep - sizeof(structsize), vmmap_flags); } } static void note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(gid_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups * sizeof(gid_t)); } *sizep = size; } static void note_procstat_umask(void *arg, struct sbuf *sb, size_t *sizep) { 
struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_fd->fd_cmask); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_fd->fd_cmask); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_fd->fd_cmask, sizeof(p->p_fd->fd_cmask)); } *sizep = size; } static void note_procstat_rlimit(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; struct rlimit rlim[RLIM_NLIMITS]; size_t size; int structsize, i; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(rlim); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(rlim); sbuf_bcat(sb, &structsize, sizeof(structsize)); PROC_LOCK(p); for (i = 0; i < RLIM_NLIMITS; i++) lim_rlimit_proc(p, i, &rlim[i]); PROC_UNLOCK(p); sbuf_bcat(sb, rlim, sizeof(rlim)); } *sizep = size; } static void note_procstat_osrel(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(p->p_osrel); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(p->p_osrel); sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &p->p_osrel, sizeof(p->p_osrel)); } *sizep = size; } static void __elfN(note_procstat_psstrings)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; elf_ps_strings_t ps_strings; size_t size; int structsize; p = (struct proc *)arg; size = sizeof(structsize) + sizeof(ps_strings); if (sb != NULL) { KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(ps_strings); #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 ps_strings = PTROUT(p->p_sysent->sv_psstrings); #else ps_strings = p->p_sysent->sv_psstrings; #endif sbuf_bcat(sb, &structsize, sizeof(structsize)); sbuf_bcat(sb, &ps_strings, sizeof(ps_strings)); } *sizep = size; } static void __elfN(note_procstat_auxv)(void *arg, struct sbuf *sb, size_t *sizep) { struct proc *p; size_t size; int structsize; p = (struct proc *)arg; if (sb == NULL) { size = 0; sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sbuf_set_drain(sb, sbuf_count_drain, &size); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); sbuf_finish(sb); sbuf_delete(sb); *sizep = size; } else { structsize = sizeof(Elf_Auxinfo); sbuf_bcat(sb, &structsize, sizeof(structsize)); PHOLD(p); proc_getauxv(curthread, p, sb); PRELE(p); } } static boolean_t __elfN(parse_notes)(struct image_params *imgp, Elf_Note *checknote, const char *note_vendor, const Elf_Phdr *pnote, boolean_t (*cb)(const Elf_Note *, void *, boolean_t *), void *cb_arg) { const Elf_Note *note, *note0, *note_end; const char *note_name; char *buf; int i, error; boolean_t res; /* We need some limit, might as well use PAGE_SIZE. 
*/ if (pnote == NULL || pnote->p_filesz > PAGE_SIZE) return (FALSE); ASSERT_VOP_LOCKED(imgp->vp, "parse_notes"); if (pnote->p_offset > PAGE_SIZE || pnote->p_filesz > PAGE_SIZE - pnote->p_offset) { buf = malloc(pnote->p_filesz, M_TEMP, M_NOWAIT); if (buf == NULL) { VOP_UNLOCK(imgp->vp, 0); buf = malloc(pnote->p_filesz, M_TEMP, M_WAITOK); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } error = vn_rdwr(UIO_READ, imgp->vp, buf, pnote->p_filesz, pnote->p_offset, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, NOCRED, NULL, curthread); if (error != 0) { uprintf("i/o error PT_NOTE\n"); goto retf; } note = note0 = (const Elf_Note *)buf; note_end = (const Elf_Note *)(buf + pnote->p_filesz); } else { note = note0 = (const Elf_Note *)(imgp->image_header + pnote->p_offset); note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset + pnote->p_filesz); buf = NULL; } for (i = 0; i < 100 && note >= note0 && note < note_end; i++) { if (!aligned(note, Elf32_Addr) || (const char *)note_end - (const char *)note < sizeof(Elf_Note)) { goto retf; } if (note->n_namesz != checknote->n_namesz || note->n_descsz != checknote->n_descsz || note->n_type != checknote->n_type) goto nextnote; note_name = (const char *)(note + 1); if (note_name + checknote->n_namesz >= (const char *)note_end || strncmp(note_vendor, note_name, checknote->n_namesz) != 0) goto nextnote; if (cb(note, cb_arg, &res)) goto ret; nextnote: note = (const Elf_Note *)((const char *)(note + 1) + roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) + roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE)); } retf: res = FALSE; ret: free(buf, M_TEMP); return (res); } struct brandnote_cb_arg { Elf_Brandnote *brandnote; int32_t *osrel; }; static boolean_t brandnote_cb(const Elf_Note *note, void *arg0, boolean_t *res) { struct brandnote_cb_arg *arg; arg = arg0; /* * Fetch the osreldate for binary from the ELF OSABI-note if * necessary. */ *res = (arg->brandnote->flags & BN_TRANSLATE_OSREL) != 0 && arg->brandnote->trans_osrel != NULL ? arg->brandnote->trans_osrel(note, arg->osrel) : TRUE; return (TRUE); } static Elf_Note fctl_note = { .n_namesz = sizeof(FREEBSD_ABI_VENDOR), .n_descsz = sizeof(uint32_t), .n_type = NT_FREEBSD_FEATURE_CTL, }; struct fctl_cb_arg { uint32_t *fctl0; }; static boolean_t note_fctl_cb(const Elf_Note *note, void *arg0, boolean_t *res) { struct fctl_cb_arg *arg; const Elf32_Word *desc; uintptr_t p; arg = arg0; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE); desc = (const Elf32_Word *)p; *arg->fctl0 = desc[0]; return (TRUE); } /* * Try to find the appropriate ABI-note section for checknote, fetch * the osreldate and feature control flags for binary from the ELF * OSABI-note. Only the first page of the image is searched, the same * as for headers. 
*/ static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote, int32_t *osrel, uint32_t *fctl0) { const Elf_Phdr *phdr; const Elf_Ehdr *hdr; struct brandnote_cb_arg b_arg; struct fctl_cb_arg f_arg; int i, j; hdr = (const Elf_Ehdr *)imgp->image_header; phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); b_arg.brandnote = brandnote; b_arg.osrel = osrel; f_arg.fctl0 = fctl0; for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_NOTE && __elfN(parse_notes)(imgp, &brandnote->hdr, brandnote->vendor, &phdr[i], brandnote_cb, &b_arg)) { for (j = 0; j < hdr->e_phnum; j++) { if (phdr[j].p_type == PT_NOTE && __elfN(parse_notes)(imgp, &fctl_note, FREEBSD_ABI_VENDOR, &phdr[j], note_fctl_cb, &f_arg)) break; } return (TRUE); } } return (FALSE); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw __elfN(execsw) = { .ex_imgact = __CONCAT(exec_, __elfN(imgact)), .ex_name = __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) }; EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); static vm_prot_t __elfN(trans_prot)(Elf_Word flags) { vm_prot_t prot; prot = 0; if (flags & PF_X) prot |= VM_PROT_EXECUTE; if (flags & PF_W) prot |= VM_PROT_WRITE; if (flags & PF_R) prot |= VM_PROT_READ; #if __ELF_WORD_SIZE == 32 && (defined(__amd64__) || defined(__i386__)) if (i386_read_exec && (flags & PF_R)) prot |= VM_PROT_EXECUTE; #endif return (prot); } static Elf_Word __elfN(untrans_prot)(vm_prot_t prot) { Elf_Word flags; flags = 0; if (prot & VM_PROT_EXECUTE) flags |= PF_X; if (prot & VM_PROT_READ) flags |= PF_R; if (prot & VM_PROT_WRITE) flags |= PF_W; return (flags); } void __elfN(stackgap)(struct image_params *imgp, u_long *stack_base) { u_long range, rbase, gap; int pct; if ((imgp->map_flags & MAP_ASLR) == 0) return; pct = __elfN(aslr_stack_gap); if (pct == 0) return; if (pct > 50) pct = 50; range = imgp->eff_stack_sz * pct / 100; arc4rand(&rbase, sizeof(rbase), 0); gap = rbase % range; gap &= ~(sizeof(u_long) - 1); *stack_base -= gap; } Index: projects/clang900-import/sys/kern/kern_timeout.c =================================================================== --- projects/clang900-import/sys/kern/kern_timeout.c (revision 352586) +++ projects/clang900-import/sys/kern/kern_timeout.c (revision 352587) @@ -1,1718 +1,1720 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_callout_profiling.h" #include "opt_ddb.h" #if defined(__arm__) #include "opt_timer.h" #endif #include "opt_rss.h" #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #include #endif #ifdef SMP #include #endif #ifndef NO_EVENTTIMERS DPCPU_DECLARE(sbintime_t, hardclocktime); #endif SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *"); SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *"); #ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); static int avg_gcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0, "Average number of Giant callouts made per softclock call. Units = 1/1000"); static int avg_lockcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, "Average number of lock callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); static int avg_depth_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, "Average number of direct callouts examined per callout_process call. " "Units = 1/1000"); static int avg_lockcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " "callout_process call. Units = 1/1000"); static int avg_mpcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, 0, "Average number of MP direct callouts made per callout_process call. " "Units = 1/1000"); #endif static int ncallout; SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0, "Number of entries in callwheel and size of timeout() preallocation"); #ifdef RSS static int pin_default_swi = 1; static int pin_pcpu_swi = 1; #else static int pin_default_swi = 0; static int pin_pcpu_swi = 0; #endif SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi, 0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)"); SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi, 0, "Pin the per-CPU swis (except PCPU 0, which is also default"); /* * TODO: * allocate more timeout table slots when table overflows. 
*/ u_int callwheelsize, callwheelmask; /* * The callout cpu exec entities represent informations necessary for * describing the state of callouts currently running on the CPU and the ones * necessary for migrating callouts to the new callout cpu. In particular, * the first entry of the array cc_exec_entity holds informations for callout * running in SWI thread context, while the second one holds informations * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ struct cc_exec { struct callout *cc_curr; void (*cc_drain)(void *); void *cc_last_func; void *cc_last_arg; #ifdef SMP void (*ce_migration_func)(void *); void *ce_migration_arg; sbintime_t ce_migration_time; sbintime_t ce_migration_prec; int ce_migration_cpu; #endif bool cc_cancel; bool cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. */ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; struct callout *cc_next; struct callout *cc_callout; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; struct callout_slist cc_callfree; sbintime_t cc_firstevent; sbintime_t cc_lastscan; void *cc_cookie; u_int cc_bucket; u_int cc_inited; char cc_ktr_event_name[20]; }; #define callout_migrating(c) ((c)->c_iflags & CALLOUT_DFRMIGRATION) #define cc_exec_curr(cc, dir) cc->cc_exec_entity[dir].cc_curr #define cc_exec_last_func(cc, dir) cc->cc_exec_entity[dir].cc_last_func #define cc_exec_last_arg(cc, dir) cc->cc_exec_entity[dir].cc_last_arg #define cc_exec_drain(cc, dir) cc->cc_exec_entity[dir].cc_drain #define cc_exec_next(cc) cc->cc_next #define cc_exec_cancel(cc, dir) cc->cc_exec_entity[dir].cc_cancel #define cc_exec_waiting(cc, dir) cc->cc_exec_entity[dir].cc_waiting #ifdef SMP #define cc_migration_func(cc, dir) cc->cc_exec_entity[dir].ce_migration_func #define cc_migration_arg(cc, dir) cc->cc_exec_entity[dir].ce_migration_arg #define cc_migration_cpu(cc, dir) cc->cc_exec_entity[dir].ce_migration_cpu #define cc_migration_time(cc, dir) cc->cc_exec_entity[dir].ce_migration_time #define cc_migration_prec(cc, dir) cc->cc_exec_entity[dir].ce_migration_prec struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) #define CC_SELF() CC_CPU(PCPU_GET(cpuid)) #else struct callout_cpu cc_cpu; #define CC_CPU(cpu) &cc_cpu #define CC_SELF() &cc_cpu #endif #define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock) #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int timeout_cpu; static void callout_cpu_init(struct callout_cpu *cc, int cpu); static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: * cc_curr - If a callout is in progress, it is cc_curr. * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after * cc_lock is successfully acquired. 
* cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when * cc_curr is non-NULL. */ /* * Resets the execution entity tied to a specific callout cpu. */ static void cc_cce_cleanup(struct callout_cpu *cc, int direct) { cc_exec_curr(cc, direct) = NULL; cc_exec_cancel(cc, direct) = false; cc_exec_waiting(cc, direct) = false; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } /* * Checks if migration is requested by a specific callout cpu. */ static int cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP return (cc_migration_cpu(cc, direct) != CPUBLOCK); #else return (0); #endif } /* * Kernel low level callwheel initialization * called on the BSP during kernel startup. */ static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; /* * Calculate the size of the callout wheel and the preallocated * timeout() structures. * XXX: Clip callout to result of previous function of maxusers * maximum 384. This is still huge, but acceptable. */ memset(CC_CPU(curcpu), 0, sizeof(cc_cpu)); ncallout = imin(16 + maxproc + maxfiles, 18508); TUNABLE_INT_FETCH("kern.ncallout", &ncallout); /* * Calculate callout wheel size, should be next power of two higher * than 'ncallout'. */ callwheelsize = 1 << fls(ncallout); callwheelmask = callwheelsize - 1; /* * Fetch whether we're pinning the swi's or not. */ TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi); TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi); /* * Only BSP handles timeout(9) and receives a preallocation. * * XXX: Once all timeout(9) consumers are converted this can * be removed. */ timeout_cpu = PCPU_GET(cpuid); cc = CC_CPU(timeout_cpu); cc->cc_callout = malloc(ncallout * sizeof(struct callout), M_CALLOUT, M_WAITOK); callout_cpu_init(cc, timeout_cpu); } SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL); /* * Initialize the per-cpu callout structures. */ static void callout_cpu_init(struct callout_cpu *cc, int cpu) { struct callout *c; int i; mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); cc->cc_inited = 1; - cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize, - M_CALLOUT, M_WAITOK); + cc->cc_callwheel = malloc_domainset(sizeof(struct callout_list) * + callwheelsize, M_CALLOUT, + DOMAINSET_PREF(pcpu_find(cpu)->pc_domain), M_WAITOK); for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; for (i = 0; i < 2; i++) cc_cce_cleanup(cc, i); snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); if (cc->cc_callout == NULL) /* Only BSP handles timeout(9) */ return; for (i = 0; i < ncallout; i++) { c = &cc->cc_callout[i]; callout_init(c, 0); c->c_iflags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } } #ifdef SMP /* * Switches the cpu tied to a specific callout. * The function expects a locked incoming callout cpu and returns with * locked outcoming callout cpu. 
*/ static struct callout_cpu * callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) { struct callout_cpu *new_cc; MPASS(c != NULL && cc != NULL); CC_LOCK_ASSERT(cc); /* * Avoid interrupts and preemption firing after the callout cpu * is blocked in order to avoid deadlocks as the new thread * may be willing to acquire the callout cpu lock. */ c->c_cpu = CPUBLOCK; spinlock_enter(); CC_UNLOCK(cc); new_cc = CC_CPU(new_cpu); CC_LOCK(new_cc); spinlock_exit(); c->c_cpu = new_cpu; return (new_cc); } #endif /* * Start standard softclock thread. */ static void start_softclock(void *dummy) { struct callout_cpu *cc; char name[MAXCOMLEN]; #ifdef SMP int cpu; struct intr_event *ie; #endif cc = CC_CPU(timeout_cpu); snprintf(name, sizeof(name), "clock (%d)", timeout_cpu); if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_default_swi && (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) { printf("%s: timeout clock couldn't be pinned to cpu %d\n", __func__, timeout_cpu); } #ifdef SMP CPU_FOREACH(cpu) { if (cpu == timeout_cpu) continue; cc = CC_CPU(cpu); cc->cc_callout = NULL; /* Only BSP handles timeout(9). */ callout_cpu_init(cc, cpu); snprintf(name, sizeof(name), "clock (%d)", cpu); ie = NULL; if (swi_add(&ie, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) { printf("%s: per-cpu clock couldn't be pinned to " "cpu %d\n", __func__, cpu); } } #endif } SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); #define CC_HASH_SHIFT 8 static inline u_int callout_hash(sbintime_t sbt) { return (sbt >> (32 - CC_HASH_SHIFT)); } static inline u_int callout_get_bucket(sbintime_t sbt) { return (callout_hash(sbt) & callwheelmask); } void callout_process(sbintime_t now) { struct callout *tmp, *tmpn; struct callout_cpu *cc; struct callout_list *sc; sbintime_t first, last, max, tmp_max; uint32_t lookahead; u_int firstb, lastb, nowb; #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); /* Compute the buckets of the last scan and present times. */ firstb = callout_hash(cc->cc_lastscan); cc->cc_lastscan = now; nowb = callout_hash(now); /* Compute the last bucket and minimum time of the bucket after it. */ if (nowb == firstb) lookahead = (SBT_1S / 16); else if (nowb - firstb == 1) lookahead = (SBT_1S / 8); else lookahead = (SBT_1S / 2); first = last = now; first += (lookahead / 2); last += lookahead; last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); lastb = callout_hash(last) - 1; max = last; /* * Check if we wrapped around the entire wheel from the last scan. * In case, we need to scan entirely the wheel for pending callouts. */ if (lastb - firstb >= callwheelsize) { lastb = firstb + callwheelsize - 1; if (nowb - firstb >= callwheelsize) nowb = lastb; } /* Iterate callwheel from firstb to nowb and then up to lastb. */ do { sc = &cc->cc_callwheel[firstb & callwheelmask]; tmp = LIST_FIRST(sc); while (tmp != NULL) { /* Run the callout if present time within allowed. */ if (tmp->c_time <= now) { /* * Consumer told us the callout may be run * directly from hardware interrupt context. 
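 * Such callouts are executed immediately from this context; all others * are queued on cc_expireq and handled later by the softclock() SWI.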
*/ if (tmp->c_iflags & CALLOUT_DIRECT) { #ifdef CALLOUT_PROFILING ++depth_dir; #endif cc_exec_next(cc) = LIST_NEXT(tmp, c_links.le); cc->cc_bucket = firstb & callwheelmask; LIST_REMOVE(tmp, c_links.le); softclock_call_cc(tmp, cc, #ifdef CALLOUT_PROFILING &mpcalls_dir, &lockcalls_dir, NULL, #endif 1); tmp = cc_exec_next(cc); cc_exec_next(cc) = NULL; } else { tmpn = LIST_NEXT(tmp, c_links.le); LIST_REMOVE(tmp, c_links.le); TAILQ_INSERT_TAIL(&cc->cc_expireq, tmp, c_links.tqe); tmp->c_iflags |= CALLOUT_PROCESSED; tmp = tmpn; } continue; } /* Skip events from distant future. */ if (tmp->c_time >= max) goto next; /* * Event minimal time is bigger than present maximal * time, so it cannot be aggregated. */ if (tmp->c_time > last) { lastb = nowb; goto next; } /* Update first and last time, respecting this event. */ if (tmp->c_time < first) first = tmp->c_time; tmp_max = tmp->c_time + tmp->c_precision; if (tmp_max < last) last = tmp_max; next: tmp = LIST_NEXT(tmp, c_links.le); } /* Proceed with the next bucket. */ firstb++; /* * Stop if we looked after present time and found * some event we can't execute at now. * Stop if we looked far enough into the future. */ } while (((int)(firstb - lastb)) <= 0); cc->cc_firstevent = last; #ifndef NO_EVENTTIMERS cpu_new_callout(curcpu, last, first); #endif #ifdef CALLOUT_PROFILING avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. */ if (!TAILQ_EMPTY(&cc->cc_expireq)) swi_sched(cc->cc_cookie, 0); } static struct callout_cpu * callout_lock(struct callout *c) { struct callout_cpu *cc; int cpu; for (;;) { cpu = c->c_cpu; #ifdef SMP if (cpu == CPUBLOCK) { while (c->c_cpu == CPUBLOCK) cpu_spinwait(); continue; } #endif cc = CC_CPU(cpu); CC_LOCK(cc); if (cpu == c->c_cpu) break; CC_UNLOCK(cc); } return (cc); } static void callout_cc_add(struct callout *c, struct callout_cpu *cc, sbintime_t sbt, sbintime_t precision, void (*func)(void *), void *arg, int cpu, int flags) { int bucket; CC_LOCK_ASSERT(cc); if (sbt < cc->cc_lastscan) sbt = cc->cc_lastscan; c->c_arg = arg; c->c_iflags |= CALLOUT_PENDING; c->c_iflags &= ~CALLOUT_PROCESSED; c->c_flags |= CALLOUT_ACTIVE; if (flags & C_DIRECT_EXEC) c->c_iflags |= CALLOUT_DIRECT; c->c_func = func; c->c_time = sbt; c->c_precision = precision; bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); if (cc->cc_bucket == bucket) cc_exec_next(cc) = c; #ifndef NO_EVENTTIMERS /* * Inform the eventtimers(4) subsystem there's a new callout * that has been inserted, but only if really required. 
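 * The precision is clamped first so that c_time + c_precision cannot * overflow SBT_MAX, and cpu_new_callout() is called only when the new * deadline falls before the currently programmed cc_firstevent.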
*/ if (SBT_MAX - c->c_time < c->c_precision) c->c_precision = SBT_MAX - c->c_time; sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; cpu_new_callout(cpu, sbt, c->c_time); } #endif } static void callout_cc_del(struct callout *c, struct callout_cpu *cc) { if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) == 0) return; c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct) { struct rm_priotracker tracker; void (*c_func)(void *); void *c_arg; struct lock_class *class; struct lock_object *c_lock; uintptr_t lock_status; int c_iflags; #ifdef SMP struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; int flags, new_cpu; sbintime_t new_prec, new_time; #endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static timeout_t *lastfunc; #endif KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING, ("softclock_call_cc: pend %p %x", c, c->c_iflags)); KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE, ("softclock_call_cc: act %p %x", c, c->c_flags)); class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL; lock_status = 0; if (c->c_flags & CALLOUT_SHAREDLOCK) { if (class == &lock_class_rm) lock_status = (uintptr_t)&tracker; else lock_status = 1; } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_iflags = c->c_iflags; if (c->c_iflags & CALLOUT_LOCAL_ALLOC) c->c_iflags = CALLOUT_LOCAL_ALLOC; else c->c_iflags &= ~CALLOUT_PENDING; cc_exec_curr(cc, direct) = c; cc_exec_last_func(cc, direct) = c_func; cc_exec_last_arg(cc, direct) = c_arg; cc_exec_cancel(cc, direct) = false; cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, lock_status); /* * The callout may have been cancelled * while we switched locks. */ if (cc_exec_cancel(cc, direct)) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. 
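 * Setting cc_cancel here closes the cancellation window: a concurrent * callout_stop() or callout_reset() that finds it already true will no * longer treat the callout as cancellable.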
*/ cc_exec_cancel(cc, direct) = true; if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; #endif CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { #ifdef CALLOUT_PROFILING (*lockcalls)++; #endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { #ifdef CALLOUT_PROFILING (*mpcalls)++; #endif CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running", "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE1(callout_execute, , , callout__start, c); c_func(c_arg); SDT_PROBE1(callout_execute, , , callout__end, c); THREAD_SLEEPING_OK(); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt2 = sbinuptime(); sbt2 -= sbt1; if (sbt2 > maxdt) { if (lastfunc != c_func || sbt2 > maxdt * 2) { ts2 = sbttots(sbt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } maxdt = sbt2; lastfunc = c_func; } #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0) class->lc_unlock(c_lock); skip: CC_LOCK(cc); KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr")); cc_exec_curr(cc, direct) = NULL; if (cc_exec_drain(cc, direct)) { void (*drain)(void *); drain = cc_exec_drain(cc, direct); cc_exec_drain(cc, direct) = NULL; CC_UNLOCK(cc); drain(c_arg); CC_LOCK(cc); } if (cc_exec_waiting(cc, direct)) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ if (cc_cce_migrating(cc, direct)) { cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not * destroyed but that is not easy. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; } cc_exec_waiting(cc, direct) = false; CC_UNLOCK(cc); wakeup(&cc_exec_waiting(cc, direct)); CC_LOCK(cc); } else if (cc_cce_migrating(cc, direct)) { KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP /* * If the callout was scheduled for * migration just perform it now. */ new_cpu = cc_migration_cpu(cc, direct); new_time = cc_migration_time(cc, direct); new_prec = cc_migration_prec(cc, direct); new_func = cc_migration_func(cc, direct); new_arg = cc_migration_arg(cc, direct); cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed * but that is not easy. * * As first thing, handle deferred callout stops. */ if (!callout_migrating(c)) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); callout_cc_del(c, cc); return; } c->c_iflags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); flags = (direct) ? C_DIRECT_EXEC : 0; callout_cc_add(c, new_cc, new_time, new_prec, new_func, new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else panic("migration should not happen"); #endif } /* * If the current callout is locally allocated (from * timeout(9)) then put it on the freelist. * * Note: we need to check the cached copy of c_iflags because * if it was not local, then it's not safe to deref the * callout pointer. 
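 * A caller-owned callout may already have been freed by its owner once * the handler returned, so only the cached flags can be trusted at this * point.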
*/ KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 || c->c_iflags == CALLOUT_LOCAL_ALLOC, ("corrupted callout")); if (c_iflags & CALLOUT_LOCAL_ALLOC) callout_cc_del(c, cc); } /* * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures * used in this implementation was published by G. Varghese and T. Lauck in * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for * the Efficient Implementation of a Timer Facility" in the Proceedings of * the 11th ACM Annual Symposium on Operating Systems Principles, * Austin, Texas Nov 1987. */ /* * Software (low priority) clock interrupt. * Run periodic events from timeout queue. */ void softclock(void *arg) { struct callout_cpu *cc; struct callout *c; #ifdef CALLOUT_PROFILING int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0; #endif cc = (struct callout_cpu *)arg; CC_LOCK(cc); while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls, &lockcalls, &gcalls, #endif 0); #ifdef CALLOUT_PROFILING ++depth; #endif } #ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; #endif CC_UNLOCK(cc); } /* * timeout -- * Execute a function after a specified length of time. * * untimeout -- * Cancel previous timeout function call. * * callout_handle_init -- * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. This * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. */ struct callout_handle timeout(timeout_t *ftn, void *arg, int to_ticks) { struct callout_cpu *cc; struct callout *new; struct callout_handle handle; cc = CC_CPU(timeout_cpu); CC_LOCK(cc); /* Fill in the next free callout structure. */ new = SLIST_FIRST(&cc->cc_callfree); if (new == NULL) /* XXX Attempt to malloc first */ panic("timeout table full"); SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle); callout_reset(new, to_ticks, ftn, arg); handle.callout = new; CC_UNLOCK(cc); return (handle); } void untimeout(timeout_t *ftn, void *arg, struct callout_handle handle) { struct callout_cpu *cc; /* * Check for a handle that was initialized * by callout_handle_init, but never used * for a real timeout. */ if (handle.callout == NULL) return; cc = callout_lock(handle.callout); if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) callout_stop(handle.callout); CC_UNLOCK(cc); } void callout_handle_init(struct callout_handle *handle) { handle->callout = NULL; } void callout_when(sbintime_t sbt, sbintime_t precision, int flags, sbintime_t *res, sbintime_t *prec_res) { sbintime_t to_sbt, to_pr; if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) { *res = sbt; *prec_res = precision; return; } if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt) sbt = tick_sbt; if ((flags & C_HARDCLOCK) != 0 || #ifdef NO_EVENTTIMERS sbt >= sbt_timethreshold) { to_sbt = getsbinuptime(); /* Add safety belt for the case of hz > 1000. 
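 * getsbinuptime() can lag by up to one timecounter tick, which is * longer than a hardclock tick when hz > 1000, so compensate for the * difference here.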
*/ to_sbt += tc_tick_sbt - tick_sbt; #else sbt >= sbt_tickthreshold) { /* * Obtain the time of the last hardclock() call on * this CPU directly from the kern_clocksource.c. * This value is per-CPU, but it is equal for all * active ones. */ #ifdef __LP64__ to_sbt = DPCPU_GET(hardclocktime); #else spinlock_enter(); to_sbt = DPCPU_GET(hardclocktime); spinlock_exit(); #endif #endif if (cold && to_sbt == 0) to_sbt = sbinuptime(); if ((flags & C_HARDCLOCK) == 0) to_sbt += tick_sbt; } else to_sbt = sbinuptime(); if (SBT_MAX - to_sbt < sbt) to_sbt = SBT_MAX; else to_sbt += sbt; *res = to_sbt; to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : sbt >> C_PRELGET(flags)); *prec_res = to_pr > precision ? to_pr : precision; } /* * New interface; clients allocate their own callout structures. * * callout_reset() - establish or change a timeout * callout_stop() - disestablish a timeout * callout_init() - initialize a callout structure so that it can * safely be passed to callout_reset() and callout_stop() * * defines three convenience macros: * * callout_active() - returns truth if callout has not been stopped, * drained, or deactivated since the last time the callout was * reset. * callout_pending() - returns truth if callout is still waiting for timeout * callout_deactivate() - marks the callout as having been serviced */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec, void (*ftn)(void *), void *arg, int cpu, int flags) { sbintime_t to_sbt, precision; struct callout_cpu *cc; int cancelled, direct; int ignore_cpu=0; cancelled = 0; if (cpu == -1) { ignore_cpu = 1; } else if ((cpu >= MAXCPU) || ((CC_CPU(cpu))->cc_inited == 0)) { /* Invalid CPU spec */ panic("Invalid CPU in callout %d", cpu); } callout_when(sbt, prec, flags, &to_sbt, &precision); /* * This flag used to be added by callout_cc_add, but the * first time you call this we could end up with the * wrong direct flag if we don't do it before we add. */ if (flags & C_DIRECT_EXEC) { direct = 1; } else { direct = 0; } KASSERT(!direct || c->c_lock == NULL, ("%s: direct callout %p has lock", __func__, c)); cc = callout_lock(c); /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced or handle the case where the user does * not care. */ if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) || ignore_cpu) { cpu = c->c_cpu; } if (cc_exec_curr(cc, direct) == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ if (c->c_lock != NULL && !cc_exec_cancel(cc, direct)) cancelled = cc_exec_cancel(cc, direct) = true; if (cc_exec_waiting(cc, direct) || cc_exec_drain(cc, direct)) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. */ CTR4(KTR_CALLOUT, "%s %p func %p arg %p", cancelled ? "cancelled" : "failed to cancel", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (cancelled); } #ifdef SMP if (callout_migrating(c)) { /* * This only occurs when a second callout_reset_sbt_on * is made after a previous one moved it into * deferred migration (below). Note we do *not* change * the prev_cpu even though the previous target may * be different. 
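 * Only the cached migration parameters are refreshed here; the actual * switch is still performed by softclock_call_cc() once the running * instance completes.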
*/ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; cancelled = 1; CC_UNLOCK(cc); return (cancelled); } #endif } if (c->c_iflags & CALLOUT_PENDING) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } cancelled = 1; c->c_iflags &= ~ CALLOUT_PENDING; c->c_flags &= ~ CALLOUT_ACTIVE; } #ifdef SMP /* * If the callout must migrate try to perform it immediately. * If the callout is currently running, just defer the migration * to a more appropriate moment. */ if (c->c_cpu != cpu) { if (cc_exec_curr(cc, direct) == c) { /* * Pending will have been removed since we are * actually executing the callout on another * CPU. That callout should be waiting on the * lock the caller holds. If we set both * active/and/pending after we return and the * lock on the executing callout proceeds, it * will then see pending is true and return. * At the return from the actual callout execution * the migration will occur in softclock_call_cc * and this new callout will be placed on the * new CPU via a call to callout_cpu_switch() which * will get the lock on the right CPU followed * by a call callout_cc_add() which will add it there. * (see above in softclock_call_cc()). */ cc_migration_cpu(cc, direct) = cpu; cc_migration_time(cc, direct) = to_sbt; cc_migration_prec(cc, direct) = precision; cc_migration_func(cc, direct) = ftn; cc_migration_arg(cc, direct) = arg; c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING); c->c_flags |= CALLOUT_ACTIVE; CTR6(KTR_CALLOUT, "migration of %p func %p arg %p in %d.%08x to %u deferred", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } cc = callout_cpu_switch(c, cc, cpu); } #endif callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); } /* * Common idioms that can be optimized in the future. */ int callout_schedule_on(struct callout *c, int to_ticks, int cpu) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu); } int callout_schedule(struct callout *c, int to_ticks) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu); } int _callout_stop_safe(struct callout *c, int flags, void (*drain)(void *)) { struct callout_cpu *cc, *old_cc; struct lock_class *class; int direct, sq_locked, use_lock; int cancelled, not_on_a_list; if ((flags & CS_DRAIN) != 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock, "calling %s", __func__); /* * Some old subsystems don't hold Giant while running a callout_stop(), * so just discard this check for the moment. 
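 * For Giant we therefore only record whether it happens to be held; for * any other lock class the caller must hold the lock and that is * asserted below.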
*/ if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) { if (c->c_lock == &Giant.lock_object) use_lock = mtx_owned(&Giant); else { use_lock = 1; class = LOCK_CLASS(c->c_lock); class->lc_assert(c->c_lock, LA_XLOCKED); } } else use_lock = 0; if (c->c_iflags & CALLOUT_DIRECT) { direct = 1; } else { direct = 0; } sq_locked = 0; old_cc = NULL; again: cc = callout_lock(c); if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) == (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) && ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) { /* * Special case where this slipped in while we * were migrating *as* the callout is about to * execute. The caller probably holds the lock * the callout wants. * * Get rid of the migration first. Then set * the flag that tells this code *not* to * try to remove it from any lists (its not * on one yet). When the callout wheel runs, * it will ignore this callout. */ c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; not_on_a_list = 1; } else { not_on_a_list = 0; } /* * If the callout was migrating while the callout cpu lock was * dropped, just drop the sleepqueue lock and check the states * again. */ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); sleepq_release(&cc_exec_waiting(old_cc, direct)); sq_locked = 0; old_cc = NULL; goto again; #else panic("migration should not happen"); #endif } /* * If the callout is running, try to stop it or drain it. */ if (cc_exec_curr(cc, direct) == c) { /* * Succeed we to stop it or not, we must clear the * active flag - this is what API users expect. If we're * draining and the callout is currently executing, first wait * until it finishes. */ if ((flags & CS_DRAIN) == 0) c->c_flags &= ~CALLOUT_ACTIVE; if ((flags & CS_DRAIN) != 0) { /* * The current callout is running (or just * about to run) and blocking is allowed, so * just wait for the current invocation to * finish. */ while (cc_exec_curr(cc, direct) == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid * a LOR between cc_lock and sleepqueue * chain spinlocks. This piece of code * emulates a msleep_spin() call actually. * * If we already have the sleepqueue chain * locked, then we can safely block. If we * don't already have it locked, however, * we have to drop the cc_lock to lock * it. This opens several races, so we * restart at the beginning once we have * both locks. If nothing has changed, then * we will end up back here with sq_locked * set. */ if (!sq_locked) { CC_UNLOCK(cc); sleepq_lock( &cc_exec_waiting(cc, direct)); sq_locked = 1; old_cc = cc; goto again; } /* * Migration could be cancelled here, but * as long as it is still not sure when it * will be packed up, just let softclock() * take care of it. */ cc_exec_waiting(cc, direct) = true; DROP_GIANT(); CC_UNLOCK(cc); sleepq_add( &cc_exec_waiting(cc, direct), &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); sleepq_wait( &cc_exec_waiting(cc, direct), 0); sq_locked = 0; old_cc = NULL; /* Reacquire locks previously released. */ PICKUP_GIANT(); CC_LOCK(cc); } c->c_flags &= ~CALLOUT_ACTIVE; } else if (use_lock && !cc_exec_cancel(cc, direct) && (drain == NULL)) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout * and return. After our caller drops the * lock, the callout will be skipped in * softclock(). This *only* works with a * callout_stop() *not* callout_drain() or * callout_async_drain(). 
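 * The drain variants must see the handler run (or be handed a drain * callback), whereas a plain stop only needs the handler to be skipped, * which the cc_cancel flag guarantees once softclock_call_cc() rechecks * it under c_lock.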
*/ cc_exec_cancel(cc, direct) = true; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); if (callout_migrating(c)) { c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif } CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); return (1); } else if (callout_migrating(c)) { /* * The callout is currently being serviced * and the "next" callout is scheduled at * its completion with a migration. We remove * the migration flag so it *won't* get rescheduled, * but we can't stop the one thats running so * we return 0. */ c->c_iflags &= ~CALLOUT_DFRMIGRATION; #ifdef SMP /* * We can't call cc_cce_cleanup here since * if we do it will remove .ce_curr and * its still running. This will prevent a * reschedule of the callout when the * execution completes. */ cc_migration_cpu(cc, direct) = CPUBLOCK; cc_migration_time(cc, direct) = 0; cc_migration_prec(cc, direct) = 0; cc_migration_func(cc, direct) = NULL; cc_migration_arg(cc, direct) = NULL; #endif CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { cc_exec_drain(cc, direct) = drain; } CC_UNLOCK(cc); return ((flags & CS_EXECUTING) != 0); } CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); if (drain) { cc_exec_drain(cc, direct) = drain; } KASSERT(!sq_locked, ("sleepqueue chain still locked")); cancelled = ((flags & CS_EXECUTING) != 0); } else cancelled = 1; if (sq_locked) sleepq_release(&cc_exec_waiting(cc, direct)); if ((c->c_iflags & CALLOUT_PENDING) == 0) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); /* * For not scheduled and not executing callout return * negative value. */ if (cc_exec_curr(cc, direct) != c) cancelled = -1; CC_UNLOCK(cc); return (cancelled); } c->c_iflags &= ~CALLOUT_PENDING; c->c_flags &= ~CALLOUT_ACTIVE; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); if (not_on_a_list == 0) { if ((c->c_iflags & CALLOUT_PROCESSED) == 0) { if (cc_exec_next(cc) == c) cc_exec_next(cc) = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); } } callout_cc_del(c, cc); CC_UNLOCK(cc); return (cancelled); } void callout_init(struct callout *c, int mpsafe) { bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; c->c_iflags = CALLOUT_RETURNUNLOCKED; } else { c->c_lock = &Giant.lock_object; c->c_iflags = 0; } c->c_cpu = timeout_cpu; } void _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, ("callout_init_lock: bad flags %d", flags)); KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class", __func__)); c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); c->c_cpu = timeout_cpu; } #ifdef APM_FIXUP_CALLTODO /* * Adjust the kernel calltodo timeout list. This routine is used after * an APM resume to recalculate the calltodo timer list values with the * number of hz's we have been sleeping. 
The next hardclock() will detect * that there are fired timers and run softclock() to execute them. * * Please note, I have not done an exhaustive analysis of what code this * might break. I am motivated to have my select()'s and alarm()'s that * have expired during suspend firing upon resume so that the applications * which set the timer can do the maintanence the timer was for as close * as possible to the originally intended time. Testing this code for a * week showed that resuming from a suspend resulted in 22 to 25 timers * firing, which seemed independent on whether the suspend was 2 hours or * 2 days. Your milage may vary. - Ken Key */ void adjust_timeout_calltodo(struct timeval *time_change) { struct callout *p; unsigned long delta_ticks; /* * How many ticks were we asleep? * (stolen from tvtohz()). */ /* Don't do anything */ if (time_change->tv_sec < 0) return; else if (time_change->tv_sec <= LONG_MAX / 1000000) delta_ticks = howmany(time_change->tv_sec * 1000000 + time_change->tv_usec, tick) + 1; else if (time_change->tv_sec <= LONG_MAX / hz) delta_ticks = time_change->tv_sec * hz + howmany(time_change->tv_usec, tick) + 1; else delta_ticks = LONG_MAX; if (delta_ticks > INT_MAX) delta_ticks = INT_MAX; /* * Now rip through the timer calltodo list looking for timers * to expire. */ /* don't collide with softclock() */ CC_LOCK(cc); for (p = calltodo.c_next; p != NULL; p = p->c_next) { p->c_time -= delta_ticks; /* Break if the timer had more time on it than delta_ticks */ if (p->c_time > 0) break; /* take back the ticks the timer didn't use (p->c_time <= 0) */ delta_ticks = -p->c_time; } CC_UNLOCK(cc); return; } #endif /* APM_FIXUP_CALLTODO */ static int flssbt(sbintime_t sbt) { sbt += (uint64_t)sbt >> 1; if (sizeof(long) >= sizeof(sbintime_t)) return (flsl(sbt)); if (sbt >= SBT_1S) return (flsl(((uint64_t)sbt) >> 32) + 32); return (flsl(sbt)); } /* * Dump immediate statistic snapshot of the scheduled callouts. */ static int sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) { struct callout *tmp; struct callout_cpu *cc; struct callout_list *sc; sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; int ct[64], cpr[64], ccpbk[32]; int error, val, i, count, tcum, pcum, maxc, c, medc; #ifdef SMP int cpu; #endif val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); count = maxc = 0; st = spr = maxt = maxpr = 0; bzero(ccpbk, sizeof(ccpbk)); bzero(ct, sizeof(ct)); bzero(cpr, sizeof(cpr)); now = sbinuptime(); #ifdef SMP CPU_FOREACH(cpu) { cc = CC_CPU(cpu); #else cc = CC_CPU(timeout_cpu); #endif CC_LOCK(cc); for (i = 0; i < callwheelsize; i++) { sc = &cc->cc_callwheel[i]; c = 0; LIST_FOREACH(tmp, sc, c_links.le) { c++; t = tmp->c_time - now; if (t < 0) t = 0; st += t / SBT_1US; spr += tmp->c_precision / SBT_1US; if (t > maxt) maxt = t; if (tmp->c_precision > maxpr) maxpr = tmp->c_precision; ct[flssbt(t)]++; cpr[flssbt(tmp->c_precision)]++; } if (c > maxc) maxc = c; ccpbk[fls(c + c / 2)]++; count += c; } CC_UNLOCK(cc); #ifdef SMP } #endif for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) tcum += ct[i]; medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) pcum += cpr[i]; medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, c = 0; i < 32 && c < count / 2; i++) c += ccpbk[i]; medc = (i >= 2) ? 
(1 << (i - 2)) : 0; printf("Scheduled callouts statistic snapshot:\n"); printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", medc, count / callwheelsize / mp_ncpus, (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, maxc); printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, (st / count) / 1000000, (st / count) % 1000000, maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, (spr / count) / 1000000, (spr / count) % 1000000, maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); printf(" Distribution: \tbuckets\t time\t tcum\t" " prec\t pcum\n"); for (i = 0, tcum = pcum = 0; i < 64; i++) { if (ct[i] == 0 && cpr[i] == 0) continue; t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0; tcum += ct[i]; pcum += cpr[i]; printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum, cpr[i], pcum); } return (error); } SYSCTL_PROC(_kern, OID_AUTO, callout_stat, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_callout_stat, "I", "Dump immediate statistic snapshot of the scheduled callouts"); #ifdef DDB static void _show_callout(struct callout *c) { db_printf("callout %p\n", c); #define C_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, c->e); db_printf(" &c_links = %p\n", &(c->c_links)); C_DB_PRINTF("%" PRId64, c_time); C_DB_PRINTF("%" PRId64, c_precision); C_DB_PRINTF("%p", c_arg); C_DB_PRINTF("%p", c_func); C_DB_PRINTF("%p", c_lock); C_DB_PRINTF("%#x", c_flags); C_DB_PRINTF("%#x", c_iflags); C_DB_PRINTF("%d", c_cpu); #undef C_DB_PRINTF } DB_SHOW_COMMAND(callout, db_show_callout) { if (!have_addr) { db_printf("usage: show callout \n"); return; } _show_callout((struct callout *)addr); } static void _show_last_callout(int cpu, int direct, const char *dirstr) { struct callout_cpu *cc; void *func, *arg; cc = CC_CPU(cpu); func = cc_exec_last_func(cc, direct); arg = cc_exec_last_arg(cc, direct); db_printf("cpu %d last%s callout function: %p ", cpu, dirstr, func); db_printsym((db_expr_t)func, DB_STGY_ANY); db_printf("\ncpu %d last%s callout argument: %p\n", cpu, dirstr, arg); } DB_SHOW_COMMAND(callout_last, db_show_callout_last) { int cpu, last; if (have_addr) { if (addr < 0 || addr > mp_maxid || CPU_ABSENT(addr)) { db_printf("no such cpu: %d\n", (int)addr); return; } cpu = last = addr; } else { cpu = 0; last = mp_maxid; } while (cpu <= last) { if (!CPU_ABSENT(cpu)) { _show_last_callout(cpu, 0, ""); _show_last_callout(cpu, 1, " direct"); } cpu++; } } #endif /* DDB */ Index: projects/clang900-import/sys/kern/subr_lock.c =================================================================== --- projects/clang900-import/sys/kern/subr_lock.c (revision 352586) +++ projects/clang900-import/sys/kern/subr_lock.c (revision 352587) @@ -1,703 +1,698 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and functions used to maintain * lock_object structures. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mprof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include SDT_PROVIDER_DEFINE(lock); SDT_PROBE_DEFINE1(lock, , , starvation, "u_int"); CTASSERT(LOCK_CLASS_MAX == 15); struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = { &lock_class_mtx_spin, &lock_class_mtx_sleep, &lock_class_sx, &lock_class_rm, &lock_class_rm_sleepable, &lock_class_rw, &lock_class_lockmgr, }; void lock_init(struct lock_object *lock, struct lock_class *class, const char *name, const char *type, int flags) { int i; /* Check for double-init and zero object. */ KASSERT(flags & LO_NEW || !lock_initialized(lock), ("lock \"%s\" %p already initialized", name, lock)); /* Look up lock class to find its index. */ for (i = 0; i < LOCK_CLASS_MAX; i++) if (lock_classes[i] == class) { lock->lo_flags = i << LO_CLASSSHIFT; break; } KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class)); /* Initialize the lock object. */ lock->lo_name = name; lock->lo_flags |= flags | LO_INITIALIZED; LOCK_LOG_INIT(lock, 0); WITNESS_INIT(lock, (type != NULL) ? 
type : name); } void lock_destroy(struct lock_object *lock) { KASSERT(lock_initialized(lock), ("lock %p is not initialized", lock)); WITNESS_DESTROY(lock); LOCK_LOG_DESTROY(lock, 0); lock->lo_flags &= ~LO_INITIALIZED; } static SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD, NULL, "lock debugging"); static SYSCTL_NODE(_debug_lock, OID_AUTO, delay, CTLFLAG_RD, NULL, "lock delay"); static u_int __read_mostly starvation_limit = 131072; SYSCTL_INT(_debug_lock_delay, OID_AUTO, starvation_limit, CTLFLAG_RW, &starvation_limit, 0, ""); static u_int __read_mostly restrict_starvation = 0; SYSCTL_INT(_debug_lock_delay, OID_AUTO, restrict_starvation, CTLFLAG_RW, &restrict_starvation, 0, ""); void lock_delay(struct lock_delay_arg *la) { struct lock_delay_config *lc = la->config; u_int i; la->delay <<= 1; if (__predict_false(la->delay > lc->max)) la->delay = lc->max; for (i = la->delay; i > 0; i--) cpu_spinwait(); la->spin_cnt += la->delay; if (__predict_false(la->spin_cnt > starvation_limit)) { SDT_PROBE1(lock, , , starvation, la->delay); if (restrict_starvation) la->delay = lc->base; } } static u_int lock_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } void lock_delay_default_init(struct lock_delay_config *lc) { lc->base = 1; lc->max = lock_roundup_2(mp_ncpus) * 256; if (lc->max > 32678) lc->max = 32678; } #ifdef DDB DB_SHOW_COMMAND(lock, db_show_lock) { struct lock_object *lock; struct lock_class *class; if (!have_addr) return; lock = (struct lock_object *)addr; if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) { db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock)); return; } class = LOCK_CLASS(lock); db_printf(" class: %s\n", class->lc_name); db_printf(" name: %s\n", lock->lo_name); class->lc_ddb_show(lock); } #endif #ifdef LOCK_PROFILING /* * One object per-thread for each lock the thread owns. Tracks individual * lock instances. */ struct lock_profile_object { LIST_ENTRY(lock_profile_object) lpo_link; struct lock_object *lpo_obj; const char *lpo_file; int lpo_line; uint16_t lpo_ref; uint16_t lpo_cnt; uint64_t lpo_acqtime; uint64_t lpo_waittime; u_int lpo_contest_locking; }; /* * One lock_prof for each (file, line, lock object) triple. */ struct lock_prof { SLIST_ENTRY(lock_prof) link; struct lock_class *class; const char *file; const char *name; int line; int ticks; uintmax_t cnt_wait_max; uintmax_t cnt_max; uintmax_t cnt_tot; uintmax_t cnt_wait; uintmax_t cnt_cur; uintmax_t cnt_contest_locking; }; SLIST_HEAD(lphead, lock_prof); #define LPROF_HASH_SIZE 4096 #define LPROF_HASH_MASK (LPROF_HASH_SIZE - 1) #define LPROF_CACHE_SIZE 4096 /* * Array of objects and profs for each type of object for each cpu. Spinlocks * are handled separately because a thread may be preempted and acquire a * spinlock while in the lock profiling code of a non-spinlock. In this way * we only need a critical section to protect the per-cpu lists. */ struct lock_prof_type { struct lphead lpt_lpalloc; struct lpohead lpt_lpoalloc; struct lphead lpt_hash[LPROF_HASH_SIZE]; struct lock_prof lpt_prof[LPROF_CACHE_SIZE]; struct lock_profile_object lpt_objs[LPROF_CACHE_SIZE]; }; struct lock_prof_cpu { struct lock_prof_type lpc_types[2]; /* One for spin one for other. 
*/ }; struct lock_prof_cpu *lp_cpu[MAXCPU]; volatile int __read_mostly lock_prof_enable; static volatile int lock_prof_resetting; #define LPROF_SBUF_SIZE 256 static int lock_prof_rejected; static int lock_prof_skipspin; static int lock_prof_skipcount; #ifndef USE_CPU_NANOSECONDS uint64_t nanoseconds(void) { struct bintime bt; uint64_t ns; binuptime(&bt); /* From bintime2timespec */ ns = bt.sec * (uint64_t)1000000000; ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32; return (ns); } #endif static void lock_prof_init_type(struct lock_prof_type *type) { int i; SLIST_INIT(&type->lpt_lpalloc); LIST_INIT(&type->lpt_lpoalloc); for (i = 0; i < LPROF_CACHE_SIZE; i++) { SLIST_INSERT_HEAD(&type->lpt_lpalloc, &type->lpt_prof[i], link); LIST_INSERT_HEAD(&type->lpt_lpoalloc, &type->lpt_objs[i], lpo_link); } } static void lock_prof_init(void *arg) { int cpu; - for (cpu = 0; cpu <= mp_maxid; cpu++) { + CPU_FOREACH(cpu) { lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF, M_WAITOK | M_ZERO); lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]); lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]); } } SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL); static void lock_prof_reset_wait(void) { /* * Spin relinquishing our cpu so that quiesce_all_cpus may * complete. */ while (lock_prof_resetting) sched_relinquish(curthread); } static void lock_prof_reset(void) { struct lock_prof_cpu *lpc; int enabled, i, cpu; /* * We not only race with acquiring and releasing locks but also * thread exit. To be certain that threads exit without valid head * pointers they must see resetting set before enabled is cleared. * Otherwise a lock may not be removed from a per-thread list due * to disabled being set but not wait for reset() to remove it below. */ atomic_store_rel_int(&lock_prof_resetting, 1); enabled = lock_prof_enable; lock_prof_enable = 0; quiesce_all_cpus("profreset", 0); /* * Some objects may have migrated between CPUs. Clear all links * before we zero the structures. Some items may still be linked * into per-thread lists as well. */ - for (cpu = 0; cpu <= mp_maxid; cpu++) { + CPU_FOREACH(cpu) { lpc = lp_cpu[cpu]; for (i = 0; i < LPROF_CACHE_SIZE; i++) { LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link); LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link); } } - for (cpu = 0; cpu <= mp_maxid; cpu++) { + CPU_FOREACH(cpu) { lpc = lp_cpu[cpu]; bzero(lpc, sizeof(*lpc)); lock_prof_init_type(&lpc->lpc_types[0]); lock_prof_init_type(&lpc->lpc_types[1]); } atomic_store_rel_int(&lock_prof_resetting, 0); lock_prof_enable = enabled; } static void lock_prof_output(struct lock_prof *lp, struct sbuf *sb) { const char *p; for (p = lp->file; p != NULL && strncmp(p, "../", 3) == 0; p += 3); sbuf_printf(sb, "%8ju %9ju %11ju %11ju %11ju %6ju %6ju %2ju %6ju %s:%d (%s:%s)\n", lp->cnt_max / 1000, lp->cnt_wait_max / 1000, lp->cnt_tot / 1000, lp->cnt_wait / 1000, lp->cnt_cur, lp->cnt_cur == 0 ? (uintmax_t)0 : lp->cnt_tot / (lp->cnt_cur * 1000), lp->cnt_cur == 0 ? 
(uintmax_t)0 : lp->cnt_wait / (lp->cnt_cur * 1000), (uintmax_t)0, lp->cnt_contest_locking, p, lp->line, lp->class->lc_name, lp->name); } static void lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, int spin, int t) { struct lock_prof_type *type; struct lock_prof *l; int cpu; dst->file = match->file; dst->line = match->line; dst->class = match->class; dst->name = match->name; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (lp_cpu[cpu] == NULL) - continue; + CPU_FOREACH(cpu) { type = &lp_cpu[cpu]->lpc_types[spin]; SLIST_FOREACH(l, &type->lpt_hash[hash], link) { if (l->ticks == t) continue; if (l->file != match->file || l->line != match->line || l->name != match->name) continue; l->ticks = t; if (l->cnt_max > dst->cnt_max) dst->cnt_max = l->cnt_max; if (l->cnt_wait_max > dst->cnt_wait_max) dst->cnt_wait_max = l->cnt_wait_max; dst->cnt_tot += l->cnt_tot; dst->cnt_wait += l->cnt_wait; dst->cnt_cur += l->cnt_cur; dst->cnt_contest_locking += l->cnt_contest_locking; } } - } static void lock_prof_type_stats(struct lock_prof_type *type, struct sbuf *sb, int spin, int t) { struct lock_prof *l; int i; for (i = 0; i < LPROF_HASH_SIZE; ++i) { SLIST_FOREACH(l, &type->lpt_hash[i], link) { struct lock_prof lp = {}; if (l->ticks == t) continue; lock_prof_sum(l, &lp, i, spin, t); lock_prof_output(&lp, sb); } } } static int dump_lock_prof_stats(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error, cpu, t; int enabled; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, LPROF_SBUF_SIZE, req); sbuf_printf(sb, "\n%8s %9s %11s %11s %11s %6s %6s %2s %6s %s\n", "max", "wait_max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name"); enabled = lock_prof_enable; lock_prof_enable = 0; quiesce_all_cpus("profstat", 0); t = ticks; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (lp_cpu[cpu] == NULL) - continue; + CPU_FOREACH(cpu) { lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t); lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t); } lock_prof_enable = enabled; error = sbuf_finish(sb); /* Output a trailing NUL. 
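 * String sysctl buffers are expected to be NUL-terminated and the * drained sbuf contents are not, so one extra byte is pushed out here.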
*/ if (error == 0) error = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (error); } static int enable_lock_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = lock_prof_enable; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == lock_prof_enable) return (0); if (v == 1) lock_prof_reset(); lock_prof_enable = !!v; return (0); } static int reset_lock_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); lock_prof_reset(); return (0); } static struct lock_prof * lock_profile_lookup(struct lock_object *lo, int spin, const char *file, int line) { const char *unknown = "(unknown)"; struct lock_prof_type *type; struct lock_prof *lp; struct lphead *head; const char *p; u_int hash; p = file; if (p == NULL || *p == '\0') p = unknown; hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line; hash &= LPROF_HASH_MASK; type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; head = &type->lpt_hash[hash]; SLIST_FOREACH(lp, head, link) { if (lp->line == line && lp->file == p && lp->name == lo->lo_name) return (lp); } lp = SLIST_FIRST(&type->lpt_lpalloc); if (lp == NULL) { lock_prof_rejected++; return (lp); } SLIST_REMOVE_HEAD(&type->lpt_lpalloc, link); lp->file = p; lp->line = line; lp->class = LOCK_CLASS(lo); lp->name = lo->lo_name; SLIST_INSERT_HEAD(&type->lpt_hash[hash], lp, link); return (lp); } static struct lock_profile_object * lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file, int line) { struct lock_profile_object *l; struct lock_prof_type *type; struct lpohead *head; head = &curthread->td_lprof[spin]; LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo && l->lpo_file == file && l->lpo_line == line) return (l); type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; l = LIST_FIRST(&type->lpt_lpoalloc); if (l == NULL) { lock_prof_rejected++; return (NULL); } LIST_REMOVE(l, lpo_link); l->lpo_obj = lo; l->lpo_file = file; l->lpo_line = line; l->lpo_cnt = 0; LIST_INSERT_HEAD(head, l, lpo_link); return (l); } void lock_profile_obtain_lock_success(struct lock_object *lo, int contested, uint64_t waittime, const char *file, int line) { static int lock_prof_count; struct lock_profile_object *l; int spin; if (SCHEDULER_STOPPED()) return; /* don't reset the timer when/if recursing */ if (!lock_prof_enable || (lo->lo_flags & LO_NOPROFILE)) return; if (lock_prof_skipcount && (++lock_prof_count % lock_prof_skipcount) != 0) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; if (spin && lock_prof_skipspin == 1) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0) goto out; l = lock_profile_object_lookup(lo, spin, file, line); if (l == NULL) goto out; l->lpo_cnt++; if (++l->lpo_ref > 1) goto out; l->lpo_contest_locking = contested; l->lpo_acqtime = nanoseconds(); if (waittime && (l->lpo_acqtime > waittime)) l->lpo_waittime = l->lpo_acqtime - waittime; else l->lpo_waittime = 0; out: critical_exit(); } void lock_profile_thread_exit(struct thread *td) { #ifdef INVARIANTS struct lock_profile_object *l; MPASS(curthread->td_critnest == 0); #endif /* * If lock profiling was disabled we have to wait for reset to * clear our pointers before we can exit safely. 
*/ lock_prof_reset_wait(); #ifdef INVARIANTS LIST_FOREACH(l, &td->td_lprof[0], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); LIST_FOREACH(l, &td->td_lprof[1], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); #endif MPASS(LIST_FIRST(&td->td_lprof[0]) == NULL); MPASS(LIST_FIRST(&td->td_lprof[1]) == NULL); } void lock_profile_release_lock(struct lock_object *lo) { struct lock_profile_object *l; struct lock_prof_type *type; struct lock_prof *lp; uint64_t curtime, holdtime; struct lpohead *head; int spin; if (SCHEDULER_STOPPED()) return; if (lo->lo_flags & LO_NOPROFILE) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; head = &curthread->td_lprof[spin]; if (LIST_FIRST(head) == NULL) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0 && lock_prof_resetting == 1) goto out; /* * If lock profiling is not enabled we still want to remove the * lpo from our queue. */ LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo) break; if (l == NULL) goto out; if (--l->lpo_ref > 0) goto out; lp = lock_profile_lookup(lo, spin, l->lpo_file, l->lpo_line); if (lp == NULL) goto release; curtime = nanoseconds(); if (curtime < l->lpo_acqtime) goto release; holdtime = curtime - l->lpo_acqtime; /* * Record if the lock has been held longer now than ever * before. */ if (holdtime > lp->cnt_max) lp->cnt_max = holdtime; if (l->lpo_waittime > lp->cnt_wait_max) lp->cnt_wait_max = l->lpo_waittime; lp->cnt_tot += holdtime; lp->cnt_wait += l->lpo_waittime; lp->cnt_contest_locking += l->lpo_contest_locking; lp->cnt_cur += l->lpo_cnt; release: LIST_REMOVE(l, lpo_link); type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link); out: critical_exit(); } static SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD, NULL, "lock profiling"); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipspin, CTLFLAG_RW, &lock_prof_skipspin, 0, "Skip profiling on spinlocks."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipcount, CTLFLAG_RW, &lock_prof_skipcount, 0, "Sample approximately every N lock acquisitions."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD, &lock_prof_rejected, 0, "Number of rejected profiling records"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, enable_lock_prof, "I", "Enable lock profiling"); #endif Index: projects/clang900-import/sys/kern/uipc_ktls.c =================================================================== --- projects/clang900-import/sys/kern/uipc_ktls.c (revision 352586) +++ projects/clang900-import/sys/kern/uipc_ktls.c (revision 352587) @@ -1,1450 +1,1450 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014-2019 Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) #include #endif #include #ifdef RSS #include #include #endif #if defined(INET) || defined(INET6) #include #include #endif #include #include #include #include #include #include struct ktls_wq { struct mtx mtx; STAILQ_HEAD(, mbuf_ext_pgs) head; bool running; } __aligned(CACHE_LINE_SIZE); static struct ktls_wq *ktls_wq; static struct proc *ktls_proc; LIST_HEAD(, ktls_crypto_backend) ktls_backends; static struct rmlock ktls_backends_lock; static uma_zone_t ktls_session_zone; static uint16_t ktls_cpuid_lookup[MAXCPU]; SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW, 0, "Kernel TLS offload"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW, 0, "Kernel TLS offload stats"); static int ktls_allow_unload; SYSCTL_INT(_kern_ipc_tls, OID_AUTO, allow_unload, CTLFLAG_RDTUN, &ktls_allow_unload, 0, "Allow software crypto modules to unload"); #ifdef RSS static int ktls_bind_threads = 1; #else static int ktls_bind_threads; #endif SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN, &ktls_bind_threads, 0, "Bind crypto threads to cores or domains at boot"); static u_int ktls_maxlen = 16384; SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RWTUN, &ktls_maxlen, 0, "Maximum TLS record size"); static int ktls_number_threads; SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD, &ktls_number_threads, 0, "Number of TLS threads in thread-pool"); static bool ktls_offload_enable; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RW, &ktls_offload_enable, 0, "Enable support for kernel TLS offload"); static bool ktls_cbc_enable = true; SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RW, &ktls_cbc_enable, 1, "Enable Support of AES-CBC crypto for kernel TLS"); static counter_u64_t ktls_tasks_active; SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD, &ktls_tasks_active, "Number of active tasks"); static counter_u64_t ktls_cnt_on; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, so_inqueue, CTLFLAG_RD, &ktls_cnt_on, "Number of TLS records in queue to tasks for SW crypto"); static counter_u64_t ktls_offload_total; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total, CTLFLAG_RD, &ktls_offload_total, "Total successful TLS setups (parameters set)"); static counter_u64_t 
ktls_offload_enable_calls; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls, CTLFLAG_RD, &ktls_offload_enable_calls, "Total number of TLS enable calls made"); static counter_u64_t ktls_offload_active; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD, &ktls_offload_active, "Total Active TLS sessions"); static counter_u64_t ktls_offload_failed_crypto; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD, &ktls_offload_failed_crypto, "Total TLS crypto failures"); static counter_u64_t ktls_switch_to_ifnet; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD, &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet"); static counter_u64_t ktls_switch_to_sw; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD, &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW"); static counter_u64_t ktls_switch_failed; SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD, &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD, 0, "Software TLS session stats"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD, 0, "Hardware (ifnet) TLS session stats"); static counter_u64_t ktls_sw_cbc; SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc, "Active number of software TLS sessions using AES-CBC"); static counter_u64_t ktls_sw_gcm; SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm, "Active number of software TLS sessions using AES-GCM"); static counter_u64_t ktls_ifnet_cbc; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD, &ktls_ifnet_cbc, "Active number of ifnet TLS sessions using AES-CBC"); static counter_u64_t ktls_ifnet_gcm; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD, &ktls_ifnet_gcm, "Active number of ifnet TLS sessions using AES-GCM"); static counter_u64_t ktls_ifnet_reset; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD, &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag"); static counter_u64_t ktls_ifnet_reset_dropped; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD, &ktls_ifnet_reset_dropped, "TLS sessions dropped after failing to update ifnet send tag"); static counter_u64_t ktls_ifnet_reset_failed; SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD, &ktls_ifnet_reset_failed, "TLS sessions that failed to allocate a new ifnet send tag"); static int ktls_ifnet_permitted; SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN, &ktls_ifnet_permitted, 1, "Whether to permit hardware (ifnet) TLS sessions"); static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS"); static void ktls_cleanup(struct ktls_session *tls); #if defined(INET) || defined(INET6) static void ktls_reset_send_tag(void *context, int pending); #endif static void ktls_work_thread(void *ctx); int ktls_crypto_backend_register(struct ktls_crypto_backend *be) { struct ktls_crypto_backend *curr_be, *tmp; if (be->api_version != KTLS_API_VERSION) { printf("KTLS: API version mismatch (%d vs %d) for %s\n", be->api_version, KTLS_API_VERSION, be->name); return (EINVAL); } rm_wlock(&ktls_backends_lock); printf("KTLS: Registering crypto method %s with prio %d\n", be->name, be->prio); if (LIST_EMPTY(&ktls_backends)) { LIST_INSERT_HEAD(&ktls_backends, be, next); } else { LIST_FOREACH_SAFE(curr_be, &ktls_backends, next, tmp) { if (curr_be->prio < be->prio) { 
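/* * Keep the backend list sorted by descending priority: the new * backend goes in front of the first existing one with a lower prio. */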
LIST_INSERT_BEFORE(curr_be, be, next); break; } if (LIST_NEXT(curr_be, next) == NULL) { LIST_INSERT_AFTER(curr_be, be, next); break; } } } rm_wunlock(&ktls_backends_lock); return (0); } int ktls_crypto_backend_deregister(struct ktls_crypto_backend *be) { struct ktls_crypto_backend *tmp; /* * Don't error if the backend isn't registered. This permits * MOD_UNLOAD handlers to use this function unconditionally. */ rm_wlock(&ktls_backends_lock); LIST_FOREACH(tmp, &ktls_backends, next) { if (tmp == be) break; } if (tmp == NULL) { rm_wunlock(&ktls_backends_lock); return (0); } if (!ktls_allow_unload) { rm_wunlock(&ktls_backends_lock); printf( "KTLS: Deregistering crypto method %s is not supported\n", be->name); return (EBUSY); } if (be->use_count) { rm_wunlock(&ktls_backends_lock); return (EBUSY); } LIST_REMOVE(be, next); rm_wunlock(&ktls_backends_lock); return (0); } #if defined(INET) || defined(INET6) static uint16_t ktls_get_cpu(struct socket *so) { struct inpcb *inp; uint16_t cpuid; inp = sotoinpcb(so); #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid != NETISR_CPUID_NONE) return (cpuid); #endif /* * Just use the flowid to shard connections in a repeatable * fashion. Note that some crypto backends rely on the * serialization provided by having the same connection use * the same queue. */ cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; return (cpuid); } #endif static void ktls_init(void *dummy __unused) { struct thread *td; struct pcpu *pc; cpuset_t mask; int error, i; ktls_tasks_active = counter_u64_alloc(M_WAITOK); ktls_cnt_on = counter_u64_alloc(M_WAITOK); ktls_offload_total = counter_u64_alloc(M_WAITOK); ktls_offload_enable_calls = counter_u64_alloc(M_WAITOK); ktls_offload_active = counter_u64_alloc(M_WAITOK); ktls_offload_failed_crypto = counter_u64_alloc(M_WAITOK); ktls_switch_to_ifnet = counter_u64_alloc(M_WAITOK); ktls_switch_to_sw = counter_u64_alloc(M_WAITOK); ktls_switch_failed = counter_u64_alloc(M_WAITOK); ktls_sw_cbc = counter_u64_alloc(M_WAITOK); ktls_sw_gcm = counter_u64_alloc(M_WAITOK); ktls_ifnet_cbc = counter_u64_alloc(M_WAITOK); ktls_ifnet_gcm = counter_u64_alloc(M_WAITOK); ktls_ifnet_reset = counter_u64_alloc(M_WAITOK); ktls_ifnet_reset_dropped = counter_u64_alloc(M_WAITOK); ktls_ifnet_reset_failed = counter_u64_alloc(M_WAITOK); rm_init(&ktls_backends_lock, "ktls backends"); LIST_INIT(&ktls_backends); ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS, M_WAITOK | M_ZERO); ktls_session_zone = uma_zcreate("ktls_session", sizeof(struct ktls_session), #ifdef INVARIANTS trash_ctor, trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, NULL, #endif UMA_ALIGN_CACHE, 0); /* * Initialize the workqueues to run the TLS work. We create a * work queue for each CPU. */ CPU_FOREACH(i) { STAILQ_INIT(&ktls_wq[i].head); mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF); error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i], - &ktls_proc, &td, 0, 0, "KTLS", "ktls_thr_%d", i); + &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i); if (error) panic("Can't add KTLS thread %d error %d", i, error); /* * Bind threads to cores. If ktls_bind_threads is > * 1, then we bind to the NUMA domain. 
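[Editor's illustrative sketch, not part of the diff: how ktls_get_cpu() above maps a connection onto one of the per-CPU work queues in the non-RSS case. The names and the fixed-size table are hypothetical; the point is that the mapping depends only on the connection's flowid, so a given connection is always serviced by the same worker thread, which is the serialization the software crypto backends rely on.]

#include <stdint.h>

#define EXAMPLE_MAXCPU          128

static uint16_t example_cpuid_lookup[EXAMPLE_MAXCPU];  /* filled at init time */
static int      example_nthreads;                      /* worker thread count, > 0 */

/* Repeatable flowid -> worker CPU mapping, in the spirit of ktls_get_cpu(). */
static uint16_t
example_pick_worker(uint32_t flowid)
{
        return (example_cpuid_lookup[flowid % example_nthreads]);
}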
*/ if (ktls_bind_threads) { if (ktls_bind_threads > 1) { pc = pcpu_find(i); CPU_COPY(&cpuset_domain[pc->pc_domain], &mask); } else { CPU_SETOF(i, &mask); } error = cpuset_setthread(td->td_tid, &mask); if (error) panic( "Unable to bind KTLS thread for CPU %d error %d", i, error); } ktls_cpuid_lookup[ktls_number_threads] = i; ktls_number_threads++; } printf("KTLS: Initialized %d threads\n", ktls_number_threads); } SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL); #if defined(INET) || defined(INET6) static int ktls_create_session(struct socket *so, struct tls_enable *en, struct ktls_session **tlsp) { struct ktls_session *tls; int error; /* Only TLS 1.0 - 1.2 are supported. */ if (en->tls_vmajor != TLS_MAJOR_VER_ONE) return (EINVAL); if (en->tls_vminor < TLS_MINOR_VER_ZERO || en->tls_vminor > TLS_MINOR_VER_TWO) return (EINVAL); if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE) return (EINVAL); if (en->iv_len < 0 || en->iv_len > TLS_MAX_PARAM_SIZE) return (EINVAL); /* All supported algorithms require a cipher key. */ if (en->cipher_key_len == 0) return (EINVAL); /* No flags are currently supported. */ if (en->flags != 0) return (EINVAL); /* Common checks for supported algorithms. */ switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: /* * auth_algorithm isn't used, but permit GMAC values * for compatibility. */ switch (en->auth_algorithm) { case 0: case CRYPTO_AES_128_NIST_GMAC: case CRYPTO_AES_192_NIST_GMAC: case CRYPTO_AES_256_NIST_GMAC: break; default: return (EINVAL); } if (en->auth_key_len != 0) return (EINVAL); if (en->iv_len != TLS_AEAD_GCM_LEN) return (EINVAL); break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: /* * TLS 1.0 requires an implicit IV. TLS 1.1+ * all use explicit IVs. */ if (en->tls_vminor == TLS_MINOR_VER_ZERO) { if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN) return (EINVAL); break; } /* FALLTHROUGH */ case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: /* Ignore any supplied IV. */ en->iv_len = 0; break; default: return (EINVAL); } if (en->auth_key_len == 0) return (EINVAL); break; default: return (EINVAL); } tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls->refcount, 1); TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls); tls->wq_index = ktls_get_cpu(so); tls->params.cipher_algorithm = en->cipher_algorithm; tls->params.auth_algorithm = en->auth_algorithm; tls->params.tls_vmajor = en->tls_vmajor; tls->params.tls_vminor = en->tls_vminor; tls->params.flags = en->flags; tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen); /* Set the header and trailer lengths. */ tls->params.tls_hlen = sizeof(struct tls_record_layer); switch (en->cipher_algorithm) { case CRYPTO_AES_NIST_GCM_16: tls->params.tls_hlen += 8; tls->params.tls_tlen = AES_GMAC_HASH_LEN; tls->params.tls_bs = 1; break; case CRYPTO_AES_CBC: switch (en->auth_algorithm) { case CRYPTO_SHA1_HMAC: if (en->tls_vminor == TLS_MINOR_VER_ZERO) { /* Implicit IV, no nonce. 
*/ } else { tls->params.tls_hlen += AES_BLOCK_LEN; } tls->params.tls_tlen = AES_BLOCK_LEN + SHA1_HASH_LEN; break; case CRYPTO_SHA2_256_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_256_HASH_LEN; break; case CRYPTO_SHA2_384_HMAC: tls->params.tls_hlen += AES_BLOCK_LEN; tls->params.tls_tlen = AES_BLOCK_LEN + SHA2_384_HASH_LEN; break; default: panic("invalid hmac"); } tls->params.tls_bs = AES_BLOCK_LEN; break; default: panic("invalid cipher"); } KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN, ("TLS header length too long: %d", tls->params.tls_hlen)); KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN, ("TLS trailer length too long: %d", tls->params.tls_tlen)); if (en->auth_key_len != 0) { tls->params.auth_key_len = en->auth_key_len; tls->params.auth_key = malloc(en->auth_key_len, M_KTLS, M_WAITOK); error = copyin(en->auth_key, tls->params.auth_key, en->auth_key_len); if (error) goto out; } tls->params.cipher_key_len = en->cipher_key_len; tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK); error = copyin(en->cipher_key, tls->params.cipher_key, en->cipher_key_len); if (error) goto out; /* * This holds the implicit portion of the nonce for GCM and * the initial implicit IV for TLS 1.0. The explicit portions * of the IV are generated in ktls_frame() and ktls_seq(). */ if (en->iv_len != 0) { MPASS(en->iv_len <= sizeof(tls->params.iv)); tls->params.iv_len = en->iv_len; error = copyin(en->iv, tls->params.iv, en->iv_len); if (error) goto out; } *tlsp = tls; return (0); out: ktls_cleanup(tls); return (error); } static struct ktls_session * ktls_clone_session(struct ktls_session *tls) { struct ktls_session *tls_new; tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO); counter_u64_add(ktls_offload_active, 1); refcount_init(&tls_new->refcount, 1); /* Copy fields from existing session. */ tls_new->params = tls->params; tls_new->wq_index = tls->wq_index; /* Deep copy keys. 
*/ if (tls_new->params.auth_key != NULL) { tls_new->params.auth_key = malloc(tls->params.auth_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.auth_key, tls->params.auth_key, tls->params.auth_key_len); } tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS, M_WAITOK); memcpy(tls_new->params.cipher_key, tls->params.cipher_key, tls->params.cipher_key_len); return (tls_new); } #endif static void ktls_cleanup(struct ktls_session *tls) { counter_u64_add(ktls_offload_active, -1); if (tls->free != NULL) { MPASS(tls->be != NULL); switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, -1); break; } tls->free(tls); } else if (tls->snd_tag != NULL) { switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, -1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, -1); break; } m_snd_tag_rele(tls->snd_tag); } if (tls->params.auth_key != NULL) { explicit_bzero(tls->params.auth_key, tls->params.auth_key_len); free(tls->params.auth_key, M_KTLS); tls->params.auth_key = NULL; tls->params.auth_key_len = 0; } if (tls->params.cipher_key != NULL) { explicit_bzero(tls->params.cipher_key, tls->params.cipher_key_len); free(tls->params.cipher_key, M_KTLS); tls->params.cipher_key = NULL; tls->params.cipher_key_len = 0; } explicit_bzero(tls->params.iv, sizeof(tls->params.iv)); } #if defined(INET) || defined(INET6) /* * Common code used when first enabling ifnet TLS on a connection or * when allocating a new ifnet TLS session due to a routing change. * This function allocates a new TLS send tag on whatever interface * the connection is currently routed over. */ static int ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force, struct m_snd_tag **mstp) { union if_snd_tag_alloc_params params; struct ifnet *ifp; struct rtentry *rt; struct tcpcb *tp; int error; INP_RLOCK(inp); if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); return (ECONNRESET); } if (inp->inp_socket == NULL) { INP_RUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); /* * Check administrative controls on ifnet TLS to determine if * ifnet TLS should be denied. * * - Always permit 'force' requests. * - ktls_ifnet_permitted == 0: always deny. */ if (!force && ktls_ifnet_permitted == 0) { INP_RUNLOCK(inp); return (ENXIO); } /* * XXX: Use the cached route in the inpcb to find the * interface. This should perhaps instead use * rtalloc1_fib(dst, 0, 0, fibnum). Since KTLS is only * enabled after a connection has completed key negotiation in * userland, the cached route will be present in practice. 
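[Editor's illustrative sketch, not part of the diff: the key-release idiom used by ktls_cleanup() above, shown stand-alone with plain malloc/free. explicit_bzero() is used instead of memset() so the scrub of key material cannot be optimized away; on FreeBSD it is declared in strings.h. The struct and helper names are hypothetical.]

#include <stdlib.h>
#include <strings.h>            /* explicit_bzero() on FreeBSD */

struct example_keys {
        void    *cipher_key;
        size_t   cipher_key_len;
};

static void
example_scrub_keys(struct example_keys *k)
{
        if (k->cipher_key != NULL) {
                /* Zero the secret before returning the memory. */
                explicit_bzero(k->cipher_key, k->cipher_key_len);
                free(k->cipher_key);
                k->cipher_key = NULL;
                k->cipher_key_len = 0;
        }
}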
*/ rt = inp->inp_route.ro_rt; if (rt == NULL || rt->rt_ifp == NULL) { INP_RUNLOCK(inp); return (ENXIO); } ifp = rt->rt_ifp; if_ref(ifp); params.hdr.type = IF_SND_TAG_TYPE_TLS; params.hdr.flowid = inp->inp_flowid; params.hdr.flowtype = inp->inp_flowtype; params.tls.inp = inp; params.tls.tls = tls; INP_RUNLOCK(inp); if (ifp->if_snd_tag_alloc == NULL) { error = EOPNOTSUPP; goto out; } if ((ifp->if_capenable & IFCAP_NOMAP) == 0) { error = EOPNOTSUPP; goto out; } if (inp->inp_vflag & INP_IPV6) { if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) { error = EOPNOTSUPP; goto out; } } else { if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) { error = EOPNOTSUPP; goto out; } } error = ifp->if_snd_tag_alloc(ifp, ¶ms, mstp); out: if_rele(ifp); return (error); } static int ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force) { struct m_snd_tag *mst; int error; error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst); if (error == 0) { tls->snd_tag = mst; switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_ifnet_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_ifnet_gcm, 1); break; } } return (error); } static int ktls_try_sw(struct socket *so, struct ktls_session *tls) { struct rm_priotracker prio; struct ktls_crypto_backend *be; /* * Choose the best software crypto backend. Backends are * stored in sorted priority order (larget value == most * important at the head of the list), so this just stops on * the first backend that claims the session by returning * success. */ if (ktls_allow_unload) rm_rlock(&ktls_backends_lock, &prio); LIST_FOREACH(be, &ktls_backends, next) { if (be->try(so, tls) == 0) break; KASSERT(tls->cipher == NULL, ("ktls backend leaked a cipher pointer")); } if (be != NULL) { if (ktls_allow_unload) be->use_count++; tls->be = be; } if (ktls_allow_unload) rm_runlock(&ktls_backends_lock, &prio); if (be == NULL) return (EOPNOTSUPP); switch (tls->params.cipher_algorithm) { case CRYPTO_AES_CBC: counter_u64_add(ktls_sw_cbc, 1); break; case CRYPTO_AES_NIST_GCM_16: counter_u64_add(ktls_sw_gcm, 1); break; } return (0); } int ktls_enable_tx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; int error; if (!ktls_offload_enable) return (ENOTSUP); counter_u64_add(ktls_offload_enable_calls, 1); /* * This should always be true since only the TCP socket option * invokes this function. */ if (so->so_proto->pr_protocol != IPPROTO_TCP) return (EINVAL); /* * XXX: Don't overwrite existing sessions. We should permit * this to support rekeying in the future. */ if (so->so_snd.sb_tls_info != NULL) return (EALREADY); if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable) return (ENOTSUP); /* TLS requires ext pgs */ if (mb_use_ext_pgs == 0) return (ENXIO); error = ktls_create_session(so, en, &tls); if (error) return (error); /* Prefer ifnet TLS over software TLS. 
*/ error = ktls_try_ifnet(so, tls, false); if (error) error = ktls_try_sw(so, tls); if (error) { ktls_cleanup(tls); return (error); } error = sblock(&so->so_snd, SBL_WAIT); if (error) { ktls_cleanup(tls); return (error); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_info = tls; if (tls->sw_encrypt == NULL) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); sbunlock(&so->so_snd); counter_u64_add(ktls_offload_total, 1); return (0); } int ktls_get_tx_mode(struct socket *so) { struct ktls_session *tls; struct inpcb *inp; int mode; inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) mode = TCP_TLS_MODE_NONE; else if (tls->sw_encrypt != NULL) mode = TCP_TLS_MODE_SW; else mode = TCP_TLS_MODE_IFNET; SOCKBUF_UNLOCK(&so->so_snd); return (mode); } /* * Switch between SW and ifnet TLS sessions as requested. */ int ktls_set_tx_mode(struct socket *so, int mode) { struct ktls_session *tls, *tls_new; struct inpcb *inp; int error; MPASS(mode == TCP_TLS_MODE_SW || mode == TCP_TLS_MODE_IFNET); inp = so->so_pcb; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } if ((tls->sw_encrypt != NULL && mode == TCP_TLS_MODE_SW) || (tls->sw_encrypt == NULL && mode == TCP_TLS_MODE_IFNET)) { SOCKBUF_UNLOCK(&so->so_snd); return (0); } tls = ktls_hold(tls); SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); tls_new = ktls_clone_session(tls); if (mode == TCP_TLS_MODE_IFNET) error = ktls_try_ifnet(so, tls_new, true); else error = ktls_try_sw(so, tls_new); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } error = sblock(&so->so_snd, SBL_WAIT); if (error) { counter_u64_add(ktls_switch_failed, 1); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (error); } /* * If we raced with another session change, keep the existing * session. */ if (tls != so->so_snd.sb_tls_info) { counter_u64_add(ktls_switch_failed, 1); sbunlock(&so->so_snd); ktls_free(tls_new); ktls_free(tls); INP_WLOCK(inp); return (EBUSY); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_info = tls_new; if (tls_new->sw_encrypt == NULL) so->so_snd.sb_flags |= SB_TLS_IFNET; SOCKBUF_UNLOCK(&so->so_snd); sbunlock(&so->so_snd); /* * Drop two references on 'tls'. The first is for the * ktls_hold() above. The second drops the reference from the * socket buffer. */ KASSERT(tls->refcount >= 2, ("too few references on old session")); ktls_free(tls); ktls_free(tls); if (mode == TCP_TLS_MODE_IFNET) counter_u64_add(ktls_switch_to_ifnet, 1); else counter_u64_add(ktls_switch_to_sw, 1); INP_WLOCK(inp); return (0); } /* * Try to allocate a new TLS send tag. This task is scheduled when * ip_output detects a route change while trying to transmit a packet * holding a TLS record. If a new tag is allocated, replace the tag * in the TLS session. Subsequent packets on the connection will use * the new tag. If a new tag cannot be allocated, drop the * connection. */ static void ktls_reset_send_tag(void *context, int pending) { struct epoch_tracker et; struct ktls_session *tls; struct m_snd_tag *old, *new; struct inpcb *inp; struct tcpcb *tp; int error; MPASS(pending == 1); tls = context; inp = tls->inp; /* * Free the old tag first before allocating a new one. * ip[6]_output_send() will treat a NULL send tag the same as * an ifp mismatch and drop packets until a new tag is * allocated. 
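[Editor's illustrative sketch, not part of the diff: the lost-race check at the heart of ktls_set_tx_mode() above, reduced to its essentials. The socket buffer lock is dropped while the replacement session is built, so before installing it the old pointer must be re-checked under the lock; if another switch won the race, this one backs out. All names are hypothetical and locking appears only as comments.]

#include <errno.h>

struct example_session { int refs; };
struct example_sb { struct example_session *tls_info; };

/* Returns 0 on success, EBUSY if another session switch won the race. */
static int
example_install(struct example_sb *sb, struct example_session *oldtls,
    struct example_session *newtls)
{
        /* sb lock would be taken here */
        if (sb->tls_info != oldtls) {
                /* lost the race: unlock; caller frees 'newtls' and its hold on 'oldtls' */
                return (EBUSY);
        }
        sb->tls_info = newtls;
        /* unlock; caller now drops both references on 'oldtls' */
        return (0);
}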
* * Write-lock the INP when changing tls->snd_tag since * ip[6]_output_send() holds a read-lock when reading the * pointer. */ INP_WLOCK(inp); old = tls->snd_tag; tls->snd_tag = NULL; INP_WUNLOCK(inp); if (old != NULL) m_snd_tag_rele(old); error = ktls_alloc_snd_tag(inp, tls, true, &new); if (error == 0) { INP_WLOCK(inp); tls->snd_tag = new; mtx_pool_lock(mtxpool_sleep, tls); tls->reset_pending = false; mtx_pool_unlock(mtxpool_sleep, tls); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset, 1); /* * XXX: Should we kick tcp_output explicitly now that * the send tag is fixed or just rely on timers? */ } else { INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); if (!in_pcbrele_wlocked(inp)) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset_dropped, 1); } else INP_WUNLOCK(inp); } INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); counter_u64_add(ktls_ifnet_reset_failed, 1); /* * Leave reset_pending true to avoid future tasks while * the socket goes away. */ } ktls_free(tls); } int ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls) { if (inp == NULL) return (ENOBUFS); INP_LOCK_ASSERT(inp); /* * See if we should schedule a task to update the send tag for * this session. */ mtx_pool_lock(mtxpool_sleep, tls); if (!tls->reset_pending) { (void) ktls_hold(tls); in_pcbref(inp); tls->inp = inp; tls->reset_pending = true; taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task); } mtx_pool_unlock(mtxpool_sleep, tls); return (ENOBUFS); } #endif void ktls_destroy(struct ktls_session *tls) { struct rm_priotracker prio; ktls_cleanup(tls); if (tls->be != NULL && ktls_allow_unload) { rm_rlock(&ktls_backends_lock, &prio); tls->be->use_count--; rm_runlock(&ktls_backends_lock, &prio); } uma_zfree(ktls_session_zone, tls); } void ktls_seq(struct sockbuf *sb, struct mbuf *m) { struct mbuf_ext_pgs *pgs; struct tls_record_layer *tlshdr; uint64_t seqno; for (; m != NULL; m = m->m_next) { KASSERT((m->m_flags & M_NOMAP) != 0, ("ktls_seq: mapped mbuf %p", m)); pgs = m->m_ext.ext_pgs; pgs->seqno = sb->sb_tls_seqno; /* * Store the sequence number in the TLS header as the * explicit part of the IV for GCM. */ if (pgs->tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16) { tlshdr = (void *)pgs->hdr; seqno = htobe64(pgs->seqno); memcpy(tlshdr + 1, &seqno, sizeof(seqno)); } sb->sb_tls_seqno++; } } /* * Add TLS framing (headers and trailers) to a chain of mbufs. Each * mbuf in the chain must be an unmapped mbuf. The payload of the * mbuf must be populated with the payload of each TLS record. * * The record_type argument specifies the TLS record type used when * populating the TLS header. * * The enq_count argument on return is set to the number of pages of * payload data for this entire chain that need to be encrypted via SW * encryption. The returned value should be passed to ktls_enqueue * when scheduling encryption of this chain of mbufs. */ int ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt, uint8_t record_type) { struct tls_record_layer *tlshdr; struct mbuf *m; struct mbuf_ext_pgs *pgs; uint16_t tls_len; int maxlen; maxlen = tls->params.max_frame_len; *enq_cnt = 0; for (m = top; m != NULL; m = m->m_next) { /* * All mbufs in the chain should be non-empty TLS * records whose payload does not exceed the maximum * frame length. 
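[Editor's illustrative sketch, not part of the diff: what ktls_seq() above does for AES-GCM records. The 64-bit record sequence number, in network byte order, is written immediately after the fixed 5-byte TLS record header (type, version, length), which is where TLS 1.2 AES-GCM carries the explicit part of the nonce. Buffer layout and names are hypothetical.]

#include <stdint.h>
#include <string.h>
#include <sys/endian.h>         /* htobe64() on FreeBSD */

#define EXAMPLE_TLS_HDR_LEN     5       /* type + version + length */

/* Store the record sequence number as the explicit GCM nonce. */
static void
example_set_gcm_nonce(uint8_t *record, uint64_t seqno)
{
        uint64_t be = htobe64(seqno);

        memcpy(record + EXAMPLE_TLS_HDR_LEN, &be, sizeof(be));
}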
*/ if (m->m_len > maxlen || m->m_len == 0) return (EINVAL); tls_len = m->m_len; /* * TLS frames require unmapped mbufs to store session * info. */ KASSERT((m->m_flags & M_NOMAP) != 0, ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top)); pgs = m->m_ext.ext_pgs; /* Save a reference to the session. */ pgs->tls = ktls_hold(tls); pgs->hdr_len = tls->params.tls_hlen; pgs->trail_len = tls->params.tls_tlen; if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) { int bs, delta; /* * AES-CBC pads messages to a multiple of the * block size. Note that the padding is * applied after the digest and the encryption * is done on the "plaintext || mac || padding". * At least one byte of padding is always * present. * * Compute the final trailer length assuming * at most one block of padding. * tls->params.sb_tls_tlen is the maximum * possible trailer length (padding + digest). * delta holds the number of excess padding * bytes if the maximum were used. Those * extra bytes are removed. */ bs = tls->params.tls_bs; delta = (tls_len + tls->params.tls_tlen) & (bs - 1); pgs->trail_len -= delta; } m->m_len += pgs->hdr_len + pgs->trail_len; /* Populate the TLS header. */ tlshdr = (void *)pgs->hdr; tlshdr->tls_vmajor = tls->params.tls_vmajor; tlshdr->tls_vminor = tls->params.tls_vminor; tlshdr->tls_type = record_type; tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr)); /* * For GCM, the sequence number is stored in the * header by ktls_seq(). For CBC, a random nonce is * inserted for TLS 1.1+. */ if (tls->params.cipher_algorithm == CRYPTO_AES_CBC && tls->params.tls_vminor >= TLS_MINOR_VER_ONE) arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0); /* * When using SW encryption, mark the mbuf not ready. * It will be marked ready via sbready() after the * record has been encrypted. * * When using ifnet TLS, unencrypted TLS records are * sent down the stack to the NIC. */ if (tls->sw_encrypt != NULL) { m->m_flags |= M_NOTREADY; pgs->nrdy = pgs->npgs; *enq_cnt += pgs->npgs; } } return (0); } void ktls_enqueue_to_free(struct mbuf_ext_pgs *pgs) { struct ktls_wq *wq; bool running; /* Mark it for freeing. */ pgs->mbuf = NULL; wq = &ktls_wq[pgs->tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->head, pgs, stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); } void ktls_enqueue(struct mbuf *m, struct socket *so, int page_count) { struct mbuf_ext_pgs *pgs; struct ktls_wq *wq; bool running; KASSERT(((m->m_flags & (M_NOMAP | M_NOTREADY)) == (M_NOMAP | M_NOTREADY)), ("ktls_enqueue: %p not unready & nomap mbuf\n", m)); KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count")); pgs = m->m_ext.ext_pgs; KASSERT(pgs->tls->sw_encrypt != NULL, ("ifnet TLS mbuf")); pgs->enc_cnt = page_count; pgs->mbuf = m; /* * Save a pointer to the socket. The caller is responsible * for taking an additional reference via soref(). 
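[Editor's worked example, not part of the diff: the AES-CBC trailer computation in ktls_frame() above, assuming HMAC-SHA1 (20-byte digest) and a 16-byte AES block. tls_tlen starts at the maximum possible trailer, one full block of padding plus the digest, and delta removes the excess padding bytes. Names are hypothetical.]

#include <assert.h>

#define EXAMPLE_AES_BLOCK_LEN   16
#define EXAMPLE_SHA1_HASH_LEN   20
/* Maximum possible trailer: one full block of padding plus the digest. */
#define EXAMPLE_MAX_TRAILER     (EXAMPLE_AES_BLOCK_LEN + EXAMPLE_SHA1_HASH_LEN)

static int
example_cbc_trailer_len(int payload_len)
{
        int delta;

        /* Excess padding bytes if the full padding block were used. */
        delta = (payload_len + EXAMPLE_MAX_TRAILER) &
            (EXAMPLE_AES_BLOCK_LEN - 1);
        return (EXAMPLE_MAX_TRAILER - delta);
}

static void
example_check(void)
{
        /* 100 + 20 = 120 bytes of plaintext||mac, padded to 128: 8 pad bytes. */
        assert(example_cbc_trailer_len(100) == 28);
        /* 108 + 20 = 128 is already a block multiple: a full 16-byte pad block. */
        assert(example_cbc_trailer_len(108) == 36);
}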
*/ pgs->so = so; wq = &ktls_wq[pgs->tls->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->head, pgs, stailq); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); counter_u64_add(ktls_cnt_on, 1); } static __noinline void ktls_encrypt(struct mbuf_ext_pgs *pgs) { struct ktls_session *tls; struct socket *so; struct mbuf *m, *top; vm_paddr_t parray[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; struct iovec src_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; struct iovec dst_iov[1 + btoc(TLS_MAX_MSG_SIZE_V10_2)]; vm_page_t pg; int error, i, len, npages, off, total_pages; bool is_anon; so = pgs->so; tls = pgs->tls; top = pgs->mbuf; KASSERT(tls != NULL, ("tls = NULL, top = %p, pgs = %p\n", top, pgs)); KASSERT(so != NULL, ("so = NULL, top = %p, pgs = %p\n", top, pgs)); #ifdef INVARIANTS pgs->so = NULL; pgs->mbuf = NULL; #endif total_pages = pgs->enc_cnt; npages = 0; /* * Encrypt the TLS records in the chain of mbufs starting with * 'top'. 'total_pages' gives us a total count of pages and is * used to know when we have finished encrypting the TLS * records originally queued with 'top'. * * NB: These mbufs are queued in the socket buffer and * 'm_next' is traversing the mbufs in the socket buffer. The * socket buffer lock is not held while traversing this chain. * Since the mbufs are all marked M_NOTREADY their 'm_next' * pointers should be stable. However, the 'm_next' of the * last mbuf encrypted is not necessarily NULL. It can point * to other mbufs appended while 'top' was on the TLS work * queue. * * Each mbuf holds an entire TLS record. */ error = 0; for (m = top; npages != total_pages; m = m->m_next) { pgs = m->m_ext.ext_pgs; KASSERT(pgs->tls == tls, ("different TLS sessions in a single mbuf chain: %p vs %p", tls, pgs->tls)); KASSERT((m->m_flags & (M_NOMAP | M_NOTREADY)) == (M_NOMAP | M_NOTREADY), ("%p not unready & nomap mbuf (top = %p)\n", m, top)); KASSERT(npages + pgs->npgs <= total_pages, ("page count mismatch: top %p, total_pages %d, m %p", top, total_pages, m)); /* * Generate source and destination ivoecs to pass to * the SW encryption backend. For writable mbufs, the * destination iovec is a copy of the source and * encryption is done in place. For file-backed mbufs * (from sendfile), anonymous wired pages are * allocated and assigned to the destination iovec. */ is_anon = M_WRITABLE(m); off = pgs->first_pg_off; for (i = 0; i < pgs->npgs; i++, off = 0) { len = mbuf_ext_pg_len(pgs, i, off); src_iov[i].iov_len = len; src_iov[i].iov_base = (char *)(void *)PHYS_TO_DMAP(pgs->pa[i]) + off; if (is_anon) { dst_iov[i].iov_base = src_iov[i].iov_base; dst_iov[i].iov_len = src_iov[i].iov_len; continue; } retry_page: pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED); if (pg == NULL) { vm_wait(NULL); goto retry_page; } parray[i] = VM_PAGE_TO_PHYS(pg); dst_iov[i].iov_base = (char *)(void *)PHYS_TO_DMAP(parray[i]) + off; dst_iov[i].iov_len = len; } npages += i; error = (*tls->sw_encrypt)(tls, (const struct tls_record_layer *)pgs->hdr, pgs->trail, src_iov, dst_iov, i, pgs->seqno); if (error) { counter_u64_add(ktls_offload_failed_crypto, 1); break; } /* * For file-backed mbufs, release the file-backed * pages and replace them in the ext_pgs array with * the anonymous wired pages allocated above. */ if (!is_anon) { /* Free the old pages. */ m->m_ext.ext_free(m); /* Replace them with the new pages. */ for (i = 0; i < pgs->npgs; i++) pgs->pa[i] = parray[i]; /* Use the basic free routine. 
*/ m->m_ext.ext_free = mb_free_mext_pgs; } /* * Drop a reference to the session now that it is no * longer needed. Existing code depends on encrypted * records having no associated session vs * yet-to-be-encrypted records having an associated * session. */ pgs->tls = NULL; ktls_free(tls); } CURVNET_SET(so->so_vnet); if (error == 0) { (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages); } else { so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; mb_free_notready(top, total_pages); } SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } static void ktls_work_thread(void *ctx) { struct ktls_wq *wq = ctx; struct mbuf_ext_pgs *p, *n; struct ktls_session *tls; STAILQ_HEAD(, mbuf_ext_pgs) local_head; #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) fpu_kern_thread(0); #endif for (;;) { mtx_lock(&wq->mtx); while (STAILQ_EMPTY(&wq->head)) { wq->running = false; mtx_sleep(wq, &wq->mtx, 0, "-", 0); wq->running = true; } STAILQ_INIT(&local_head); STAILQ_CONCAT(&local_head, &wq->head); mtx_unlock(&wq->mtx); STAILQ_FOREACH_SAFE(p, &local_head, stailq, n) { if (p->mbuf != NULL) { ktls_encrypt(p); counter_u64_add(ktls_cnt_on, -1); } else { tls = p->tls; ktls_free(tls); uma_zfree(zone_extpgs, p); } } } } Index: projects/clang900-import/sys/kern/vfs_mount.c =================================================================== --- projects/clang900-import/sys/kern/vfs_mount.c (revision 352586) +++ projects/clang900-import/sys/kern/vfs_mount.c (revision 352587) @@ -1,2282 +1,2282 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, uint64_t fsflags, struct vfsoptlist **optlist); static void free_mntarg(struct mntarg *ma); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); static bool default_autoro = false; SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, "Retry failed r/w mount as r/o if no explicit ro/rw option is specified"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); EVENTHANDLER_LIST_DEFINE(vfs_mounted); EVENTHANDLER_LIST_DEFINE(vfs_unmounted); /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "ro", "rw", "nosuid", "noexec", NULL }; static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_ref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_lockref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_writeopcount_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); mp->mnt_ref = 0; mp->mnt_vfs_ops = 1; return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; uma_zfree_pcpu(pcpu_zone_int, mp->mnt_writeopcount_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_lockref_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_ref_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_thread_in_ops_pcpu); lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); mtx_destroy(&mp->mnt_mtx); } static void vfs_mount_init(void *dummy __unused) { mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, - NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); } SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); free(opt, M_MOUNT); } /* Release all resources related to the mount options. 
*/ void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; if (opts == NULL) return; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } static int vfs_isopt_ro(const char *opt) { if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || strcmp(opt, "norw") == 0) return (1); return (0); } static int vfs_isopt_rw(const char *opt) { if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) return (1); return (0); } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { char *p; /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); while ((p = strchr(opt1, '.')) != NULL && !strncmp(opt1, opt2, ++p - opt1)) { opt2 += p - opt1; opt1 = p; /* "foo.noopt" vs. "foo.opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "foo.opt" vs. "foo.noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); } /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurrence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused, namelen, optlen; unsigned int i, iovcnt; int error; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; opt->pos = i / 2; opt->seen = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. 
*/ if (namelen == 0 || opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. * * XXX: This function will keep a "nofoo" option in the new * options. E.g, if the option's canonical name is "foo", * "nofoo" ends up in the mount point's active options. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) { struct vfsopt *opt, *new; TAILQ_FOREACH(opt, oldopts, link) { new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = strdup(opt->name, M_MOUNT); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else new->value = NULL; new->len = opt->len; new->seen = opt->seen; TAILQ_INSERT_HEAD(toopts, new, link); } vfs_sanitizeopts(toopts); } /* * Mount a filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct nmount_args { struct iovec *iovp; unsigned int iovcnt; int flags; }; #endif int sys_nmount(struct thread *td, struct nmount_args *uap) { struct uio *auio; int error; u_int iovcnt; uint64_t flags; /* * Mount flags are now 64-bits. On 32-bit archtectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, uap->iovp, uap->iovcnt, flags); /* * Filter out MNT_ROOTFS. We do not want clients of nmount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) { CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, uap->iovcnt); return (EINVAL); } error = copyinuio(uap->iovp, iovcnt, &auio); if (error) { CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", __func__, error); return (error); } error = vfs_donmount(td, flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ void vfs_ref(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { vfs_mp_count_add_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } /* * Allocate and initialize the mount point struct. 
*/ struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct ucred *cred) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; TAILQ_INIT(&mp->mnt_activevnodelist); mp->mnt_activevnodelistsize = 0; TAILQ_INIT(&mp->mnt_tmpfreevnodelist); mp->mnt_tmpfreevnodelistsize = 0; if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 || mp->mnt_writeopcount != 0) panic("%s: non-zero counters on new mp %p\n", __func__, mp); if (mp->mnt_vfs_ops != 1) panic("%s: vfs_ops should be 1 but %d found\n", __func__, mp->mnt_vfs_ops); (void) vfs_busy(mp, MBF_NOWAIT); atomic_add_acq_int(&vfsp->vfc_refcount, 1); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(cred); mp->mnt_stat.f_owner = cred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_mount_init(mp); mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); TAILQ_INIT(&mp->mnt_uppers); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ void vfs_mount_destroy(struct mount *mp) { if (mp->mnt_vfs_ops == 0) panic("%s: entered with zero vfs_ops\n", __func__); vfs_assert_mount_counters(mp); MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_REFEXPIRE; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } while (mp->mnt_ref) msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); KASSERT(mp->mnt_ref == 0, ("%s: invalid refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_writeopcount != 0) panic("vfs_mount_destroy: nonzero writeopcount"); if (mp->mnt_secondary_writes != 0) panic("vfs_mount_destroy: nonzero secondary_writes"); atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vn_printf(vp, "dangling vnode "); panic("unmount: dangling vnode"); } KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); if (mp->mnt_activevnodelistsize != 0) panic("vfs_mount_destroy: nonzero activevnodelistsize"); if (mp->mnt_tmpfreevnodelistsize != 0) panic("vfs_mount_destroy: nonzero tmpfreevnodelistsize"); if (mp->mnt_lockref != 0) panic("vfs_mount_destroy: nonzero lock refcount"); MNT_IUNLOCK(mp); if (mp->mnt_vfs_ops != 1) panic("%s: vfs_ops should be 1 but %d found\n", __func__, mp->mnt_vfs_ops); if (mp->mnt_vnodecovered != NULL) vrele(mp->mnt_vnodecovered); #ifdef MAC mac_mount_destroy(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } static bool vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error) { /* This is an upgrade of an exisiting mount. */ if ((fsflags & MNT_UPDATE) != 0) return (false); /* This is already an R/O mount. */ if ((fsflags & MNT_RDONLY) != 0) return (false); switch (error) { case ENODEV: /* generic, geom, ... */ case EACCES: /* cam/scsi, ... */ case EROFS: /* md, mmcsd, ... */ /* * These errors can be returned by the storage layer to signal * that the media is read-only. No harm in the R/O mount * attempt if the error was returned for some other reason. 
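[Editor's illustrative sketch, not part of the diff: the read-only fallback wired around vfs_should_downgrade_to_ro_mount(). When the caller did not explicitly ask for ro or rw and the first attempt fails with an error that typically indicates read-only media (ENODEV, EACCES, EROFS), the mount is retried once with MNT_RDONLY set; the real retry sits in vfs_donmount() further below. Names and the flag value are hypothetical.]

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_MNT_RDONLY      0x0001ULL

/* One R/O retry when the first attempt looks like read-only media. */
static int
example_mount_with_autoro(int (*attempt)(uint64_t flags), uint64_t flags,
    bool autoro)
{
        int error;

        error = attempt(flags);
        if (autoro && (flags & EXAMPLE_MNT_RDONLY) == 0 &&
            (error == ENODEV || error == EACCES || error == EROFS))
                error = attempt(flags | EXAMPLE_MNT_RDONLY);
        return (error);
}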
*/ return (true); default: return (false); } } int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *tmp_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; bool autoro; errmsg = fspath = NULL; errmsg_len = fspathlen = 0; errmsg_pos = -1; autoro = default_autoro; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { if (strcmp(opt->name, "update") == 0) { fsflags |= MNT_UPDATE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) { fsflags |= MNT_FORCE; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "reload") == 0) { fsflags |= MNT_RELOAD; vfs_freeopt(optlist, opt); } else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "atime") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoatime", M_MOUNT); } else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "clusterr") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterr", M_MOUNT); } else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "clusterw") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoclusterw", M_MOUNT); } else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "exec") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonoexec", M_MOUNT); } else if (strcmp(opt->name, "nosuid") == 0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "suid") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosuid", M_MOUNT); } else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "symfollow") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("nonosymfollow", M_MOUNT); } else if (strcmp(opt->name, "noro") == 0) { fsflags &= ~MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "rw") == 0) { fsflags &= ~MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "ro") == 0) { fsflags |= MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "rdonly") == 0) { free(opt->name, M_MOUNT); opt->name = strdup("ro", M_MOUNT); fsflags |= MNT_RDONLY; autoro = false; } else if (strcmp(opt->name, "autoro") == 0) { vfs_freeopt(optlist, opt); autoro = true; } else if (strcmp(opt->name, 
"suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; else if (strcmp(opt->name, "automounted") == 0) { fsflags |= MNT_AUTOMOUNTED; vfs_freeopt(optlist, opt); } } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { error = ENAMETOOLONG; goto bail; } error = vfs_domount(td, fstype, fspath, fsflags, &optlist); /* * See if we can mount in the read-only mode if the error code suggests * that it could be possible and the mount options allow for that. * Never try it if "[no]{ro|rw}" has been explicitly requested and not * overridden by "autoro". */ if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) { printf("%s: R/W mount failed, possibly R/O media," " trying R/O mount\n", __func__); fsflags |= MNT_RDONLY; error = vfs_domount(td, fstype, fspath, fsflags, &optlist); } bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (optlist != NULL) vfs_freeopts(optlist); return (error); } /* * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int sys_mount(struct thread *td, struct mount_args *uap) { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; uint64_t flags; int error; /* * Mount flags are now 64-bits. On 32-bit architectures only * 32-bits are passed in, but from here on everything handles * 64-bit flags correctly. */ flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Filter out MNT_ROOTFS. We do not want clients of mount() in * userspace to set this flag, but we must filter it out if we want * MNT_UPDATE on the root file system to work. * MNT_ROOTFS should only be set by the kernel when mounting its * root file system. */ flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG_TEXT(fstype); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) return (ENOENT); if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 && vfsp->vfc_vfsops_sd->vfs_cmount == NULL) || ((vfsp->vfc_flags & VFCF_SBDRY) == 0 && vfsp->vfc_vfsops->vfs_cmount == NULL)) return (EOPNOTSUPP); ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); if ((vfsp->vfc_flags & VFCF_SBDRY) != 0) return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags)); return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags)); } /* * vfs_domount_first(): first file system mount (not update) */ static int vfs_domount_first( struct thread *td, /* Calling thread. */ struct vfsconf *vfsp, /* File system type. */ char *fspath, /* Mount path. */ struct vnode *vp, /* Vnode to be covered. */ uint64_t fsflags, /* Flags common to all filesystems. 
*/ struct vfsoptlist **optlist /* Options local to the filesystem. */ ) { struct vattr va; struct mount *mp; struct vnode *newdp; int error, error1; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); /* * If the jail of the calling thread lacks permission for this type of * file system, deny immediately. */ if (jailed(td->td_ucred) && !prison_allow(td->td_ucred, vfsp->vfc_prison_flag)) { vput(vp); return (EPERM); } /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred); if (error == 0 && va.va_uid != td->td_ucred->cr_uid) error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN); if (error == 0) error = vinvalbuf(vp, V_SAVE, 0, 0); if (error == 0 && vp->v_type != VDIR) error = ENOTDIR; if (error == 0) { VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) vp->v_iflag |= VI_MOUNT; else error = EBUSY; VI_UNLOCK(vp); } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp, 0); /* Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = *optlist; /* Set the mount level flags. */ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error1 = 0; if ((error = VFS_MOUNT(mp)) != 0 || (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 || (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) { if (error1 != 0) { error = error1; if ((error1 = VFS_UNMOUNT(mp, 0)) != 0) printf("VFS_UNMOUNT returned %d\n", error1); } vfs_unbusy(mp); mp->mnt_vnodecovered = NULL; vfs_mount_destroy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error); } VOP_UNLOCK(newdp, 0); if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; /* * Prevent external consumers of mount options from reading mnt_optnew. */ mp->mnt_optnew = NULL; MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); cache_purge(vp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; /* Place the new filesystem at the end of the mount list. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY); VOP_UNLOCK(vp, 0); EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td); VOP_UNLOCK(newdp, 0); mountcheckdirs(vp, newdp); vrele(newdp); if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); vfs_op_exit(mp); vfs_unbusy(mp); return (0); } /* * vfs_domount_update(): update of mounted file system */ static int vfs_domount_update( struct thread *td, /* Calling thread. */ struct vnode *vp, /* Mount point vnode. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. 
*/ ) { struct export_args export; void *bufp; struct mount *mp; int error, export_error, len; uint64_t flag; ASSERT_VOP_ELOCKED(vp, __func__); KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); mp = vp->v_mount; if ((vp->v_vflag & VV_ROOT) == 0) { if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) == 0) error = EXDEV; else error = EINVAL; vput(vp); return (error); } /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ flag = mp->mnt_flag; if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error != 0) { vput(vp); return (error); } if (vfs_busy(mp, MBF_NOWAIT)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); vfs_op_enter(mp); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { MNT_IUNLOCK(mp); error = EBUSY; goto end; } mp->mnt_flag &= ~MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); mp->mnt_optnew = *optlist; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp); export_error = 0; /* Process the export option. */ if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, &len) == 0) { /* Assume that there is only 1 ABI for each length. */ switch (len) { case (sizeof(struct oexport_args)): bzero(&export, sizeof(export)); /* FALLTHROUGH */ case (sizeof(export)): bcopy(bufp, &export, len); export_error = vfs_export(mp, &export); break; default: export_error = EINVAL; break; } } MNT_ILOCK(mp); if (error == 0) { mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); } else { /* * If we fail, restore old mount flags. MNT_QUOTA is special, * because it is not part of MNT_UPDATEMASK, but it could have * changed in the meantime if quotactl(2) was called. * All in all we want current value of MNT_QUOTA, not the old * one. */ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); } if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if (error != 0) goto end; if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; *optlist = NULL; (void)VFS_STATFS(mp, &mp->mnt_stat); /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); else vfs_deallocate_syncvnode(mp); end: vfs_op_exit(mp); vfs_unbusy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error != 0 ? error : export_error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ uint64_t fsflags, /* Flags common to all filesystems. */ struct vfsoptlist **optlist /* Options local to the filesystem. 
*/ ) { struct vfsconf *vfsp; struct nameidata nd; struct vnode *vp; char *pathbuf; int error; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred) || usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) vfsp = vfs_byname(fstype); else vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); } /* * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if ((fsflags & MNT_UPDATE) == 0) { pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); strcpy(pathbuf, fspath); error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); /* debug.disablefullpath == 1 results in ENODEV */ if (error == 0 || error == ENODEV) { error = vfs_domount_first(td, vfsp, pathbuf, vp, fsflags, optlist); } free(pathbuf, M_TEMP); } else error = vfs_domount_update(td, vp, fsflags, optlist); return (error); } /* * Unmount a filesystem. * * Note: unmount takes a path to the vnode mounted on as argument, not * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int sys_unmount(struct thread *td, struct unmount_args *uap) { struct nameidata nd; struct mount *mp; char *pathbuf; int error, id0, id1; AUDIT_ARG_VALUE(uap->flags); if (jailed(td->td_ucred) || usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } if (uap->flags & MNT_BYFSID) { AUDIT_ARG_TEXT(pathbuf); /* Decode the filesystem ID. */ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } else { /* * Try to find global path for path argument. 
*/ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, pathbuf, td); if (namei(&nd) == 0) { NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, MNAMELEN); if (error == 0 || error == ENODEV) vput(nd.ni_vp); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { vfs_ref(mp); break; } } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { vfs_rel(mp); return (EINVAL); } error = dounmount(mp, uap->flags, td); return (error); } /* * Return error if any of the vnodes, ignoring the root vnode * and the syncer vnode, have non-zero usecount. * * This function is purely advisory - it can return false positives * and negatives. */ static int vfs_check_usecounts(struct mount *mp) { struct vnode *vp, *mvp; MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && vp->v_usecount != 0) { VI_UNLOCK(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (EBUSY); } VI_UNLOCK(vp); } return (0); } static void dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) { mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag &= ~mntkflags; if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vn_finished_write(mp); } /* * There are various reference counters associated with the mount point. * Normally it is permitted to modify them without taking the mnt ilock, * but this behavior can be temporarily disabled if stable value is needed * or callers are expected to block (e.g. to not allow new users during * forced unmount). */ void vfs_op_enter(struct mount *mp) { int cpu; MNT_ILOCK(mp); mp->mnt_vfs_ops++; if (mp->mnt_vfs_ops > 1) { MNT_IUNLOCK(mp); return; } /* * Paired with a fence in vfs_op_thread_enter(). See the comment * above it for details. */ atomic_thread_fence_seq_cst(); vfs_op_barrier_wait(mp); /* * Paired with a fence in vfs_op_thread_exit(). */ atomic_thread_fence_acq(); CPU_FOREACH(cpu) { mp->mnt_ref += zpcpu_replace_cpu(mp->mnt_ref_pcpu, 0, cpu); mp->mnt_lockref += zpcpu_replace_cpu(mp->mnt_lockref_pcpu, 0, cpu); mp->mnt_writeopcount += zpcpu_replace_cpu(mp->mnt_writeopcount_pcpu, 0, cpu); } MNT_IUNLOCK(mp); vfs_assert_mount_counters(mp); } void vfs_op_exit_locked(struct mount *mp) { mtx_assert(MNT_MTX(mp), MA_OWNED); if (mp->mnt_vfs_ops <= 0) panic("%s: invalid vfs_ops count %d for mp %p\n", __func__, mp->mnt_vfs_ops, mp); mp->mnt_vfs_ops--; } void vfs_op_exit(struct mount *mp) { MNT_ILOCK(mp); vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); } /* * It is assumed the caller already posted at least an acquire barrier. 
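 *
 * Informal sketch of the pairing this relies on (illustration only):
 *
 *	vfs_op_enter(mp);	(mnt_vfs_ops++, per-CPU counts folded in)
 *	... unmount / remount work ...
 *	vfs_op_exit(mp);	(mnt_vfs_ops--)
 *
 * While mnt_vfs_ops > 0 the per-CPU fast path stays disabled and the
 * central mnt_ref/mnt_lockref/mnt_writeopcount values are authoritative;
 * vfs_op_barrier_wait() spins until every CPU has left its
 * mnt_thread_in_ops_pcpu window.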
*/ void vfs_op_barrier_wait(struct mount *mp) { int *in_op; int cpu; CPU_FOREACH(cpu) { in_op = zpcpu_get_cpu(mp->mnt_thread_in_ops_pcpu, cpu); while (atomic_load_int(in_op)) cpu_spinwait(); } } #ifdef DIAGNOSTIC void vfs_assert_mount_counters(struct mount *mp) { int cpu; if (mp->mnt_vfs_ops == 0) return; CPU_FOREACH(cpu) { if (*(int *)zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu) != 0 || *(int *)zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu) != 0 || *(int *)zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu) != 0) vfs_dump_mount_counters(mp); } } void vfs_dump_mount_counters(struct mount *mp) { int cpu, *count; int ref, lockref, writeopcount; printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops); printf(" ref : "); ref = mp->mnt_ref; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu); printf("%d ", *count); ref += *count; } printf("\n"); printf(" lockref : "); lockref = mp->mnt_lockref; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu); printf("%d ", *count); lockref += *count; } printf("\n"); printf("writeopcount: "); writeopcount = mp->mnt_writeopcount; CPU_FOREACH(cpu) { count = zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu); printf("%d ", *count); writeopcount += *count; } printf("\n"); printf("counter struct total\n"); printf("ref %-5d %-5d\n", mp->mnt_ref, ref); printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref); printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount); panic("invalid counts on struct mount"); } #endif int vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which) { int *base, *pcpu; int cpu, sum; switch (which) { case MNT_COUNT_REF: base = &mp->mnt_ref; pcpu = mp->mnt_ref_pcpu; break; case MNT_COUNT_LOCKREF: base = &mp->mnt_lockref; pcpu = mp->mnt_lockref_pcpu; break; case MNT_COUNT_WRITEOPCOUNT: base = &mp->mnt_writeopcount; pcpu = mp->mnt_writeopcount_pcpu; break; } sum = *base; CPU_FOREACH(cpu) { sum += *(int *)zpcpu_get_cpu(pcpu, cpu); } return (sum); } /* * Do the actual filesystem unmount. */ int dounmount(struct mount *mp, int flags, struct thread *td) { struct vnode *coveredvp; int error; uint64_t async_flag; int mnt_gen_r; if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); vfs_rel(mp); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error != 0) { if (coveredvp != NULL) { VOP_UNLOCK(coveredvp, 0); vdrop(coveredvp); } vfs_rel(mp); return (error); } vfs_op_enter(mp); vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || (mp->mnt_flag & MNT_UPDATE) != 0 || !TAILQ_EMPTY(&mp->mnt_uppers)) { dounmount_cleanup(mp, coveredvp, 0); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT; if (flags & MNT_NONBUSY) { MNT_IUNLOCK(mp); error = vfs_check_usecounts(mp); MNT_ILOCK(mp); if (error != 0) { dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT); return (error); } } /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) { mp->mnt_kern_flag |= MNTK_UNMOUNTF; MNT_IUNLOCK(mp); /* * Must be done after setting MNTK_UNMOUNTF and before * waiting for mnt_lockref to become 0. 
*/ VFS_PURGE(mp); MNT_ILOCK(mp); } error = 0; if (mp->mnt_lockref) { mp->mnt_kern_flag |= MNTK_DRAINING; error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, "mount drain", 0); } MNT_IUNLOCK(mp); KASSERT(mp->mnt_lockref == 0, ("%s: invalid lock refcount in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); KASSERT(error == 0, ("%s: invalid return value for msleep in the drain path @ %s:%d", __func__, __FILE__, __LINE__)); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); /* * From now, we can claim that the use reference on the * coveredvp is ours, and the ref can be released only by * successfull unmount by us, or left for later unmount * attempt. The previously acquired hold reference is no * longer needed to protect the vnode from reuse. */ if (coveredvp != NULL) vdrop(coveredvp); vfs_msync(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); cache_purgevfs(mp, false); /* remove cache entries for this file sys */ vfs_deallocate_syncvnode(mp); if ((mp->mnt_flag & MNT_RDONLY) != 0 || (flags & MNT_FORCE) != 0 || (error = VFS_SYNC(mp, MNT_WAIT)) == 0) error = VFS_UNMOUNT(mp, flags); vn_finished_write(mp); /* * If we failed to flush the dirty blocks for this mount point, * undo all the cdir/rdir and rootvnode changes we made above. * Unless we failed to do so because the device is reporting that * it doesn't exist anymore. */ if (error && error != ENXIO) { MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0) { MNT_IUNLOCK(mp); vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); } mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) mp->mnt_kern_flag |= MNTK_ASYNC; if (mp->mnt_kern_flag & MNTK_MWAIT) { mp->mnt_kern_flag &= ~MNTK_MWAIT; wakeup(mp); } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; VOP_UNLOCK(coveredvp, 0); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); if (rootvnode != NULL && mp == rootvnode->v_mount) { vrele(rootvnode); rootvnode = NULL; } if (mp == rootdevmp) rootdevmp = NULL; vfs_mount_destroy(mp); return (0); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) { va_list ap; int error, len; char *errmsg; error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. 
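 *
 * A typical consumer is a filesystem's VFS_MOUNT() routine; an
 * illustrative, hypothetical sequence using the helpers below:
 *
 *	static const char *my_opts[] = { "from", "export", "ro", NULL };
 *
 *	if (vfs_filteropt(mp->mnt_optnew, my_opts) != 0)
 *		return (EINVAL);
 *	from = vfs_getopts(mp->mnt_optnew, "from", &error);
 *	vfs_flagopt(mp->mnt_optnew, "ro", &flags, MNT_RDONLY);
 *
 * The query helpers mark each option they find as seen; vfs_filteropt()
 * reports unknown options through the "errmsg" option when present.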
*/ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; char errmsg[255]; const char **t, *p, *q; int ret = 0; TAILQ_FOREACH(opt, opts, link) { p = opt->name; q = NULL; if (p[0] == 'n' && p[1] == 'o') q = p + 2; for(t = global_opts; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; for(t = legal; *t != NULL; t++) { if (strcmp(*t, p) == 0) break; if (q != NULL) { if (strcmp(*t, q) == 0) break; } } if (*t != NULL) continue; snprintf(errmsg, sizeof(errmsg), "mount option <%s> is unknown", p); ret = EINVAL; } if (ret != 0) { TAILQ_FOREACH(opt, opts, link) { if (strcmp(opt->name, "errmsg") == 0) { strncpy((char *)opt->value, errmsg, opt->len); break; } } if (opt == NULL) printf("%s\n", errmsg); } return (ret); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. */ int vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len) { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; if (opts == NULL) return (-1); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; return (opt->pos); } } return (-1); } int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) { char *opt_value, *vtp; quad_t iv; int error, opt_len; error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); if (error != 0) return (error); if (opt_len == 0 || opt_value == NULL) return (EINVAL); if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') return (EINVAL); iv = strtoq(opt_value, &vtp, 0); if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) return (EINVAL); if (iv < 0) return (EINVAL); switch (vtp[0]) { case 't': case 'T': iv *= 1024; /* FALLTHROUGH */ case 'g': case 'G': iv *= 1024; /* FALLTHROUGH */ case 'm': case 'M': iv *= 1024; /* FALLTHROUGH */ case 'k': case 'K': iv *= 1024; case '\0': break; default: return (EINVAL); } *value = iv; return (0); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || ((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) 
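/*
 * Scan a named mount option's value with vsscanf() semantics.  Returns the
 * number of conversions performed, or 0 if the option is absent or its
 * value is not a NUL-terminated string.  Illustration only (hypothetical
 * caller):
 *
 *	if (vfs_scanopt(mp->mnt_optnew, "timeout", "%d", &secs) != 1)
 *		secs = 30;
 */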
{ va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->len == 0 || opt->value == NULL) return (0); if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len != len) return (EINVAL); bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = len; else { if (opt->len < len) return (EINVAL); opt->len = len; bcopy(value, opt->value, len); } return (0); } return (ENOENT); } int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; opt->seen = 1; if (opt->value == NULL) opt->len = strlen(value) + 1; else if (strlcpy(opt->value, value, opt->len) >= opt->len) return (EINVAL); return (0); } return (ENOENT); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len) { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { opt->seen = 1; if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } int __vfs_statfs(struct mount *mp, struct statfs *sbp) { /* * Filesystems only fill in part of the structure for updates, we * have to read the entirety first to get all content. */ memcpy(sbp, &mp->mnt_stat, sizeof(*sbp)); /* * Set these in case the underlying filesystem fails to do so. */ sbp->f_version = STATFS_VERSION; sbp->f_namemax = NAME_MAX; sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; return (mp->mnt_op->vfs_statfs(mp, sbp)); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. * name must start with "no". 
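 *
 * For illustration (not a normative contract): mount_argb(ma, 1, "noatime")
 * records the option as "atime", while mount_argb(ma, 0, "noatime") records
 * it as "noatime", since a true flag skips the leading "no".
 *
 * A hypothetical in-kernel caller of the accumulation API described above
 * might look like (sketch only):
 *
 *	struct mntarg *ma = NULL;
 *
 *	ma = mount_arg(ma, "fstype", "tmpfs", -1);
 *	ma = mount_arg(ma, "fspath", "/mnt", -1);
 *	ma = mount_argb(ma, 1, "noatime");
 *	error = kernel_mount(ma, MNT_RDONLY);
 *
 * kernel_mount() always frees the argument list; the first error latched
 * in the mntarg aborts the actual mount.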
*/ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new_auto(); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, treat value as a C string. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, uint64_t flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) { struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, (vp != NULL ? 
-1 : 0)); } va_end(ap); error = kernel_mount(ma, flags); return (error); } void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp) { bcopy(oexp, exp, sizeof(*oexp)); exp->ex_numsecflavors = 0; } Index: projects/clang900-import/sys/netinet/sctp_asconf.c =================================================================== --- projects/clang900-import/sys/netinet/sctp_asconf.c (revision 352586) +++ projects/clang900-import/sys/netinet/sctp_asconf.c (revision 352587) @@ -1,3482 +1,3483 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* * debug flags: * SCTP_DEBUG_ASCONF1: protocol info, general info and errors * SCTP_DEBUG_ASCONF2: detailed info */ /* * RFC 5061 * * An ASCONF parameter queue exists per asoc which holds the pending address * operations. Lists are updated upon receipt of ASCONF-ACK. * * A restricted_addrs list exists per assoc to hold local addresses that are * not (yet) usable by the assoc as a source address. These addresses are * either pending an ASCONF operation (and exist on the ASCONF parameter * queue), or they are permanently restricted (the peer has returned an * ERROR indication to an ASCONF(ADD), or the peer does not support ASCONF). * * Deleted addresses are always immediately removed from the lists as they will * (shortly) no longer exist in the kernel. We send ASCONFs as a courtesy, * only if allowed. */ /* * ASCONF parameter processing. * response_required: set if a reply is required (eg. SUCCESS_REPORT). * returns a mbuf to an "error" response parameter or NULL/"success" if ok. * FIX: allocating this many mbufs on the fly is pretty inefficient... 
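 *
 * For orientation (informal sketch derived from the handlers below): each
 * request parameter processed here is laid out as
 *
 *	struct sctp_asconf_paramhdr	(param type/length + correlation id)
 *	followed by one address parameter
 *	(struct sctp_ipv4addr_param or struct sctp_ipv6addr_param)
 *
 * and the reply parameter built for it is either a SCTP_SUCCESS_REPORT
 * header echoing the correlation id, or an SCTP_ERROR_CAUSE_IND header
 * wrapping a struct sctp_error_cause plus a copy of the offending TLV.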
*/ static struct mbuf * sctp_asconf_success_response(uint32_t id) { struct mbuf *m_reply = NULL; struct sctp_asconf_paramhdr *aph; m_reply = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_paramhdr), 0, M_NOWAIT, 1, MT_DATA); if (m_reply == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_success_response: couldn't get mbuf!\n"); return (NULL); } aph = mtod(m_reply, struct sctp_asconf_paramhdr *); aph->correlation_id = id; aph->ph.param_type = htons(SCTP_SUCCESS_REPORT); aph->ph.param_length = sizeof(struct sctp_asconf_paramhdr); SCTP_BUF_LEN(m_reply) = aph->ph.param_length; aph->ph.param_length = htons(aph->ph.param_length); return (m_reply); } static struct mbuf * sctp_asconf_error_response(uint32_t id, uint16_t cause, uint8_t *error_tlv, uint16_t tlv_length) { struct mbuf *m_reply = NULL; struct sctp_asconf_paramhdr *aph; struct sctp_error_cause *error; uint8_t *tlv; m_reply = sctp_get_mbuf_for_msg((sizeof(struct sctp_asconf_paramhdr) + tlv_length + sizeof(struct sctp_error_cause)), 0, M_NOWAIT, 1, MT_DATA); if (m_reply == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_error_response: couldn't get mbuf!\n"); return (NULL); } aph = mtod(m_reply, struct sctp_asconf_paramhdr *); error = (struct sctp_error_cause *)(aph + 1); aph->correlation_id = id; aph->ph.param_type = htons(SCTP_ERROR_CAUSE_IND); error->code = htons(cause); error->length = tlv_length + sizeof(struct sctp_error_cause); aph->ph.param_length = error->length + sizeof(struct sctp_asconf_paramhdr); if (aph->ph.param_length > MLEN) { SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_error_response: tlv_length (%xh) too big\n", tlv_length); sctp_m_freem(m_reply); /* discard */ return (NULL); } if (error_tlv != NULL) { tlv = (uint8_t *)(error + 1); memcpy(tlv, error_tlv, tlv_length); } SCTP_BUF_LEN(m_reply) = aph->ph.param_length; error->length = htons(error->length); aph->ph.param_length = htons(aph->ph.param_length); return (m_reply); } static struct mbuf * sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *aph, struct sctp_tcb *stcb, int send_hb, int response_required) { struct sctp_nets *net; struct mbuf *m_reply = NULL; union sctp_sockstore store; struct sctp_paramhdr *ph; uint16_t param_type, aparam_length; #if defined(INET) || defined(INET6) uint16_t param_length; #endif struct sockaddr *sa; int zero_address = 0; int bad_address = 0; #ifdef INET struct sockaddr_in *sin; struct sctp_ipv4addr_param *v4addr; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sctp_ipv6addr_param *v6addr; #endif aparam_length = ntohs(aph->ph.param_length); ph = (struct sctp_paramhdr *)(aph + 1); param_type = ntohs(ph->param_type); #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); #endif sa = &store.sa; switch (param_type) { #ifdef INET case SCTP_IPV4_ADDRESS: if (param_length != sizeof(struct sctp_ipv4addr_param)) { /* invalid param size */ return (NULL); } v4addr = (struct sctp_ipv4addr_param *)ph; sin = &store.sin; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_port = stcb->rport; sin->sin_addr.s_addr = v4addr->addr; if ((sin->sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { bad_address = 1; } if (sin->sin_addr.s_addr == INADDR_ANY) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: adding "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: if (param_length != sizeof(struct sctp_ipv6addr_param)) { /* invalid param size */ return (NULL); } v6addr = 
(struct sctp_ipv6addr_param *)ph; sin6 = &store.sin6; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_port = stcb->rport; memcpy((caddr_t)&sin6->sin6_addr, v6addr->addr, sizeof(struct in6_addr)); if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { bad_address = 1; } if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: adding "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif default: m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_INVALID_PARAM, (uint8_t *)aph, aparam_length); return (m_reply); } /* end switch */ /* if 0.0.0.0/::0, add the source address instead */ if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) { sa = src; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: using source addr "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src); } /* add the address */ if (bad_address) { m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_INVALID_PARAM, (uint8_t *)aph, aparam_length); } else if (sctp_add_remote_addr(stcb, sa, &net, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_DYNAMIC_ADDED) != 0) { SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: error adding address\n"); m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_RESOURCE_SHORTAGE, (uint8_t *)aph, aparam_length); } else { /* notify upper layer */ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_ADD_IP, stcb, 0, sa, SCTP_SO_NOT_LOCKED); if (response_required) { m_reply = sctp_asconf_success_response(aph->correlation_id); } sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, stcb->sctp_ep, stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); if (send_hb) { sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED); } } return (m_reply); } static int sctp_asconf_del_remote_addrs_except(struct sctp_tcb *stcb, struct sockaddr *src) { struct sctp_nets *src_net, *net; /* make sure the source address exists as a destination net */ src_net = sctp_findnet(stcb, src); if (src_net == NULL) { /* not found */ return (-1); } /* delete all destination addresses except the source */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net != src_net) { /* delete this address */ sctp_remove_net(stcb, net); SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_del_remote_addrs_except: deleting "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, (struct sockaddr *)&net->ro._l_addr); /* notify upper layer */ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_DELETE_IP, stcb, 0, (struct sockaddr *)&net->ro._l_addr, SCTP_SO_NOT_LOCKED); } } return (0); } static struct mbuf * sctp_process_asconf_delete_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *aph, struct sctp_tcb *stcb, int response_required) { struct mbuf *m_reply = NULL; union sctp_sockstore store; struct sctp_paramhdr *ph; uint16_t param_type, aparam_length; #if defined(INET) || defined(INET6) uint16_t param_length; #endif struct sockaddr *sa; int zero_address = 0; int result; #ifdef INET struct sockaddr_in *sin; struct sctp_ipv4addr_param *v4addr; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sctp_ipv6addr_param *v6addr; #endif aparam_length = ntohs(aph->ph.param_length); ph = (struct sctp_paramhdr *)(aph + 1); param_type = ntohs(ph->param_type); #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); #endif sa = &store.sa; switch (param_type) { #ifdef INET case SCTP_IPV4_ADDRESS: if (param_length != sizeof(struct sctp_ipv4addr_param)) { /* invalid param size */ return (NULL); } v4addr = (struct sctp_ipv4addr_param 
*)ph; sin = &store.sin; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_port = stcb->rport; sin->sin_addr.s_addr = v4addr->addr; if (sin->sin_addr.s_addr == INADDR_ANY) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: deleting "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: if (param_length != sizeof(struct sctp_ipv6addr_param)) { /* invalid param size */ return (NULL); } v6addr = (struct sctp_ipv6addr_param *)ph; sin6 = &store.sin6; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_port = stcb->rport; memcpy(&sin6->sin6_addr, v6addr->addr, sizeof(struct in6_addr)); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: deleting "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif default: m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *)aph, aparam_length); return (m_reply); } /* make sure the source address is not being deleted */ if (sctp_cmpaddr(sa, src)) { /* trying to delete the source address! */ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: tried to delete source addr\n"); m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_DELETING_SRC_ADDR, (uint8_t *)aph, aparam_length); return (m_reply); } /* if deleting 0.0.0.0/::0, delete all addresses except src addr */ if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) { result = sctp_asconf_del_remote_addrs_except(stcb, src); if (result) { /* src address did not exist? */ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: src addr does not exist?\n"); /* what error to reply with?? 
*/ m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_REQUEST_REFUSED, (uint8_t *)aph, aparam_length); } else if (response_required) { m_reply = sctp_asconf_success_response(aph->correlation_id); } return (m_reply); } /* delete the address */ result = sctp_del_remote_addr(stcb, sa); /* * note if result == -2, the address doesn't exist in the asoc but * since it's being deleted anyways, we just ack the delete -- but * this probably means something has already gone awry */ if (result == -1) { /* only one address in the asoc */ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: tried to delete last IP addr!\n"); m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_DELETING_LAST_ADDR, (uint8_t *)aph, aparam_length); } else { if (response_required) { m_reply = sctp_asconf_success_response(aph->correlation_id); } /* notify upper layer */ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_DELETE_IP, stcb, 0, sa, SCTP_SO_NOT_LOCKED); } return (m_reply); } static struct mbuf * sctp_process_asconf_set_primary(struct sockaddr *src, struct sctp_asconf_paramhdr *aph, struct sctp_tcb *stcb, int response_required) { struct mbuf *m_reply = NULL; union sctp_sockstore store; struct sctp_paramhdr *ph; uint16_t param_type, aparam_length; #if defined(INET) || defined(INET6) uint16_t param_length; #endif struct sockaddr *sa; int zero_address = 0; #ifdef INET struct sockaddr_in *sin; struct sctp_ipv4addr_param *v4addr; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sctp_ipv6addr_param *v6addr; #endif aparam_length = ntohs(aph->ph.param_length); ph = (struct sctp_paramhdr *)(aph + 1); param_type = ntohs(ph->param_type); #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); #endif sa = &store.sa; switch (param_type) { #ifdef INET case SCTP_IPV4_ADDRESS: if (param_length != sizeof(struct sctp_ipv4addr_param)) { /* invalid param size */ return (NULL); } v4addr = (struct sctp_ipv4addr_param *)ph; sin = &store.sin; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = v4addr->addr; if (sin->sin_addr.s_addr == INADDR_ANY) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: if (param_length != sizeof(struct sctp_ipv6addr_param)) { /* invalid param size */ return (NULL); } v6addr = (struct sctp_ipv6addr_param *)ph; sin6 = &store.sin6; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); memcpy((caddr_t)&sin6->sin6_addr, v6addr->addr, sizeof(struct in6_addr)); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) zero_address = 1; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); break; #endif default: m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *)aph, aparam_length); return (m_reply); } /* if 0.0.0.0/::0, use the source address instead */ if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) { sa = src; SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: using source addr "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src); } /* set the primary address */ if (sctp_set_primary_addr(stcb, sa, NULL) == 0) { SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: primary address set\n"); /* notify upper layer */ sctp_ulp_notify(SCTP_NOTIFY_ASCONF_SET_PRIMARY, stcb, 0, sa, SCTP_SO_NOT_LOCKED); if ((stcb->asoc.primary_destination->dest_state & 
SCTP_ADDR_REACHABLE) && (!(stcb->asoc.primary_destination->dest_state & SCTP_ADDR_PF)) && (stcb->asoc.alternate)) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } if (response_required) { m_reply = sctp_asconf_success_response(aph->correlation_id); } /* * Mobility adaptation. Ideally, when the reception of SET * PRIMARY with DELETE IP ADDRESS of the previous primary * destination, unacknowledged DATA are retransmitted * immediately to the new primary destination for seamless * handover. If the destination is UNCONFIRMED and marked to * REQ_PRIM, The retransmission occur when reception of the * HEARTBEAT-ACK. (See sctp_handle_heartbeat_ack in * sctp_input.c) Also, when change of the primary * destination, it is better that all subsequent new DATA * containing already queued DATA are transmitted to the new * primary destination. (by micchie) */ if ((sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE) || sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) && sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_PRIM_DELETED) && (stcb->asoc.primary_destination->dest_state & SCTP_ADDR_UNCONFIRMED) == 0) { sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_1); if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_assoc_immediate_retrans(stcb, stcb->asoc.primary_destination); } if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { sctp_move_chunks_from_net(stcb, stcb->asoc.deleted_primary); } sctp_delete_prim_timer(stcb->sctp_ep, stcb, stcb->asoc.deleted_primary); } } else { /* couldn't set the requested primary address! */ SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: set primary failed!\n"); /* must have been an invalid address, so report */ m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *)aph, aparam_length); } return (m_reply); } /* * handles an ASCONF chunk. * if all parameters are processed ok, send a plain (empty) ASCONF-ACK */ void sctp_handle_asconf(struct mbuf *m, unsigned int offset, struct sockaddr *src, struct sctp_asconf_chunk *cp, struct sctp_tcb *stcb, int first) { struct sctp_association *asoc; uint32_t serial_num; struct mbuf *n, *m_ack, *m_result, *m_tail; struct sctp_asconf_ack_chunk *ack_cp; struct sctp_asconf_paramhdr *aph; struct sctp_ipv6addr_param *p_addr; unsigned int asconf_limit, cnt; int error = 0; /* did an error occur? 
*/ /* asconf param buffer */ uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE]; struct sctp_asconf_ack *ack, *ack_next; /* verify minimum length */ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_asconf_chunk)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: chunk too small = %xh\n", ntohs(cp->ch.chunk_length)); return; } asoc = &stcb->asoc; serial_num = ntohl(cp->serial_number); if (SCTP_TSN_GE(asoc->asconf_seq_in, serial_num)) { /* got a duplicate ASCONF */ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: got duplicate serial number = %xh\n", serial_num); return; } else if (serial_num != (asoc->asconf_seq_in + 1)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: incorrect serial number = %xh (expected next = %xh)\n", serial_num, asoc->asconf_seq_in + 1); return; } /* it's the expected "next" sequence number, so process it */ asoc->asconf_seq_in = serial_num; /* update sequence */ /* get length of all the param's in the ASCONF */ asconf_limit = offset + ntohs(cp->ch.chunk_length); SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: asconf_limit=%u, sequence=%xh\n", asconf_limit, serial_num); if (first) { /* delete old cache */ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: Now processing first ASCONF. Try to delete old cache\n"); TAILQ_FOREACH_SAFE(ack, &asoc->asconf_ack_sent, next, ack_next) { if (ack->serial_number == serial_num) break; SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: delete old(%u) < first(%u)\n", ack->serial_number, serial_num); TAILQ_REMOVE(&asoc->asconf_ack_sent, ack, next); if (ack->data != NULL) { sctp_m_freem(ack->data); } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), ack); } } m_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_ack_chunk), 0, M_NOWAIT, 1, MT_DATA); if (m_ack == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get mbuf!\n"); return; } m_tail = m_ack; /* current reply chain's tail */ /* fill in ASCONF-ACK header */ ack_cp = mtod(m_ack, struct sctp_asconf_ack_chunk *); ack_cp->ch.chunk_type = SCTP_ASCONF_ACK; ack_cp->ch.chunk_flags = 0; ack_cp->serial_number = htonl(serial_num); /* set initial lengths (eg. just an ASCONF-ACK), ntohx at the end! */ SCTP_BUF_LEN(m_ack) = sizeof(struct sctp_asconf_ack_chunk); ack_cp->ch.chunk_length = sizeof(struct sctp_asconf_ack_chunk); /* skip the lookup address parameter */ offset += sizeof(struct sctp_asconf_chunk); p_addr = (struct sctp_ipv6addr_param *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&aparam_buf); if (p_addr == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get lookup addr!\n"); /* respond with a missing/invalid mandatory parameter error */ sctp_m_freem(m_ack); return; } /* param_length is already validated in process_control... */ offset += ntohs(p_addr->ph.param_length); /* skip lookup addr */ /* get pointer to first asconf param in ASCONF */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), (uint8_t *)&aparam_buf); if (aph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "Empty ASCONF received?\n"); goto send_reply; } /* process through all parameters */ cnt = 0; while (aph != NULL) { unsigned int param_length, param_type; param_type = ntohs(aph->ph.param_type); param_length = ntohs(aph->ph.param_length); if (offset + param_length > asconf_limit) { /* parameter goes beyond end of chunk! 
*/ sctp_m_freem(m_ack); return; } m_result = NULL; if (param_length > sizeof(aparam_buf)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: param length (%u) larger than buffer size!\n", param_length); sctp_m_freem(m_ack); return; } if (param_length <= sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: param length (%u) too short\n", param_length); sctp_m_freem(m_ack); + return; } /* get the entire parameter */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, param_length, aparam_buf); if (aph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get entire param\n"); sctp_m_freem(m_ack); return; } switch (param_type) { case SCTP_ADD_IP_ADDRESS: m_result = sctp_process_asconf_add_ip(src, aph, stcb, (cnt < SCTP_BASE_SYSCTL(sctp_hb_maxburst)), error); cnt++; break; case SCTP_DEL_IP_ADDRESS: m_result = sctp_process_asconf_delete_ip(src, aph, stcb, error); break; case SCTP_ERROR_CAUSE_IND: /* not valid in an ASCONF chunk */ break; case SCTP_SET_PRIM_ADDR: m_result = sctp_process_asconf_set_primary(src, aph, stcb, error); break; case SCTP_NAT_VTAGS: SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: sees a NAT VTAG state parameter\n"); break; case SCTP_SUCCESS_REPORT: /* not valid in an ASCONF chunk */ break; case SCTP_ULP_ADAPTATION: /* FIX */ break; default: if ((param_type & 0x8000) == 0) { /* Been told to STOP at this param */ asconf_limit = offset; /* * FIX FIX - We need to call * sctp_arethere_unrecognized_parameters() * to get a operr and send it for any * param's with the 0x4000 bit set OR do it * here ourselves... note we still must STOP * if the 0x8000 bit is clear. */ } /* unknown/invalid param type */ break; } /* switch */ /* add any (error) result to the reply mbuf chain */ if (m_result != NULL) { SCTP_BUF_NEXT(m_tail) = m_result; m_tail = m_result; /* update lengths, make sure it's aligned too */ SCTP_BUF_LEN(m_result) = SCTP_SIZE32(SCTP_BUF_LEN(m_result)); ack_cp->ch.chunk_length += SCTP_BUF_LEN(m_result); /* set flag to force success reports */ error = 1; } offset += SCTP_SIZE32(param_length); /* update remaining ASCONF message length to process */ if (offset >= asconf_limit) { /* no more data in the mbuf chain */ break; } /* get pointer to next asconf param */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), (uint8_t *)&aparam_buf); if (aph == NULL) { /* can't get an asconf paramhdr */ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: can't get asconf param hdr!\n"); /* FIX ME - add error here... 
*/ } } send_reply: ack_cp->ch.chunk_length = htons(ack_cp->ch.chunk_length); /* save the ASCONF-ACK reply */ ack = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asconf_ack), struct sctp_asconf_ack); if (ack == NULL) { sctp_m_freem(m_ack); return; } ack->serial_number = serial_num; ack->last_sent_to = NULL; ack->data = m_ack; ack->len = 0; for (n = m_ack; n != NULL; n = SCTP_BUF_NEXT(n)) { ack->len += SCTP_BUF_LEN(n); } TAILQ_INSERT_TAIL(&stcb->asoc.asconf_ack_sent, ack, next); /* see if last_control_chunk_from is set properly (use IP src addr) */ if (stcb->asoc.last_control_chunk_from == NULL) { /* * this could happen if the source address was just newly * added */ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: looking up net for IP source address\n"); SCTPDBG(SCTP_DEBUG_ASCONF1, "Looking for IP source: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src); /* look up the from address */ stcb->asoc.last_control_chunk_from = sctp_findnet(stcb, src); #ifdef SCTP_DEBUG if (stcb->asoc.last_control_chunk_from == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: IP source address not found?!\n"); } #endif } } /* * does the address match? returns 0 if not, 1 if so */ static uint32_t sctp_asconf_addr_match(struct sctp_asconf_addr *aa, struct sockaddr *sa) { switch (sa->sa_family) { #ifdef INET6 case AF_INET6: { /* XXX scopeid */ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; if ((aa->ap.addrp.ph.param_type == SCTP_IPV6_ADDRESS) && (memcmp(&aa->ap.addrp.addr, &sin6->sin6_addr, sizeof(struct in6_addr)) == 0)) { return (1); } break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sa; if ((aa->ap.addrp.ph.param_type == SCTP_IPV4_ADDRESS) && (memcmp(&aa->ap.addrp.addr, &sin->sin_addr, sizeof(struct in_addr)) == 0)) { return (1); } break; } #endif default: break; } return (0); } /* * does the address match? returns 0 if not, 1 if so */ static uint32_t sctp_addr_match(struct sctp_paramhdr *ph, struct sockaddr *sa) { #if defined(INET) || defined(INET6) uint16_t param_type, param_length; param_type = ntohs(ph->param_type); param_length = ntohs(ph->param_length); #endif switch (sa->sa_family) { #ifdef INET6 case AF_INET6: { /* XXX scopeid */ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; struct sctp_ipv6addr_param *v6addr; v6addr = (struct sctp_ipv6addr_param *)ph; if ((param_type == SCTP_IPV6_ADDRESS) && (param_length == sizeof(struct sctp_ipv6addr_param)) && (memcmp(&v6addr->addr, &sin6->sin6_addr, sizeof(struct in6_addr)) == 0)) { return (1); } break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sa; struct sctp_ipv4addr_param *v4addr; v4addr = (struct sctp_ipv4addr_param *)ph; if ((param_type == SCTP_IPV4_ADDRESS) && (param_length == sizeof(struct sctp_ipv4addr_param)) && (memcmp(&v4addr->addr, &sin->sin_addr, sizeof(struct in_addr)) == 0)) { return (1); } break; } #endif default: break; } return (0); } /* * Cleanup for non-responded/OP ERR'd ASCONF */ void sctp_asconf_cleanup(struct sctp_tcb *stcb, struct sctp_nets *net) { /* * clear out any existing asconfs going out */ sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_2); stcb->asoc.asconf_seq_out_acked = stcb->asoc.asconf_seq_out; /* remove the old ASCONF on our outbound queue */ sctp_toss_old_asconf(stcb); } /* * cleanup any cached source addresses that may be topologically * incorrect after a new address has been added to this interface. 
*/ static void sctp_asconf_nets_cleanup(struct sctp_tcb *stcb, struct sctp_ifn *ifn) { struct sctp_nets *net; /* * Ideally, we want to only clear cached routes and source addresses * that are topologically incorrect. But since there is no easy way * to know whether the newly added address on the ifn would cause a * routing change (i.e. a new egress interface would be chosen) * without doing a new routing lookup and source address selection, * we will (for now) just flush any cached route using a different * ifn (and cached source addrs) and let output re-choose them * during the next send on that net. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* * clear any cached route (and cached source address) if the * route's interface is NOT the same as the address change. * If it's the same interface, just clear the cached source * address. */ if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro) && ((ifn == NULL) || (SCTP_GET_IF_INDEX_FROM_ROUTE(&net->ro) != ifn->ifn_index))) { /* clear any cached route */ RTFREE(net->ro.ro_rt); net->ro.ro_rt = NULL; } /* clear any cached source address */ if (net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; } } } void sctp_assoc_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *dstnet) { int error; if (dstnet->dest_state & SCTP_ADDR_UNCONFIRMED) { return; } if (stcb->asoc.deleted_primary == NULL) { return; } if (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "assoc_immediate_retrans: Deleted primary is "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.deleted_primary->ro._l_addr.sa); SCTPDBG(SCTP_DEBUG_ASCONF1, "Current Primary is "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.primary_destination->ro._l_addr.sa); sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, stcb->asoc.deleted_primary, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_3); stcb->asoc.num_send_timers_up--; if (stcb->asoc.num_send_timers_up < 0) { stcb->asoc.num_send_timers_up = 0; } SCTP_TCB_LOCK_ASSERT(stcb); error = sctp_t3rxt_timer(stcb->sctp_ep, stcb, stcb->asoc.deleted_primary); if (error) { SCTP_INP_DECR_REF(stcb->sctp_ep); return; } SCTP_TCB_LOCK_ASSERT(stcb); #ifdef SCTP_AUDITING_ENABLED sctp_auditing(4, stcb->sctp_ep, stcb, stcb->asoc.deleted_primary); #endif sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); if ((stcb->asoc.num_send_timers_up == 0) && (stcb->asoc.sent_queue_cnt > 0)) { struct sctp_tmit_chunk *chk; chk = TAILQ_FIRST(&stcb->asoc.sent_queue); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo); } } return; } static int sctp_asconf_queue_mgmt(struct sctp_tcb *, struct sctp_ifa *, uint16_t); void sctp_net_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_tmit_chunk *chk; SCTPDBG(SCTP_DEBUG_ASCONF1, "net_immediate_retrans: RTO is %d\n", net->RTO); sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_4); stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); net->error_count = 0; TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if (chk->whoTo == net) { if (chk->sent < SCTP_DATAGRAM_RESEND) { chk->sent = SCTP_DATAGRAM_RESEND; sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); sctp_flight_size_decrease(chk); sctp_total_flight_decrease(stcb, chk); net->marked_retrans++; stcb->asoc.marked_retrans++; } } } if (net->marked_retrans) { sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); } } static void sctp_path_check_and_react(struct 
sctp_tcb *stcb, struct sctp_ifa *newifa) { struct sctp_nets *net; int addrnum, changed; /* * If number of local valid addresses is 1, the valid address is * probably newly added address. Several valid addresses in this * association. A source address may not be changed. Additionally, * they can be configured on a same interface as "alias" addresses. * (by micchie) */ addrnum = sctp_local_addr_count(stcb); SCTPDBG(SCTP_DEBUG_ASCONF1, "p_check_react(): %d local addresses\n", addrnum); if (addrnum == 1) { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* clear any cached route and source address */ if (net->ro.ro_rt) { RTFREE(net->ro.ro_rt); net->ro.ro_rt = NULL; } if (net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; } /* Retransmit unacknowledged DATA chunks immediately */ if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_net_immediate_retrans(stcb, net); } /* also, SET PRIMARY is maybe already sent */ } return; } /* Multiple local addresses exsist in the association. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* clear any cached route and source address */ if (net->ro.ro_rt) { RTFREE(net->ro.ro_rt); net->ro.ro_rt = NULL; } if (net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; } /* * Check if the nexthop is corresponding to the new address. * If the new address is corresponding to the current * nexthop, the path will be changed. If the new address is * NOT corresponding to the current nexthop, the path will * not be changed. */ SCTP_RTALLOC((sctp_route_t *)&net->ro, stcb->sctp_ep->def_vrf_id, stcb->sctp_ep->fibnum); if (net->ro.ro_rt == NULL) continue; changed = 0; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: if (sctp_v4src_match_nexthop(newifa, (sctp_route_t *)&net->ro)) { changed = 1; } break; #endif #ifdef INET6 case AF_INET6: if (sctp_v6src_match_nexthop( &newifa->address.sin6, (sctp_route_t *)&net->ro)) { changed = 1; } break; #endif default: break; } /* * if the newly added address does not relate routing * information, we skip. */ if (changed == 0) continue; /* Retransmit unacknowledged DATA chunks immediately */ if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_net_immediate_retrans(stcb, net); } /* Send SET PRIMARY for this new address */ if (net == stcb->asoc.primary_destination) { (void)sctp_asconf_queue_mgmt(stcb, newifa, SCTP_SET_PRIM_ADDR); } } } /* * process an ADD/DELETE IP ack from peer. * addr: corresponding sctp_ifa to the address being added/deleted. * type: SCTP_ADD_IP_ADDRESS or SCTP_DEL_IP_ADDRESS. * flag: 1=success, 0=failure. */ static void sctp_asconf_addr_mgmt_ack(struct sctp_tcb *stcb, struct sctp_ifa *addr, uint32_t flag) { /* * do the necessary asoc list work- if we get a failure indication, * leave the address on the assoc's restricted list. If we get a * success indication, remove the address from the restricted list. */ /* * Note: this will only occur for ADD_IP_ADDRESS, since * DEL_IP_ADDRESS is never actually added to the list... 
*/ if (flag) { /* success case, so remove from the restricted list */ sctp_del_local_addr_restricted(stcb, addr); if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE) || sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_path_check_and_react(stcb, addr); return; } /* clear any cached/topologically incorrect source addresses */ sctp_asconf_nets_cleanup(stcb, addr->ifn_p); } /* else, leave it on the list */ } /* * add an asconf add/delete/set primary IP address parameter to the queue. * type = SCTP_ADD_IP_ADDRESS, SCTP_DEL_IP_ADDRESS, SCTP_SET_PRIM_ADDR. * returns 0 if queued, -1 if not queued/removed. * NOTE: if adding, but a delete for the same address is already scheduled * (and not yet sent out), simply remove it from queue. Same for deleting * an address already scheduled for add. If a duplicate operation is found, * ignore the new one. */ static int sctp_asconf_queue_mgmt(struct sctp_tcb *stcb, struct sctp_ifa *ifa, uint16_t type) { struct sctp_asconf_addr *aa, *aa_next; /* make sure the request isn't already in the queue */ TAILQ_FOREACH_SAFE(aa, &stcb->asoc.asconf_queue, next, aa_next) { /* address match? */ if (sctp_asconf_addr_match(aa, &ifa->address.sa) == 0) continue; /* * is the request already in queue but not sent? pass the * request already sent in order to resolve the following * case: 1. arrival of ADD, then sent 2. arrival of DEL. we * can't remove the ADD request already sent 3. arrival of * ADD */ if (aa->ap.aph.ph.param_type == type && aa->sent == 0) { return (-1); } /* is the negative request already in queue, and not sent */ if ((aa->sent == 0) && (type == SCTP_ADD_IP_ADDRESS) && (aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS)) { /* add requested, delete already queued */ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next); /* remove the ifa from the restricted list */ sctp_del_local_addr_restricted(stcb, ifa); /* free the asconf param */ SCTP_FREE(aa, SCTP_M_ASC_ADDR); SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: add removes queued entry\n"); return (-1); } if ((aa->sent == 0) && (type == SCTP_DEL_IP_ADDRESS) && (aa->ap.aph.ph.param_type == SCTP_ADD_IP_ADDRESS)) { /* delete requested, add already queued */ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next); /* remove the aa->ifa from the restricted list */ sctp_del_local_addr_restricted(stcb, aa->ifa); /* free the asconf param */ SCTP_FREE(aa, SCTP_M_ASC_ADDR); SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: delete removes queued entry\n"); return (-1); } } /* for each aa */ /* adding new request to the queue */ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); if (aa == NULL) { /* didn't get memory */ SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_queue_mgmt: failed to get memory!\n"); return (-1); } aa->special_del = 0; /* fill in asconf address parameter fields */ /* top level elements are "networked" during send */ aa->ap.aph.ph.param_type = type; aa->ifa = ifa; atomic_add_int(&ifa->refcount, 1); /* correlation_id filled in during send routine later... 
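* (the peer echoes the correlation id back in the ASCONF-ACK; sctp_asconf_find_param() uses it there to match the response to this queued request)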
*/ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = &ifa->address.sin6; aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv6addr_param)); aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv6addr_param); memcpy(&aa->ap.addrp.addr, &sin6->sin6_addr, sizeof(struct in6_addr)); break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = &ifa->address.sin; aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv4addr_param)); aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv4addr_param); memcpy(&aa->ap.addrp.addr, &sin->sin_addr, sizeof(struct in_addr)); break; } #endif default: /* invalid family! */ SCTP_FREE(aa, SCTP_M_ASC_ADDR); sctp_free_ifa(ifa); return (-1); } aa->sent = 0; /* clear sent flag */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); #ifdef SCTP_DEBUG if (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_ASCONF2) { if (type == SCTP_ADD_IP_ADDRESS) { SCTP_PRINTF("asconf_queue_mgmt: inserted asconf ADD_IP_ADDRESS: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, &ifa->address.sa); } else if (type == SCTP_DEL_IP_ADDRESS) { SCTP_PRINTF("asconf_queue_mgmt: appended asconf DEL_IP_ADDRESS: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, &ifa->address.sa); } else { SCTP_PRINTF("asconf_queue_mgmt: appended asconf SET_PRIM_ADDR: "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, &ifa->address.sa); } } #endif return (0); } /* * add an asconf operation for the given ifa and type. * type = SCTP_ADD_IP_ADDRESS, SCTP_DEL_IP_ADDRESS, SCTP_SET_PRIM_ADDR. * returns 0 if completed, -1 if not completed, 1 if immediate send is * advisable. */ static int sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa, uint16_t type) { uint32_t status; int pending_delete_queued = 0; int last; /* see if peer supports ASCONF */ if (stcb->asoc.asconf_supported == 0) { return (-1); } /* * if this is deleting the last address from the assoc, mark it as * pending. */ if ((type == SCTP_DEL_IP_ADDRESS) && !stcb->asoc.asconf_del_pending) { if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { last = (sctp_local_addr_count(stcb) == 0); } else { last = (sctp_local_addr_count(stcb) == 1); } if (last) { /* set the pending delete info only */ stcb->asoc.asconf_del_pending = 1; stcb->asoc.asconf_addr_del_pending = ifa; atomic_add_int(&ifa->refcount, 1); SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_add: mark delete last address pending\n"); return (-1); } } /* queue an asconf parameter */ status = sctp_asconf_queue_mgmt(stcb, ifa, type); /* * if this is an add, and there is a delete also pending (i.e. the * last local address is being changed), queue the pending delete * too. */ if ((type == SCTP_ADD_IP_ADDRESS) && stcb->asoc.asconf_del_pending && (status == 0)) { /* queue in the pending delete */ if (sctp_asconf_queue_mgmt(stcb, stcb->asoc.asconf_addr_del_pending, SCTP_DEL_IP_ADDRESS) == 0) { SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_add: queuing pending delete\n"); pending_delete_queued = 1; /* clear out the pending delete info */ stcb->asoc.asconf_del_pending = 0; sctp_free_ifa(stcb->asoc.asconf_addr_del_pending); stcb->asoc.asconf_addr_del_pending = NULL; } } if (pending_delete_queued) { struct sctp_nets *net; /* * since we know that the only/last address is now being * changed in this case, reset the cwnd/rto on all nets to * start as a new address and path. 
Also clear the error * counts to give the assoc the best chance to complete the * address change. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); net->RTO = 0; net->error_count = 0; } stcb->asoc.overall_error_count = 0; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_ASCONF, __LINE__); } /* queue in an advisory set primary too */ (void)sctp_asconf_queue_mgmt(stcb, ifa, SCTP_SET_PRIM_ADDR); /* let caller know we should send this out immediately */ status = 1; } return (status); } /*- * add an asconf delete IP address parameter to the queue by sockaddr and * possibly with no sctp_ifa available. This is only called by the routine * that checks the addresses in an INIT-ACK against the current address list. * returns 0 if completed, non-zero if not completed. * NOTE: if an add is already scheduled (and not yet sent out), simply * remove it from queue. If a duplicate operation is found, ignore the * new one. */ static int sctp_asconf_queue_sa_delete(struct sctp_tcb *stcb, struct sockaddr *sa) { struct sctp_ifa *ifa; struct sctp_asconf_addr *aa, *aa_next; if (stcb == NULL) { return (-1); } /* see if peer supports ASCONF */ if (stcb->asoc.asconf_supported == 0) { return (-1); } /* make sure the request isn't already in the queue */ TAILQ_FOREACH_SAFE(aa, &stcb->asoc.asconf_queue, next, aa_next) { /* address match? */ if (sctp_asconf_addr_match(aa, sa) == 0) continue; /* is the request already in queue (sent or not) */ if (aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS) { return (-1); } /* is the negative request already in queue, and not sent */ if (aa->sent == 1) continue; if (aa->ap.aph.ph.param_type == SCTP_ADD_IP_ADDRESS) { /* add already queued, so remove existing entry */ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next); sctp_del_local_addr_restricted(stcb, aa->ifa); /* free the entry */ SCTP_FREE(aa, SCTP_M_ASC_ADDR); return (-1); } } /* for each aa */ /* find any existing ifa-- NOTE ifa CAN be allowed to be NULL */ ifa = sctp_find_ifa_by_addr(sa, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); /* adding new request to the queue */ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); if (aa == NULL) { /* didn't get memory */ SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_queue_sa_delete: failed to get memory!\n"); return (-1); } aa->special_del = 0; /* fill in asconf address parameter fields */ /* top level elements are "networked" during send */ aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS; aa->ifa = ifa; if (ifa) atomic_add_int(&ifa->refcount, 1); /* correlation_id filled in during send routine later... 
*/ switch (sa->sa_family) { #ifdef INET6 case AF_INET6: { /* IPv6 address */ struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv6addr_param)); aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv6addr_param); memcpy(&aa->ap.addrp.addr, &sin6->sin6_addr, sizeof(struct in6_addr)); break; } #endif #ifdef INET case AF_INET: { /* IPv4 address */ struct sockaddr_in *sin = (struct sockaddr_in *)sa; aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv4addr_param)); aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv4addr_param); memcpy(&aa->ap.addrp.addr, &sin->sin_addr, sizeof(struct in_addr)); break; } #endif default: /* invalid family! */ SCTP_FREE(aa, SCTP_M_ASC_ADDR); if (ifa) sctp_free_ifa(ifa); return (-1); } aa->sent = 0; /* clear sent flag */ /* delete goes to the back of the queue */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); /* sa_ignore MEMLEAK {memory is put on the tailq} */ return (0); } /* * find a specific asconf param on our "sent" queue */ static struct sctp_asconf_addr * sctp_asconf_find_param(struct sctp_tcb *stcb, uint32_t correlation_id) { struct sctp_asconf_addr *aa; TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) { if (aa->ap.aph.correlation_id == correlation_id && aa->sent == 1) { /* found it */ return (aa); } } /* didn't find it */ return (NULL); } /* * process an SCTP_ERROR_CAUSE_IND for a ASCONF-ACK parameter and do * notifications based on the error response */ static void sctp_asconf_process_error(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_asconf_paramhdr *aph) { struct sctp_error_cause *eh; struct sctp_paramhdr *ph; uint16_t param_type; uint16_t error_code; eh = (struct sctp_error_cause *)(aph + 1); ph = (struct sctp_paramhdr *)(eh + 1); /* validate lengths */ if (htons(eh->length) + sizeof(struct sctp_error_cause) > htons(aph->ph.param_length)) { /* invalid error cause length */ SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_process_error: cause element too long\n"); return; } if (htons(ph->param_length) + sizeof(struct sctp_paramhdr) > htons(eh->length)) { /* invalid included TLV length */ SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_process_error: included TLV too long\n"); return; } /* which error code ? */ error_code = ntohs(eh->code); param_type = ntohs(aph->ph.param_type); /* FIX: this should go back up the REMOTE_ERROR ULP notify */ switch (error_code) { case SCTP_CAUSE_RESOURCE_SHORTAGE: /* we allow ourselves to "try again" for this error */ break; default: /* peer can't handle it... */ switch (param_type) { case SCTP_ADD_IP_ADDRESS: case SCTP_DEL_IP_ADDRESS: case SCTP_SET_PRIM_ADDR: break; default: break; } } } /* * process an asconf queue param. * aparam: parameter to process, will be removed from the queue. * flag: 1=success case, 0=failure case */ static void sctp_asconf_process_param_ack(struct sctp_tcb *stcb, struct sctp_asconf_addr *aparam, uint32_t flag) { uint16_t param_type; /* process this param */ param_type = aparam->ap.aph.ph.param_type; switch (param_type) { case SCTP_ADD_IP_ADDRESS: SCTPDBG(SCTP_DEBUG_ASCONF1, "process_param_ack: added IP address\n"); sctp_asconf_addr_mgmt_ack(stcb, aparam->ifa, flag); break; case SCTP_DEL_IP_ADDRESS: SCTPDBG(SCTP_DEBUG_ASCONF1, "process_param_ack: deleted IP address\n"); /* nothing really to do... 
lists already updated */ break; case SCTP_SET_PRIM_ADDR: SCTPDBG(SCTP_DEBUG_ASCONF1, "process_param_ack: set primary IP address\n"); /* nothing to do... peer may start using this addr */ break; default: /* should NEVER happen */ break; } /* remove the param and free it */ TAILQ_REMOVE(&stcb->asoc.asconf_queue, aparam, next); if (aparam->ifa) sctp_free_ifa(aparam->ifa); SCTP_FREE(aparam, SCTP_M_ASC_ADDR); } /* * cleanup from a bad asconf ack parameter */ static void sctp_asconf_ack_clear(struct sctp_tcb *stcb SCTP_UNUSED) { /* assume peer doesn't really know how to do asconfs */ /* XXX we could free the pending queue here */ } void sctp_handle_asconf_ack(struct mbuf *m, int offset, struct sctp_asconf_ack_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock) { struct sctp_association *asoc; uint32_t serial_num; uint16_t ack_length; struct sctp_asconf_paramhdr *aph; struct sctp_asconf_addr *aa, *aa_next; uint32_t last_error_id = 0; /* last error correlation id */ uint32_t id; struct sctp_asconf_addr *ap; /* asconf param buffer */ uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE]; /* verify minimum length */ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_asconf_ack_chunk)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: chunk too small = %xh\n", ntohs(cp->ch.chunk_length)); return; } asoc = &stcb->asoc; serial_num = ntohl(cp->serial_number); /* * NOTE: we may want to handle this differently- currently, we will * abort when we get an ack for the expected serial number + 1 (eg. * we didn't send it), process an ack normally if it is the expected * serial number, and re-send the previous ack for *ALL* other * serial numbers */ /* * if the serial number is the next expected, but I didn't send it, * abort the asoc, since someone probably just hijacked us... */ if (serial_num == (asoc->asconf_seq_out + 1)) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got unexpected next serial number! Aborting asoc!\n"); snprintf(msg, sizeof(msg), "Never sent serial number %8.8x", serial_num); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_no_unlock = 1; return; } if (serial_num != asoc->asconf_seq_out_acked + 1) { /* got a duplicate/unexpected ASCONF-ACK */ SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got duplicate/unexpected serial number = %xh (expected = %xh)\n", serial_num, asoc->asconf_seq_out_acked + 1); return; } if (serial_num == asoc->asconf_seq_out - 1) { /* stop our timer */ sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_5); } /* process the ASCONF-ACK contents */ ack_length = ntohs(cp->ch.chunk_length) - sizeof(struct sctp_asconf_ack_chunk); offset += sizeof(struct sctp_asconf_ack_chunk); /* process through all parameters */ while (ack_length >= sizeof(struct sctp_asconf_paramhdr)) { unsigned int param_length, param_type; /* get pointer to next asconf parameter */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), aparam_buf); if (aph == NULL) { /* can't get an asconf paramhdr */ sctp_asconf_ack_clear(stcb); return; } param_type = ntohs(aph->ph.param_type); param_length = ntohs(aph->ph.param_length); if (param_length > ack_length) { sctp_asconf_ack_clear(stcb); return; } if (param_length < sizeof(struct sctp_paramhdr)) { sctp_asconf_ack_clear(stcb); return; } /* get the complete parameter... 
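* (the sctp_m_getptr() above pulled in only the fixed-size asconf paramhdr; re-read param_length bytes so the full parameter, including any embedded error cause TLV, lands in aparam_buf)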
*/ if (param_length > sizeof(aparam_buf)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "param length (%u) larger than buffer size!\n", param_length); sctp_asconf_ack_clear(stcb); return; } aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, param_length, aparam_buf); if (aph == NULL) { sctp_asconf_ack_clear(stcb); return; } /* correlation_id is transparent to peer, no ntohl needed */ id = aph->correlation_id; switch (param_type) { case SCTP_ERROR_CAUSE_IND: last_error_id = id; /* find the corresponding asconf param in our queue */ ap = sctp_asconf_find_param(stcb, id); if (ap == NULL) { /* hmm... can't find this in our queue! */ break; } /* process the parameter, failed flag */ sctp_asconf_process_param_ack(stcb, ap, 0); /* process the error response */ sctp_asconf_process_error(stcb, aph); break; case SCTP_SUCCESS_REPORT: /* find the corresponding asconf param in our queue */ ap = sctp_asconf_find_param(stcb, id); if (ap == NULL) { /* hmm... can't find this in our queue! */ break; } /* process the parameter, success flag */ sctp_asconf_process_param_ack(stcb, ap, 1); break; default: break; } /* switch */ /* update remaining ASCONF-ACK message length to process */ ack_length -= SCTP_SIZE32(param_length); if (ack_length <= 0) { /* no more data in the mbuf chain */ break; } offset += SCTP_SIZE32(param_length); } /* while */ /* * if there are any "sent" params still on the queue, these are * implicitly "success", or "failed" (if we got an error back) ... * so process these appropriately * * we assume that the correlation_id's are monotonically increasing * beginning from 1 and that we don't have *that* many outstanding * at any given time */ if (last_error_id == 0) last_error_id--; /* set to "max" value */ TAILQ_FOREACH_SAFE(aa, &stcb->asoc.asconf_queue, next, aa_next) { if (aa->sent == 1) { /* * implicitly successful or failed if correlation_id * < last_error_id, then success else, failure */ if (aa->ap.aph.correlation_id < last_error_id) sctp_asconf_process_param_ack(stcb, aa, 1); else sctp_asconf_process_param_ack(stcb, aa, 0); } else { /* * since we always process in order (FIFO queue) if * we reach one that hasn't been sent, the rest * should not have been sent either. so, we're * done... 
*/ break; } } /* update the next sequence number to use */ asoc->asconf_seq_out_acked++; /* remove the old ASCONF on our outbound queue */ sctp_toss_old_asconf(stcb); if (!TAILQ_EMPTY(&stcb->asoc.asconf_queue)) { #ifdef SCTP_TIMER_BASED_ASCONF /* we have more params, so restart our timer */ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net); #else /* we have more params, so send out more */ sctp_send_asconf(stcb, net, SCTP_ADDR_NOT_LOCKED); #endif } } #ifdef INET6 static uint32_t sctp_is_scopeid_in_nets(struct sctp_tcb *stcb, struct sockaddr *sa) { struct sockaddr_in6 *sin6, *net6; struct sctp_nets *net; if (sa->sa_family != AF_INET6) { /* wrong family */ return (0); } sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) == 0) { /* not link local address */ return (0); } /* hunt through our destination nets list for this scope_id */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (((struct sockaddr *)(&net->ro._l_addr))->sa_family != AF_INET6) continue; net6 = (struct sockaddr_in6 *)&net->ro._l_addr; if (IN6_IS_ADDR_LINKLOCAL(&net6->sin6_addr) == 0) continue; if (sctp_is_same_scope(sin6, net6)) { /* found one */ return (1); } } /* didn't find one */ return (0); } #endif /* * address management functions */ static void sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_ifa *ifa, uint16_t type, int addr_locked) { int status; if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0 || sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { /* subset bound, no ASCONF allowed case, so ignore */ return; } /* * note: we know this is not the subset bound, no ASCONF case eg. * this is boundall or subset bound w/ASCONF allowed */ /* first, make sure that the address is IPv4 or IPv6 and not jailed */ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &ifa->address.sin6.sin6_addr) != 0) { return; } break; #endif #ifdef INET case AF_INET: if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &ifa->address.sin.sin_addr) != 0) { return; } break; #endif default: return; } #ifdef INET6 /* make sure we're "allowed" to add this type of addr */ if (ifa->address.sa.sa_family == AF_INET6) { /* invalid if we're not a v6 endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) return; /* is the v6 addr really valid ? */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { return; } } #endif /* put this address on the "pending/do not use yet" list */ sctp_add_local_addr_restricted(stcb, ifa); /* * check address scope if address is out of scope, don't queue * anything... note: this would leave the address on both inp and * asoc lists */ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = &ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* we skip unspecifed addresses */ return; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (stcb->asoc.scope.local_scope == 0) { return; } /* is it the right link local scope? 
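* (that is, only queue this link-local address if one of the association's destination nets shares its scope id; see sctp_is_scopeid_in_nets() above)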
*/ if (sctp_is_scopeid_in_nets(stcb, &ifa->address.sa) == 0) { return; } } if (stcb->asoc.scope.site_scope == 0 && IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { return; } break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sin; /* invalid if we are a v6 only endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) return; sin = &ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* we skip unspecifed addresses */ return; } if (stcb->asoc.scope.ipv4_local_scope == 0 && IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { return; } break; } #endif default: /* else, not AF_INET or AF_INET6, so skip */ return; } /* queue an asconf for this address add/delete */ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { /* does the peer do asconf? */ if (stcb->asoc.asconf_supported) { /* queue an asconf for this addr */ status = sctp_asconf_queue_add(stcb, ifa, type); /* * if queued ok, and in the open state, send out the * ASCONF. If in the non-open state, these will be * sent when the state goes open. */ if (status == 0 && ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED))) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, stcb->asoc.primary_destination); #else sctp_send_asconf(stcb, NULL, addr_locked); #endif } } } } int sctp_asconf_iterator_ep(struct sctp_inpcb *inp, void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_asconf_iterator *asc; struct sctp_ifa *ifa; struct sctp_laddr *l; int cnt_invalid = 0; asc = (struct sctp_asconf_iterator *)ptr; LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) { ifa = l->ifa; switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: /* invalid if we're not a v6 endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { cnt_invalid++; if (asc->cnt == cnt_invalid) return (1); } break; #endif #ifdef INET case AF_INET: { /* invalid if we are a v6 only endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { cnt_invalid++; if (asc->cnt == cnt_invalid) return (1); } break; } #endif default: /* invalid address family */ cnt_invalid++; if (asc->cnt == cnt_invalid) return (1); } } return (0); } static int sctp_asconf_iterator_ep_end(struct sctp_inpcb *inp, void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_ifa *ifa; struct sctp_asconf_iterator *asc; struct sctp_laddr *laddr, *nladdr, *l; /* Only for specific case not bound all */ asc = (struct sctp_asconf_iterator *)ptr; LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) { ifa = l->ifa; if (l->action == SCTP_ADD_IP_ADDRESS) { LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == ifa) { laddr->action = 0; break; } } } else if (l->action == SCTP_DEL_IP_ADDRESS) { LIST_FOREACH_SAFE(laddr, &inp->sctp_addr_list, sctp_nxt_addr, nladdr) { /* remove only after all guys are done */ if (laddr->ifa == ifa) { sctp_del_local_addr_ep(inp, ifa); } } } } return (0); } void sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_asconf_iterator *asc; struct sctp_ifa *ifa; struct sctp_laddr *l; int cnt_invalid = 0; int type, status; int num_queued = 0; asc = (struct sctp_asconf_iterator *)ptr; LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) { ifa = l->ifa; type = l->action; /* address's vrf_id must be the vrf_id of the assoc */ if (ifa->vrf_id != stcb->asoc.vrf_id) { continue; } /* Same checks again for assoc */ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: { /* 
invalid if we're not a v6 endpoint */ struct sockaddr_in6 *sin6; if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { cnt_invalid++; if (asc->cnt == cnt_invalid) return; else continue; } sin6 = &ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* we skip unspecifed addresses */ continue; } if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (stcb->asoc.scope.local_scope == 0) { continue; } /* is it the right link local scope? */ if (sctp_is_scopeid_in_nets(stcb, &ifa->address.sa) == 0) { continue; } } break; } #endif #ifdef INET case AF_INET: { /* invalid if we are a v6 only endpoint */ struct sockaddr_in *sin; /* invalid if we are a v6 only endpoint */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) continue; sin = &ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* we skip unspecifed addresses */ continue; } if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if (stcb->asoc.scope.ipv4_local_scope == 0 && IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { cnt_invalid++; if (asc->cnt == cnt_invalid) return; else continue; } break; } #endif default: /* invalid address family */ cnt_invalid++; if (asc->cnt == cnt_invalid) return; else continue; break; } if (type == SCTP_ADD_IP_ADDRESS) { /* prevent this address from being used as a source */ sctp_add_local_addr_restricted(stcb, ifa); } else if (type == SCTP_DEL_IP_ADDRESS) { struct sctp_nets *net; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sctp_rtentry_t *rt; /* delete this address if cached */ if (net->ro._s_addr == ifa) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; rt = net->ro.ro_rt; if (rt) { RTFREE(rt); net->ro.ro_rt = NULL; } /* * Now we deleted our src address, * should we not also now reset the * cwnd/rto to start as if its a new * address? */ stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); net->RTO = 0; } } } else if (type == SCTP_SET_PRIM_ADDR) { if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* must validate the ifa is in the ep */ if (sctp_is_addr_in_ep(stcb->sctp_ep, ifa) == 0) { continue; } } else { /* Need to check scopes for this guy */ if (sctp_is_address_in_scope(ifa, &stcb->asoc.scope, 0) == 0) { continue; } } } /* queue an asconf for this address add/delete */ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF) && stcb->asoc.asconf_supported == 1) { /* queue an asconf for this addr */ status = sctp_asconf_queue_add(stcb, ifa, type); /* * if queued ok, and in the open state, update the * count of queued params. If in the non-open * state, these get sent when the assoc goes open. */ if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { if (status >= 0) { num_queued++; } } } } /* * If we have queued params in the open state, send out an ASCONF. 
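* (for an association that is not yet established, the queued parameters are instead sent once it reaches the OPEN state)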
*/ if (num_queued > 0) { sctp_send_asconf(stcb, NULL, SCTP_ADDR_NOT_LOCKED); } } void sctp_asconf_iterator_end(void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_asconf_iterator *asc; struct sctp_ifa *ifa; struct sctp_laddr *l, *nl; asc = (struct sctp_asconf_iterator *)ptr; LIST_FOREACH_SAFE(l, &asc->list_of_work, sctp_nxt_addr, nl) { ifa = l->ifa; if (l->action == SCTP_ADD_IP_ADDRESS) { /* Clear the defer use flag */ ifa->localifa_flags &= ~SCTP_ADDR_DEFER_USE; } sctp_free_ifa(ifa); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), l); SCTP_DECR_LADDR_COUNT(); } SCTP_FREE(asc, SCTP_M_ASC_IT); } /* * sa is the sockaddr to ask the peer to set primary to. * returns: 0 = completed, -1 = error */ int32_t sctp_set_primary_ip_address_sa(struct sctp_tcb *stcb, struct sockaddr *sa) { uint32_t vrf_id; struct sctp_ifa *ifa; /* find the ifa for the desired set primary */ vrf_id = stcb->asoc.vrf_id; ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED); if (ifa == NULL) { /* Invalid address */ return (-1); } /* queue an ASCONF:SET_PRIM_ADDR to be sent */ if (!sctp_asconf_queue_add(stcb, ifa, SCTP_SET_PRIM_ADDR)) { /* set primary queuing succeeded */ SCTPDBG(SCTP_DEBUG_ASCONF1, "set_primary_ip_address_sa: queued on tcb=%p, ", (void *)stcb); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, stcb->asoc.primary_destination); #else sctp_send_asconf(stcb, NULL, SCTP_ADDR_NOT_LOCKED); #endif } } else { SCTPDBG(SCTP_DEBUG_ASCONF1, "set_primary_ip_address_sa: failed to add to queue on tcb=%p, ", (void *)stcb); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); return (-1); } return (0); } int sctp_is_addr_pending(struct sctp_tcb *stcb, struct sctp_ifa *sctp_ifa) { struct sctp_tmit_chunk *chk, *nchk; unsigned int offset, asconf_limit; struct sctp_asconf_chunk *acp; struct sctp_asconf_paramhdr *aph; uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE]; struct sctp_paramhdr *ph; int add_cnt, del_cnt; uint16_t last_param_type; add_cnt = del_cnt = 0; last_param_type = 0; TAILQ_FOREACH_SAFE(chk, &stcb->asoc.asconf_send_queue, sctp_next, nchk) { if (chk->data == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: No mbuf data?\n"); continue; } offset = 0; acp = mtod(chk->data, struct sctp_asconf_chunk *); offset += sizeof(struct sctp_asconf_chunk); asconf_limit = ntohs(acp->ch.chunk_length); ph = (struct sctp_paramhdr *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_paramhdr), aparam_buf); if (ph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: couldn't get lookup addr!\n"); continue; } offset += ntohs(ph->param_length); aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_asconf_paramhdr), aparam_buf); if (aph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: Empty ASCONF will be sent?\n"); continue; } while (aph != NULL) { unsigned int param_length, param_type; param_type = ntohs(aph->ph.param_type); param_length = ntohs(aph->ph.param_length); if (offset + param_length > asconf_limit) { /* parameter goes beyond end of chunk! 
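* (offset + param_length ran past asconf_limit, so the chunk is malformed or truncated; stop scanning it)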
*/ break; } if (param_length > sizeof(aparam_buf)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: param length (%u) larger than buffer size!\n", param_length); break; } if (param_length <= sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: param length(%u) too short\n", param_length); break; } aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, param_length, aparam_buf); if (aph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: couldn't get entire param\n"); break; } ph = (struct sctp_paramhdr *)(aph + 1); if (sctp_addr_match(ph, &sctp_ifa->address.sa) != 0) { switch (param_type) { case SCTP_ADD_IP_ADDRESS: add_cnt++; break; case SCTP_DEL_IP_ADDRESS: del_cnt++; break; default: break; } last_param_type = param_type; } offset += SCTP_SIZE32(param_length); if (offset >= asconf_limit) { /* no more data in the mbuf chain */ break; } /* get pointer to next asconf param */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_asconf_paramhdr), aparam_buf); } } /* * we want to find the sequences which consist of ADD -> DEL -> ADD * or DEL -> ADD */ if (add_cnt > del_cnt || (add_cnt == del_cnt && last_param_type == SCTP_ADD_IP_ADDRESS)) { return (1); } return (0); } static struct sockaddr * sctp_find_valid_localaddr(struct sctp_tcb *stcb, int addr_locked) { struct sctp_vrf *vrf = NULL; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; if (addr_locked == SCTP_ADDR_NOT_LOCKED) SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(stcb->asoc.vrf_id); if (vrf == NULL) { if (addr_locked == SCTP_ADDR_NOT_LOCKED) SCTP_IPI_ADDR_RUNLOCK(); return (NULL); } LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if (stcb->asoc.scope.loopback_scope == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* Skip if loopback_scope not set */ continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (stcb->asoc.scope.ipv4_addr_legal) { struct sockaddr_in *sin; sin = &sctp_ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* skip unspecifed addresses */ continue; } if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if (stcb->asoc.scope.ipv4_local_scope == 0 && IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) continue; if (sctp_is_addr_restricted(stcb, sctp_ifa) && (!sctp_is_addr_pending(stcb, sctp_ifa))) continue; /* * found a valid local v4 address to * use */ if (addr_locked == SCTP_ADDR_NOT_LOCKED) SCTP_IPI_ADDR_RUNLOCK(); return (&sctp_ifa->address.sa); } break; #endif #ifdef INET6 case AF_INET6: if (stcb->asoc.scope.ipv6_addr_legal) { struct sockaddr_in6 *sin6; if (sctp_ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { continue; } sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * we skip unspecifed * addresses */ continue; } if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (stcb->asoc.scope.local_scope == 0 && IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) continue; if (stcb->asoc.scope.site_scope == 0 && IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) continue; if (sctp_is_addr_restricted(stcb, sctp_ifa) && (!sctp_is_addr_pending(stcb, sctp_ifa))) continue; /* * found a valid local v6 address to * use */ if (addr_locked == SCTP_ADDR_NOT_LOCKED) SCTP_IPI_ADDR_RUNLOCK(); return (&sctp_ifa->address.sa); } break; #endif default: break; } } } /* no valid addresses found */ if (addr_locked == SCTP_ADDR_NOT_LOCKED) SCTP_IPI_ADDR_RUNLOCK(); return (NULL); } static struct 
sockaddr * sctp_find_valid_localaddr_ep(struct sctp_tcb *stcb) { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { continue; } /* is the address restricted ? */ if (sctp_is_addr_restricted(stcb, laddr->ifa) && (!sctp_is_addr_pending(stcb, laddr->ifa))) continue; /* found a valid local address to use */ return (&laddr->ifa->address.sa); } /* no valid addresses found */ return (NULL); } /* * builds an ASCONF chunk from queued ASCONF params. * returns NULL on error (no mbuf, no ASCONF params queued, etc). */ struct mbuf * sctp_compose_asconf(struct sctp_tcb *stcb, int *retlen, int addr_locked) { struct mbuf *m_asconf, *m_asconf_chk; struct sctp_asconf_addr *aa; struct sctp_asconf_chunk *acp; struct sctp_asconf_paramhdr *aph; struct sctp_asconf_addr_param *aap; uint32_t p_length; uint32_t correlation_id = 1; /* 0 is reserved... */ caddr_t ptr, lookup_ptr; uint8_t lookup_used = 0; /* are there any asconf params to send? */ TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) { if (aa->sent == 0) break; } if (aa == NULL) return (NULL); /* * get a chunk header mbuf and a cluster for the asconf params since * it's simpler to fill in the asconf chunk header lookup address on * the fly */ m_asconf_chk = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_chunk), 0, M_NOWAIT, 1, MT_DATA); if (m_asconf_chk == NULL) { /* no mbuf's */ SCTPDBG(SCTP_DEBUG_ASCONF1, "compose_asconf: couldn't get chunk mbuf!\n"); return (NULL); } m_asconf = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (m_asconf == NULL) { /* no mbuf's */ SCTPDBG(SCTP_DEBUG_ASCONF1, "compose_asconf: couldn't get mbuf!\n"); sctp_m_freem(m_asconf_chk); return (NULL); } SCTP_BUF_LEN(m_asconf_chk) = sizeof(struct sctp_asconf_chunk); SCTP_BUF_LEN(m_asconf) = 0; acp = mtod(m_asconf_chk, struct sctp_asconf_chunk *); memset(acp, 0, sizeof(struct sctp_asconf_chunk)); /* save pointers to lookup address and asconf params */ lookup_ptr = (caddr_t)(acp + 1); /* after the header */ ptr = mtod(m_asconf, caddr_t); /* beginning of cluster */ /* fill in chunk header info */ acp->ch.chunk_type = SCTP_ASCONF; acp->ch.chunk_flags = 0; acp->serial_number = htonl(stcb->asoc.asconf_seq_out); stcb->asoc.asconf_seq_out++; /* add parameters... up to smallest MTU allowed */ TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) { if (aa->sent) continue; /* get the parameter length */ p_length = SCTP_SIZE32(aa->ap.aph.ph.param_length); /* will it fit in current chunk? */ if ((SCTP_BUF_LEN(m_asconf) + p_length > stcb->asoc.smallest_mtu) || (SCTP_BUF_LEN(m_asconf) + p_length > MCLBYTES)) { /* won't fit, so we're done with this chunk */ break; } /* assign (and store) a correlation id */ aa->ap.aph.correlation_id = correlation_id++; /* * fill in address if we're doing a delete this is a simple * way for us to fill in the correlation address, which * should only be used by the peer if we're deleting our * source address and adding a new address (e.g. 
renumbering * case) */ if (lookup_used == 0 && (aa->special_del == 0) && aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS) { struct sctp_ipv6addr_param *lookup; uint16_t p_size, addr_size; lookup = (struct sctp_ipv6addr_param *)lookup_ptr; lookup->ph.param_type = htons(aa->ap.addrp.ph.param_type); if (aa->ap.addrp.ph.param_type == SCTP_IPV6_ADDRESS) { /* copy IPv6 address */ p_size = sizeof(struct sctp_ipv6addr_param); addr_size = sizeof(struct in6_addr); } else { /* copy IPv4 address */ p_size = sizeof(struct sctp_ipv4addr_param); addr_size = sizeof(struct in_addr); } lookup->ph.param_length = htons(SCTP_SIZE32(p_size)); memcpy(lookup->addr, &aa->ap.addrp.addr, addr_size); SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(p_size); lookup_used = 1; } /* copy into current space */ memcpy(ptr, &aa->ap, p_length); /* network elements and update lengths */ aph = (struct sctp_asconf_paramhdr *)ptr; aap = (struct sctp_asconf_addr_param *)ptr; /* correlation_id is transparent to peer, no htonl needed */ aph->ph.param_type = htons(aph->ph.param_type); aph->ph.param_length = htons(aph->ph.param_length); aap->addrp.ph.param_type = htons(aap->addrp.ph.param_type); aap->addrp.ph.param_length = htons(aap->addrp.ph.param_length); SCTP_BUF_LEN(m_asconf) += SCTP_SIZE32(p_length); ptr += SCTP_SIZE32(p_length); /* * these params are removed off the pending list upon * getting an ASCONF-ACK back from the peer, just set flag */ aa->sent = 1; } /* check to see if the lookup addr has been populated yet */ if (lookup_used == 0) { /* NOTE: if the address param is optional, can skip this... */ /* add any valid (existing) address... */ struct sctp_ipv6addr_param *lookup; uint16_t p_size, addr_size; struct sockaddr *found_addr; caddr_t addr_ptr; if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) found_addr = sctp_find_valid_localaddr(stcb, addr_locked); else found_addr = sctp_find_valid_localaddr_ep(stcb); lookup = (struct sctp_ipv6addr_param *)lookup_ptr; if (found_addr != NULL) { switch (found_addr->sa_family) { #ifdef INET6 case AF_INET6: /* copy IPv6 address */ lookup->ph.param_type = htons(SCTP_IPV6_ADDRESS); p_size = sizeof(struct sctp_ipv6addr_param); addr_size = sizeof(struct in6_addr); addr_ptr = (caddr_t)&((struct sockaddr_in6 *) found_addr)->sin6_addr; break; #endif #ifdef INET case AF_INET: /* copy IPv4 address */ lookup->ph.param_type = htons(SCTP_IPV4_ADDRESS); p_size = sizeof(struct sctp_ipv4addr_param); addr_size = sizeof(struct in_addr); addr_ptr = (caddr_t)&((struct sockaddr_in *) found_addr)->sin_addr; break; #endif default: p_size = 0; addr_size = 0; addr_ptr = NULL; break; } lookup->ph.param_length = htons(SCTP_SIZE32(p_size)); memcpy(lookup->addr, addr_ptr, addr_size); SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(p_size); } else { /* uh oh... don't have any address?? */ SCTPDBG(SCTP_DEBUG_ASCONF1, "compose_asconf: no lookup addr!\n"); /* XXX for now, we send a IPv4 address of 0.0.0.0 */ lookup->ph.param_type = htons(SCTP_IPV4_ADDRESS); lookup->ph.param_length = htons(SCTP_SIZE32(sizeof(struct sctp_ipv4addr_param))); memset(lookup->addr, 0, sizeof(struct in_addr)); SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(sizeof(struct sctp_ipv4addr_param)); } } /* chain it all together */ SCTP_BUF_NEXT(m_asconf_chk) = m_asconf; *retlen = SCTP_BUF_LEN(m_asconf_chk) + SCTP_BUF_LEN(m_asconf); acp->ch.chunk_length = htons(*retlen); return (m_asconf_chk); } /* * section to handle address changes before an association is up eg. 
changes * during INIT/INIT-ACK/COOKIE-ECHO handshake */ /* * processes the (local) addresses in the INIT-ACK chunk */ static void sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, unsigned int offset, unsigned int length) { struct sctp_paramhdr tmp_param, *ph; uint16_t plen, ptype; struct sctp_ifa *sctp_ifa; union sctp_sockstore store; #ifdef INET6 struct sctp_ipv6addr_param addr6_store; #endif #ifdef INET struct sctp_ipv4addr_param addr4_store; #endif SCTPDBG(SCTP_DEBUG_ASCONF2, "processing init-ack addresses\n"); if (stcb == NULL) /* Un-needed check for SA */ return; /* convert to upper bound */ length += offset; if ((offset + sizeof(struct sctp_paramhdr)) > length) { return; } /* go through the addresses in the init-ack */ ph = (struct sctp_paramhdr *) sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&tmp_param); while (ph != NULL) { ptype = ntohs(ph->param_type); plen = ntohs(ph->param_length); switch (ptype) { #ifdef INET6 case SCTP_IPV6_ADDRESS: { struct sctp_ipv6addr_param *a6p; /* get the entire IPv6 address param */ a6p = (struct sctp_ipv6addr_param *) sctp_m_getptr(m, offset, sizeof(struct sctp_ipv6addr_param), (uint8_t *)&addr6_store); if (plen != sizeof(struct sctp_ipv6addr_param) || a6p == NULL) { return; } memset(&store, 0, sizeof(union sctp_sockstore)); store.sin6.sin6_family = AF_INET6; store.sin6.sin6_len = sizeof(struct sockaddr_in6); store.sin6.sin6_port = stcb->rport; memcpy(&store.sin6.sin6_addr, a6p->addr, sizeof(struct in6_addr)); break; } #endif #ifdef INET case SCTP_IPV4_ADDRESS: { struct sctp_ipv4addr_param *a4p; /* get the entire IPv4 address param */ a4p = (struct sctp_ipv4addr_param *)sctp_m_getptr(m, offset, sizeof(struct sctp_ipv4addr_param), (uint8_t *)&addr4_store); if (plen != sizeof(struct sctp_ipv4addr_param) || a4p == NULL) { return; } memset(&store, 0, sizeof(union sctp_sockstore)); store.sin.sin_family = AF_INET; store.sin.sin_len = sizeof(struct sockaddr_in); store.sin.sin_port = stcb->rport; store.sin.sin_addr.s_addr = a4p->addr; break; } #endif default: goto next_addr; } /* see if this address really (still) exists */ sctp_ifa = sctp_find_ifa_by_addr(&store.sa, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); if (sctp_ifa == NULL) { /* address doesn't exist anymore */ int status; /* are ASCONFs allowed ? */ if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) && stcb->asoc.asconf_supported) { /* queue an ASCONF DEL_IP_ADDRESS */ status = sctp_asconf_queue_sa_delete(stcb, &store.sa); /* * if queued ok, and in correct state, send * out the ASCONF. */ if (status == 0 && SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, stcb->asoc.primary_destination); #else sctp_send_asconf(stcb, NULL, SCTP_ADDR_NOT_LOCKED); #endif } } } next_addr: /* * Sanity check: Make sure the length isn't 0, otherwise * we'll be stuck in this loop for a long time... 
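* (a zero-length parameter would keep offset from advancing, so bail out instead of spinning)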
*/ if (SCTP_SIZE32(plen) == 0) { SCTP_PRINTF("process_initack_addrs: bad len (%d) type=%xh\n", plen, ptype); return; } /* get next parameter */ offset += SCTP_SIZE32(plen); if ((offset + sizeof(struct sctp_paramhdr)) > length) return; ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&tmp_param); } /* while */ } /* FIX ME: need to verify return result for v6 address type if v6 disabled */ /* * checks to see if a specific address is in the initack address list returns * 1 if found, 0 if not */ static uint32_t sctp_addr_in_initack(struct mbuf *m, uint32_t offset, uint32_t length, struct sockaddr *sa) { struct sctp_paramhdr tmp_param, *ph; uint16_t plen, ptype; #ifdef INET struct sockaddr_in *sin; struct sctp_ipv4addr_param *a4p; struct sctp_ipv6addr_param addr4_store; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sctp_ipv6addr_param *a6p; struct sctp_ipv6addr_param addr6_store; struct sockaddr_in6 sin6_tmp; #endif switch (sa->sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: return (0); } SCTPDBG(SCTP_DEBUG_ASCONF2, "find_initack_addr: starting search for "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, sa); /* convert to upper bound */ length += offset; if ((offset + sizeof(struct sctp_paramhdr)) > length) { SCTPDBG(SCTP_DEBUG_ASCONF1, "find_initack_addr: invalid offset?\n"); return (0); } /* go through the addresses in the init-ack */ ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&tmp_param); while (ph != NULL) { ptype = ntohs(ph->param_type); plen = ntohs(ph->param_length); switch (ptype) { #ifdef INET6 case SCTP_IPV6_ADDRESS: if (sa->sa_family == AF_INET6) { /* get the entire IPv6 address param */ if (plen != sizeof(struct sctp_ipv6addr_param)) { break; } /* get the entire IPv6 address param */ a6p = (struct sctp_ipv6addr_param *) sctp_m_getptr(m, offset, sizeof(struct sctp_ipv6addr_param), (uint8_t *)&addr6_store); if (a6p == NULL) { return (0); } sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { /* create a copy and clear scope */ memcpy(&sin6_tmp, sin6, sizeof(struct sockaddr_in6)); sin6 = &sin6_tmp; in6_clearscope(&sin6->sin6_addr); } if (memcmp(&sin6->sin6_addr, a6p->addr, sizeof(struct in6_addr)) == 0) { /* found it */ return (1); } } break; #endif /* INET6 */ #ifdef INET case SCTP_IPV4_ADDRESS: if (sa->sa_family == AF_INET) { if (plen != sizeof(struct sctp_ipv4addr_param)) { break; } /* get the entire IPv4 address param */ a4p = (struct sctp_ipv4addr_param *) sctp_m_getptr(m, offset, sizeof(struct sctp_ipv4addr_param), (uint8_t *)&addr4_store); if (a4p == NULL) { return (0); } sin = (struct sockaddr_in *)sa; if (sin->sin_addr.s_addr == a4p->addr) { /* found it */ return (1); } } break; #endif default: break; } /* get next parameter */ offset += SCTP_SIZE32(plen); if (offset + sizeof(struct sctp_paramhdr) > length) { return (0); } ph = (struct sctp_paramhdr *) sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *)&tmp_param); } /* while */ /* not found! */ return (0); } /* * makes sure that the current endpoint local addr list is consistent with * the new association (eg. 
subset bound, asconf allowed) adds addresses as * necessary */ static void sctp_check_address_list_ep(struct sctp_tcb *stcb, struct mbuf *m, int offset, int length, struct sockaddr *init_addr) { struct sctp_laddr *laddr; /* go through the endpoint list */ LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { /* be paranoid and validate the laddr */ if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "check_addr_list_ep: laddr->ifa is NULL"); continue; } if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "check_addr_list_ep: laddr->ifa->ifa_addr is NULL"); continue; } /* do i have it implicitly? */ if (sctp_cmpaddr(&laddr->ifa->address.sa, init_addr)) { continue; } /* check to see if in the init-ack */ if (!sctp_addr_in_initack(m, offset, length, &laddr->ifa->address.sa)) { /* try to add it */ sctp_addr_mgmt_assoc(stcb->sctp_ep, stcb, laddr->ifa, SCTP_ADD_IP_ADDRESS, SCTP_ADDR_NOT_LOCKED); } } } /* * makes sure that the current kernel address list is consistent with the new * association (with all addrs bound) adds addresses as necessary */ static void sctp_check_address_list_all(struct sctp_tcb *stcb, struct mbuf *m, int offset, int length, struct sockaddr *init_addr, uint16_t local_scope, uint16_t site_scope, uint16_t ipv4_scope, uint16_t loopback_scope) { struct sctp_vrf *vrf = NULL; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; uint32_t vrf_id; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif if (stcb) { vrf_id = stcb->asoc.vrf_id; } else { return; } SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTP_IPI_ADDR_RUNLOCK(); return; } /* go through all our known interfaces */ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if (loopback_scope == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* skip loopback interface */ continue; } /* go through each interface address */ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { /* do i have it implicitly? 
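* (i.e. is this the address the INIT-ACK itself was sent from? the peer already has that one, so skip the init-ack list check for it)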
*/ if (sctp_cmpaddr(&sctp_ifa->address.sa, init_addr)) { continue; } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: sin = &sctp_ifa->address.sin; if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if ((ipv4_scope == 0) && (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { /* private address not in scope */ continue; } break; #endif #ifdef INET6 case AF_INET6: sin6 = &sctp_ifa->address.sin6; if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if ((local_scope == 0) && (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))) { continue; } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { continue; } break; #endif default: break; } /* check to see if in the init-ack */ if (!sctp_addr_in_initack(m, offset, length, &sctp_ifa->address.sa)) { /* try to add it */ sctp_addr_mgmt_assoc(stcb->sctp_ep, stcb, sctp_ifa, SCTP_ADD_IP_ADDRESS, SCTP_ADDR_LOCKED); } } /* end foreach ifa */ } /* end foreach ifn */ SCTP_IPI_ADDR_RUNLOCK(); } /* * validates an init-ack chunk (from a cookie-echo) with current addresses * adds addresses from the init-ack into our local address list, if needed * queues asconf adds/deletes addresses as needed and makes appropriate list * changes for source address selection m, offset: points to the start of the * address list in an init-ack chunk length: total length of the address * params only init_addr: address where my INIT-ACK was sent from */ void sctp_check_address_list(struct sctp_tcb *stcb, struct mbuf *m, int offset, int length, struct sockaddr *init_addr, uint16_t local_scope, uint16_t site_scope, uint16_t ipv4_scope, uint16_t loopback_scope) { /* process the local addresses in the initack */ sctp_process_initack_addresses(stcb, m, offset, length); if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* bound all case */ sctp_check_address_list_all(stcb, m, offset, length, init_addr, local_scope, site_scope, ipv4_scope, loopback_scope); } else { /* subset bound case */ if (sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) { /* asconf's allowed */ sctp_check_address_list_ep(stcb, m, offset, length, init_addr); } /* else, no asconfs allowed, so what we sent is what we get */ } } /* * sctp_bindx() support */ uint32_t sctp_addr_mgmt_ep_sa(struct sctp_inpcb *inp, struct sockaddr *sa, uint32_t type, uint32_t vrf_id, struct sctp_ifa *sctp_ifap) { struct sctp_ifa *ifa; struct sctp_laddr *laddr, *nladdr; if (sa->sa_len == 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EINVAL); return (EINVAL); } if (sctp_ifap) { ifa = sctp_ifap; } else if (type == SCTP_ADD_IP_ADDRESS) { /* For an add the address MUST be on the system */ ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED); } else if (type == SCTP_DEL_IP_ADDRESS) { /* For a delete we need to find it in the inp */ ifa = sctp_find_ifa_in_ep(inp, sa, SCTP_ADDR_NOT_LOCKED); } else { ifa = NULL; } if (ifa != NULL) { if (type == SCTP_ADD_IP_ADDRESS) { sctp_add_local_addr_ep(inp, ifa, type); } else if (type == SCTP_DEL_IP_ADDRESS) { if (inp->laddr_count < 2) { /* can't delete the last local address */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EINVAL); return (EINVAL); } LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (ifa == laddr->ifa) { /* Mark in the delete */ laddr->action = type; } } } if (LIST_EMPTY(&inp->sctp_asoc_list)) { /* * There is no need to start the iterator if the inp * has no associations. 
*/ if (type == SCTP_DEL_IP_ADDRESS) { LIST_FOREACH_SAFE(laddr, &inp->sctp_addr_list, sctp_nxt_addr, nladdr) { if (laddr->ifa == ifa) { sctp_del_local_addr_ep(inp, ifa); } } } } else { struct sctp_asconf_iterator *asc; struct sctp_laddr *wi; int ret; SCTP_MALLOC(asc, struct sctp_asconf_iterator *, sizeof(struct sctp_asconf_iterator), SCTP_M_ASC_IT); if (asc == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, ENOMEM); return (ENOMEM); } wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { SCTP_FREE(asc, SCTP_M_ASC_IT); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, ENOMEM); return (ENOMEM); } LIST_INIT(&asc->list_of_work); asc->cnt = 1; SCTP_INCR_LADDR_COUNT(); wi->ifa = ifa; wi->action = type; atomic_add_int(&ifa->refcount, 1); LIST_INSERT_HEAD(&asc->list_of_work, wi, sctp_nxt_addr); ret = sctp_initiate_iterator(sctp_asconf_iterator_ep, sctp_asconf_iterator_stcb, sctp_asconf_iterator_ep_end, SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES, SCTP_ASOC_ANY_STATE, (void *)asc, 0, sctp_asconf_iterator_end, inp, 0); if (ret) { SCTP_PRINTF("Failed to initiate iterator for addr_mgmt_ep_sa\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EFAULT); sctp_asconf_iterator_end(asc, 0); return (EFAULT); } } return (0); } else { /* invalid address! */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EADDRNOTAVAIL); return (EADDRNOTAVAIL); } } void sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_asconf_addr *aa; struct sctp_ifa *sctp_ifap; struct sctp_asconf_tag_param *vtag; #ifdef INET struct sockaddr_in *to; #endif #ifdef INET6 struct sockaddr_in6 *to6; #endif if (net == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: Missing net\n"); return; } if (stcb == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: Missing stcb\n"); return; } /* * Need to have in the asconf: - vtagparam(my_vtag/peer_vtag) - * add(0.0.0.0) - del(0.0.0.0) - Any global addresses add(addr) */ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); if (aa == NULL) { /* didn't get memory */ SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: failed to get memory!\n"); return; } aa->special_del = 0; /* fill in asconf address parameter fields */ /* top level elements are "networked" during send */ aa->ifa = NULL; aa->sent = 0; /* clear sent flag */ vtag = (struct sctp_asconf_tag_param *)&aa->ap.aph; vtag->aph.ph.param_type = SCTP_NAT_VTAGS; vtag->aph.ph.param_length = sizeof(struct sctp_asconf_tag_param); vtag->local_vtag = htonl(stcb->asoc.my_vtag); vtag->remote_vtag = htonl(stcb->asoc.peer_vtag); TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); if (aa == NULL) { /* didn't get memory */ SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: failed to get memory!\n"); return; } memset(aa, 0, sizeof(struct sctp_asconf_addr)); /* fill in asconf address parameter fields */ /* ADD(0.0.0.0) */ switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: aa->ap.aph.ph.param_type = SCTP_ADD_IP_ADDRESS; aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addrv4_param); aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv4addr_param); /* No need to add an address, we are using 0.0.0.0 */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); break; #endif #ifdef INET6 case AF_INET6: 
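/* IPv6 form of the wildcard ADD parameter; the address bytes stay all-zero from the memset above */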
aa->ap.aph.ph.param_type = SCTP_ADD_IP_ADDRESS; aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addr_param); aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv6addr_param); /* No need to add an address, we are using 0.0.0.0 */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); break; #endif default: SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: unknown address family\n"); SCTP_FREE(aa, SCTP_M_ASC_ADDR); return; } SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); if (aa == NULL) { /* didn't get memory */ SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: failed to get memory!\n"); return; } memset(aa, 0, sizeof(struct sctp_asconf_addr)); /* fill in asconf address parameter fields */ /* DEL(0.0.0.0) */ switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS; aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addrv4_param); aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv4addr_param); /* No need to add an address, we are using 0.0.0.0 */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); break; #endif #ifdef INET6 case AF_INET6: aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS; aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addr_param); aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv6addr_param); /* No need to add an address, we are using 0.0.0.0 */ TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); break; #endif default: SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: unknown address family\n"); SCTP_FREE(aa, SCTP_M_ASC_ADDR); return; } /* Now we must hunt the addresses and add all global addresses */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { struct sctp_vrf *vrf = NULL; struct sctp_ifn *sctp_ifnp; uint32_t vrf_id; vrf_id = stcb->sctp_ep->def_vrf_id; vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { goto skip_rest; } SCTP_IPI_ADDR_RLOCK(); LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) { LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) { switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: to = &sctp_ifap->address.sin; if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &to->sin_addr) != 0) { continue; } if (IN4_ISPRIVATE_ADDRESS(&to->sin_addr)) { continue; } if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) { continue; } break; #endif #ifdef INET6 case AF_INET6: to6 = &sctp_ifap->address.sin6; if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &to6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr)) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) { continue; } break; #endif default: continue; } sctp_asconf_queue_mgmt(stcb, sctp_ifap, SCTP_ADD_IP_ADDRESS); } } SCTP_IPI_ADDR_RUNLOCK(); } else { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) /* * Address being deleted by the system, don't * list. */ continue; if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* * Address being deleted on this ep don't * list.
*/ continue; } sctp_ifap = laddr->ifa; switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: to = &sctp_ifap->address.sin; if (IN4_ISPRIVATE_ADDRESS(&to->sin_addr)) { continue; } if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) { continue; } break; #endif #ifdef INET6 case AF_INET6: to6 = &sctp_ifap->address.sin6; if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr)) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) { continue; } break; #endif default: continue; } sctp_asconf_queue_mgmt(stcb, sctp_ifap, SCTP_ADD_IP_ADDRESS); } } skip_rest: /* Now we must send the asconf into the queue */ sctp_send_asconf(stcb, net, SCTP_ADDR_NOT_LOCKED); } Index: projects/clang900-import/sys/sys/elf_common.h =================================================================== --- projects/clang900-import/sys/sys/elf_common.h (revision 352586) +++ projects/clang900-import/sys/sys/elf_common.h (revision 352587) @@ -1,1457 +1,1463 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017, 2018 Dell EMC * Copyright (c) 2000, 2001, 2008, 2011, David E. O'Brien * Copyright (c) 1998 John D. Polstra. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_ELF_COMMON_H_ #define _SYS_ELF_COMMON_H_ 1 /* * ELF definitions that are independent of architecture or word size. */ /* * Note header. The ".note" section contains an array of notes. Each * begins with this header, aligned to a word boundary. Immediately * following the note header is n_namesz bytes of name, padded to the * next word boundary. Then comes n_descsz bytes of descriptor, again * padded to a word boundary. The values of n_namesz and n_descsz do * not include the padding. */ typedef struct { u_int32_t n_namesz; /* Length of name. */ u_int32_t n_descsz; /* Length of descriptor. */ u_int32_t n_type; /* Type of this note. */ } Elf_Note; typedef Elf_Note Elf_Nhdr; /* * Option kinds. 
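The Elf_Note header above fully determines the note layout: the name and descriptor follow the header, each padded out to the next word boundary, with the padding excluded from n_namesz and n_descsz. A minimal sketch (not part of the header, assuming the usual 4-byte note alignment, bounds checking omitted) of stepping from one note to the next:

	#include <stdint.h>

	typedef struct {		/* mirrors Elf_Note / Elf_Nhdr above */
		uint32_t n_namesz;
		uint32_t n_descsz;
		uint32_t n_type;
	} note_hdr;

	#define	NOTE_ROUNDUP(x)	(((x) + 3u) & ~3u)	/* pad to the next word boundary */

	/* Return a pointer to the note following "nh". */
	static const note_hdr *
	next_note(const note_hdr *nh)
	{
		const char *p = (const char *)(nh + 1);	/* name starts after the header */

		p += NOTE_ROUNDUP(nh->n_namesz);	/* padded name */
		p += NOTE_ROUNDUP(nh->n_descsz);	/* padded descriptor */
		return ((const note_hdr *)p);
	}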
*/ #define ODK_NULL 0 /* undefined */ #define ODK_REGINFO 1 /* register usage info */ #define ODK_EXCEPTIONS 2 /* exception processing info */ #define ODK_PAD 3 /* section padding */ #define ODK_HWPATCH 4 /* hardware patch applied */ #define ODK_FILL 5 /* fill value used by the linker */ #define ODK_TAGS 6 /* reserved space for tools */ #define ODK_HWAND 7 /* hardware AND patch applied */ #define ODK_HWOR 8 /* hardware OR patch applied */ #define ODK_GP_GROUP 9 /* GP group for text/data sections */ #define ODK_IDENT 10 /* ID information */ #define ODK_PAGESIZE 11 /* page size information */ /* * ODK_EXCEPTIONS info field masks. */ #define OEX_FPU_MIN 0x0000001f /* min FPU exception required */ #define OEX_FPU_MAX 0x00001f00 /* max FPU exception allowed */ #define OEX_PAGE0 0x00010000 /* page zero must be mapped */ #define OEX_SMM 0x00020000 /* run in sequential memory mode */ #define OEX_PRECISEFP 0x00040000 /* run in precise FP exception mode */ #define OEX_DISMISS 0x00080000 /* dismiss invalid address traps */ /* * ODK_PAD info field masks. */ #define OPAD_PREFIX 0x0001 #define OPAD_POSTFIX 0x0002 #define OPAD_SYMBOL 0x0004 /* * ODK_HWPATCH info field masks. */ #define OHW_R4KEOP 0x00000001 /* patch for R4000 branch at end-of-page bug */ #define OHW_R8KPFETCH 0x00000002 /* R8000 prefetch bug may occur */ #define OHW_R5KEOP 0x00000004 /* patch for R5000 branch at end-of-page bug */ #define OHW_R5KCVTL 0x00000008 /* R5000 cvt.[ds].l bug: clean == 1 */ #define OHW_R10KLDL 0x00000010UL /* need patch for R10000 misaligned load */ /* * ODK_HWAND/ODK_HWOR info field and hwp_flags[12] masks. */ #define OHWA0_R4KEOP_CHECKED 0x00000001 /* object checked for R4000 end-of-page bug */ #define OHWA0_R4KEOP_CLEAN 0x00000002 /* object verified clean for R4000 end-of-page bug */ #define OHWO0_FIXADE 0x00000001 /* object requires call to fixade */ /* * ODK_IDENT/ODK_GP_GROUP info field masks. */ #define OGP_GROUP 0x0000ffff /* GP group number */ #define OGP_SELF 0x00010000 /* GP group is self-contained */ /* * The header for GNU-style hash sections. */ typedef struct { u_int32_t gh_nbuckets; /* Number of hash buckets. */ u_int32_t gh_symndx; /* First visible symbol in .dynsym. */ u_int32_t gh_maskwords; /* #maskwords used in bloom filter. */ u_int32_t gh_shift2; /* Bloom filter shift count. */ } Elf_GNU_Hash_Header; /* Indexes into the e_ident array. Keep synced with http://www.sco.com/developers/gabi/latest/ch4.eheader.html */ #define EI_MAG0 0 /* Magic number, byte 0. */ #define EI_MAG1 1 /* Magic number, byte 1. */ #define EI_MAG2 2 /* Magic number, byte 2. */ #define EI_MAG3 3 /* Magic number, byte 3. */ #define EI_CLASS 4 /* Class of machine. */ #define EI_DATA 5 /* Data format. */ #define EI_VERSION 6 /* ELF format version. */ #define EI_OSABI 7 /* Operating system / ABI identification */ #define EI_ABIVERSION 8 /* ABI version */ #define OLD_EI_BRAND 8 /* Start of architecture identification. */ #define EI_PAD 9 /* Start of padding (per SVR4 ABI). */ #define EI_NIDENT 16 /* Size of e_ident array. */ /* Values for the magic number bytes. */ #define ELFMAG0 0x7f #define ELFMAG1 'E' #define ELFMAG2 'L' #define ELFMAG3 'F' #define ELFMAG "\177ELF" /* magic string */ #define SELFMAG 4 /* magic string size */ /* Values for e_ident[EI_VERSION] and e_version. */ #define EV_NONE 0 #define EV_CURRENT 1 /* Values for e_ident[EI_CLASS]. */ #define ELFCLASSNONE 0 /* Unknown class. */ #define ELFCLASS32 1 /* 32-bit architecture. */ #define ELFCLASS64 2 /* 64-bit architecture. */ /* Values for e_ident[EI_DATA]. 
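Elf_GNU_Hash_Header above only describes the section header itself; the symbol names referenced by .gnu.hash sections are hashed with the DJB-derived function conventionally used by linkers and runtime loaders. Shown here purely for context, as an assumption about the consumer rather than anything defined in this file:

	#include <stdint.h>

	/* Conventional GNU symbol hash: h = h * 33 + c, seeded with 5381. */
	static uint32_t
	gnu_hash(const char *name)
	{
		uint32_t h = 5381;

		while (*name != '\0')
			h = h * 33 + (unsigned char)*name++;
		return (h);
	}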
*/ #define ELFDATANONE 0 /* Unknown data format. */ #define ELFDATA2LSB 1 /* 2's complement little-endian. */ #define ELFDATA2MSB 2 /* 2's complement big-endian. */ /* Values for e_ident[EI_OSABI]. */ #define ELFOSABI_NONE 0 /* UNIX System V ABI */ #define ELFOSABI_HPUX 1 /* HP-UX operating system */ #define ELFOSABI_NETBSD 2 /* NetBSD */ #define ELFOSABI_LINUX 3 /* GNU/Linux */ #define ELFOSABI_HURD 4 /* GNU/Hurd */ #define ELFOSABI_86OPEN 5 /* 86Open common IA32 ABI */ #define ELFOSABI_SOLARIS 6 /* Solaris */ #define ELFOSABI_AIX 7 /* AIX */ #define ELFOSABI_IRIX 8 /* IRIX */ #define ELFOSABI_FREEBSD 9 /* FreeBSD */ #define ELFOSABI_TRU64 10 /* TRU64 UNIX */ #define ELFOSABI_MODESTO 11 /* Novell Modesto */ #define ELFOSABI_OPENBSD 12 /* OpenBSD */ #define ELFOSABI_OPENVMS 13 /* Open VMS */ #define ELFOSABI_NSK 14 /* HP Non-Stop Kernel */ #define ELFOSABI_AROS 15 /* Amiga Research OS */ #define ELFOSABI_FENIXOS 16 /* FenixOS */ #define ELFOSABI_CLOUDABI 17 /* Nuxi CloudABI */ #define ELFOSABI_ARM_AEABI 64 /* ARM EABI */ #define ELFOSABI_ARM 97 /* ARM */ #define ELFOSABI_STANDALONE 255 /* Standalone (embedded) application */ #define ELFOSABI_SYSV ELFOSABI_NONE /* symbol used in old spec */ #define ELFOSABI_MONTEREY ELFOSABI_AIX /* Monterey */ #define ELFOSABI_GNU ELFOSABI_LINUX /* e_ident */ #define IS_ELF(ehdr) ((ehdr).e_ident[EI_MAG0] == ELFMAG0 && \ (ehdr).e_ident[EI_MAG1] == ELFMAG1 && \ (ehdr).e_ident[EI_MAG2] == ELFMAG2 && \ (ehdr).e_ident[EI_MAG3] == ELFMAG3) /* Values for e_type. */ #define ET_NONE 0 /* Unknown type. */ #define ET_REL 1 /* Relocatable. */ #define ET_EXEC 2 /* Executable. */ #define ET_DYN 3 /* Shared object. */ #define ET_CORE 4 /* Core file. */ #define ET_LOOS 0xfe00 /* First operating system specific. */ #define ET_HIOS 0xfeff /* Last operating system-specific. */ #define ET_LOPROC 0xff00 /* First processor-specific. */ #define ET_HIPROC 0xffff /* Last processor-specific. */ /* Values for e_machine. */ #define EM_NONE 0 /* Unknown machine. */ #define EM_M32 1 /* AT&T WE32100. */ #define EM_SPARC 2 /* Sun SPARC. */ #define EM_386 3 /* Intel i386. */ #define EM_68K 4 /* Motorola 68000. */ #define EM_88K 5 /* Motorola 88000. */ #define EM_IAMCU 6 /* Intel MCU. */ #define EM_860 7 /* Intel i860. */ #define EM_MIPS 8 /* MIPS R3000 Big-Endian only. */ #define EM_S370 9 /* IBM System/370. */ #define EM_MIPS_RS3_LE 10 /* MIPS R3000 Little-Endian. */ #define EM_PARISC 15 /* HP PA-RISC. */ #define EM_VPP500 17 /* Fujitsu VPP500. */ #define EM_SPARC32PLUS 18 /* SPARC v8plus. */ #define EM_960 19 /* Intel 80960. */ #define EM_PPC 20 /* PowerPC 32-bit. */ #define EM_PPC64 21 /* PowerPC 64-bit. */ #define EM_S390 22 /* IBM System/390. */ #define EM_V800 36 /* NEC V800. */ #define EM_FR20 37 /* Fujitsu FR20. */ #define EM_RH32 38 /* TRW RH-32. */ #define EM_RCE 39 /* Motorola RCE. */ #define EM_ARM 40 /* ARM. */ #define EM_SH 42 /* Hitachi SH. */ #define EM_SPARCV9 43 /* SPARC v9 64-bit. */ #define EM_TRICORE 44 /* Siemens TriCore embedded processor. */ #define EM_ARC 45 /* Argonaut RISC Core. */ #define EM_H8_300 46 /* Hitachi H8/300. */ #define EM_H8_300H 47 /* Hitachi H8/300H. */ #define EM_H8S 48 /* Hitachi H8S. */ #define EM_H8_500 49 /* Hitachi H8/500. */ #define EM_IA_64 50 /* Intel IA-64 Processor. */ #define EM_MIPS_X 51 /* Stanford MIPS-X. */ #define EM_COLDFIRE 52 /* Motorola ColdFire. */ #define EM_68HC12 53 /* Motorola M68HC12. */ #define EM_MMA 54 /* Fujitsu MMA. */ #define EM_PCP 55 /* Siemens PCP. */ #define EM_NCPU 56 /* Sony nCPU. 
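A hedged illustration of how the e_ident indexes and the IS_ELF macro above are typically combined to sanity-check a file header; Elf64_Ehdr comes from <elf.h> (sys/elf64.h), not from this file, and the class/byte-order expectations are this example's assumption:

	#include <elf.h>
	#include <stdbool.h>

	static bool
	is_native_freebsd_object(const Elf64_Ehdr *eh)
	{
		if (!IS_ELF(*eh))
			return (false);		/* magic bytes 0x7f 'E' 'L' 'F' */
		if (eh->e_ident[EI_CLASS] != ELFCLASS64 ||
		    eh->e_ident[EI_DATA] != ELFDATA2LSB)
			return (false);		/* expect 64-bit little-endian here */
		return (eh->e_ident[EI_OSABI] == ELFOSABI_FREEBSD ||
		    eh->e_ident[EI_OSABI] == ELFOSABI_NONE);
	}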
*/ #define EM_NDR1 57 /* Denso NDR1 microprocessor. */ #define EM_STARCORE 58 /* Motorola Star*Core processor. */ #define EM_ME16 59 /* Toyota ME16 processor. */ #define EM_ST100 60 /* STMicroelectronics ST100 processor. */ #define EM_TINYJ 61 /* Advanced Logic Corp. TinyJ processor. */ #define EM_X86_64 62 /* Advanced Micro Devices x86-64 */ #define EM_AMD64 EM_X86_64 /* Advanced Micro Devices x86-64 (compat) */ #define EM_PDSP 63 /* Sony DSP Processor. */ #define EM_FX66 66 /* Siemens FX66 microcontroller. */ #define EM_ST9PLUS 67 /* STMicroelectronics ST9+ 8/16 microcontroller. */ #define EM_ST7 68 /* STmicroelectronics ST7 8-bit microcontroller. */ #define EM_68HC16 69 /* Motorola MC68HC16 microcontroller. */ #define EM_68HC11 70 /* Motorola MC68HC11 microcontroller. */ #define EM_68HC08 71 /* Motorola MC68HC08 microcontroller. */ #define EM_68HC05 72 /* Motorola MC68HC05 microcontroller. */ #define EM_SVX 73 /* Silicon Graphics SVx. */ #define EM_ST19 74 /* STMicroelectronics ST19 8-bit mc. */ #define EM_VAX 75 /* Digital VAX. */ #define EM_CRIS 76 /* Axis Communications 32-bit embedded processor. */ #define EM_JAVELIN 77 /* Infineon Technologies 32-bit embedded processor. */ #define EM_FIREPATH 78 /* Element 14 64-bit DSP Processor. */ #define EM_ZSP 79 /* LSI Logic 16-bit DSP Processor. */ #define EM_MMIX 80 /* Donald Knuth's educational 64-bit proc. */ #define EM_HUANY 81 /* Harvard University machine-independent object files. */ #define EM_PRISM 82 /* SiTera Prism. */ #define EM_AVR 83 /* Atmel AVR 8-bit microcontroller. */ #define EM_FR30 84 /* Fujitsu FR30. */ #define EM_D10V 85 /* Mitsubishi D10V. */ #define EM_D30V 86 /* Mitsubishi D30V. */ #define EM_V850 87 /* NEC v850. */ #define EM_M32R 88 /* Mitsubishi M32R. */ #define EM_MN10300 89 /* Matsushita MN10300. */ #define EM_MN10200 90 /* Matsushita MN10200. */ #define EM_PJ 91 /* picoJava. */ #define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor. */ #define EM_ARC_A5 93 /* ARC Cores Tangent-A5. */ #define EM_XTENSA 94 /* Tensilica Xtensa Architecture. */ #define EM_VIDEOCORE 95 /* Alphamosaic VideoCore processor. */ #define EM_TMM_GPP 96 /* Thompson Multimedia General Purpose Processor. */ #define EM_NS32K 97 /* National Semiconductor 32000 series. */ #define EM_TPC 98 /* Tenor Network TPC processor. */ #define EM_SNP1K 99 /* Trebia SNP 1000 processor. */ #define EM_ST200 100 /* STMicroelectronics ST200 microcontroller. */ #define EM_IP2K 101 /* Ubicom IP2xxx microcontroller family. */ #define EM_MAX 102 /* MAX Processor. */ #define EM_CR 103 /* National Semiconductor CompactRISC microprocessor. */ #define EM_F2MC16 104 /* Fujitsu F2MC16. */ #define EM_MSP430 105 /* Texas Instruments embedded microcontroller msp430. */ #define EM_BLACKFIN 106 /* Analog Devices Blackfin (DSP) processor. */ #define EM_SE_C33 107 /* S1C33 Family of Seiko Epson processors. */ #define EM_SEP 108 /* Sharp embedded microprocessor. */ #define EM_ARCA 109 /* Arca RISC Microprocessor. */ #define EM_UNICORE 110 /* Microprocessor series from PKU-Unity Ltd. and MPRC of Peking University */ #define EM_AARCH64 183 /* AArch64 (64-bit ARM) */ #define EM_RISCV 243 /* RISC-V */ /* Non-standard or deprecated. */ #define EM_486 6 /* Intel i486. */ #define EM_MIPS_RS4_BE 10 /* MIPS R4000 Big-Endian */ #define EM_ALPHA_STD 41 /* Digital Alpha (standard value). 
*/ #define EM_ALPHA 0x9026 /* Alpha (written in the absence of an ABI) */ /** * e_flags */ #define EF_ARM_RELEXEC 0x1 #define EF_ARM_HASENTRY 0x2 #define EF_ARM_SYMSARESORTED 0x4 #define EF_ARM_DYNSYMSUSESEGIDX 0x8 #define EF_ARM_MAPSYMSFIRST 0x10 #define EF_ARM_LE8 0x00400000 #define EF_ARM_BE8 0x00800000 #define EF_ARM_EABIMASK 0xFF000000 #define EF_ARM_EABI_UNKNOWN 0x00000000 #define EF_ARM_EABI_VER1 0x01000000 #define EF_ARM_EABI_VER2 0x02000000 #define EF_ARM_EABI_VER3 0x03000000 #define EF_ARM_EABI_VER4 0x04000000 #define EF_ARM_EABI_VER5 0x05000000 #define EF_ARM_INTERWORK 0x00000004 #define EF_ARM_APCS_26 0x00000008 #define EF_ARM_APCS_FLOAT 0x00000010 #define EF_ARM_PIC 0x00000020 #define EF_ARM_ALIGN8 0x00000040 #define EF_ARM_NEW_ABI 0x00000080 #define EF_ARM_OLD_ABI 0x00000100 #define EF_ARM_ABI_FLOAT_SOFT 0x00000200 #define EF_ARM_SOFT_FLOAT EF_ARM_ABI_FLOAT_SOFT /* Pre-V5 ABI name */ #define EF_ARM_ABI_FLOAT_HARD 0x00000400 #define EF_ARM_VFP_FLOAT EF_ARM_ABI_FLOAT_HARD /* Pre-V5 ABI name */ #define EF_ARM_MAVERICK_FLOAT 0x00000800 #define EF_MIPS_NOREORDER 0x00000001 #define EF_MIPS_PIC 0x00000002 /* Contains PIC code */ #define EF_MIPS_CPIC 0x00000004 /* STD PIC calling sequence */ #define EF_MIPS_UCODE 0x00000010 #define EF_MIPS_ABI2 0x00000020 /* N32 */ #define EF_MIPS_OPTIONS_FIRST 0x00000080 #define EF_MIPS_ABI 0x0000F000 #define EF_MIPS_ABI_O32 0x00001000 #define EF_MIPS_ABI_O64 0x00002000 #define EF_MIPS_ABI_EABI32 0x00003000 #define EF_MIPS_ABI_EABI64 0x00004000 #define EF_MIPS_ARCH_ASE 0x0F000000 /* Architectural extensions */ #define EF_MIPS_ARCH_ASE_MDMX 0x08000000 /* MDMX multimedia extension */ #define EF_MIPS_ARCH_ASE_M16 0x04000000 /* MIPS-16 ISA extensions */ #define EF_MIPS_ARCH 0xF0000000 /* Architecture field */ #define EF_MIPS_ARCH_1 0x00000000 /* -mips1 code */ #define EF_MIPS_ARCH_2 0x10000000 /* -mips2 code */ #define EF_MIPS_ARCH_3 0x20000000 /* -mips3 code */ #define EF_MIPS_ARCH_4 0x30000000 /* -mips4 code */ #define EF_MIPS_ARCH_5 0x40000000 /* -mips5 code */ #define EF_MIPS_ARCH_32 0x50000000 /* -mips32 code */ #define EF_MIPS_ARCH_64 0x60000000 /* -mips64 code */ #define EF_MIPS_ARCH_32R2 0x70000000 /* -mips32r2 code */ #define EF_MIPS_ARCH_64R2 0x80000000 /* -mips64r2 code */ #define EF_PPC_EMB 0x80000000 #define EF_PPC_RELOCATABLE 0x00010000 #define EF_PPC_RELOCATABLE_LIB 0x00008000 #define EF_RISCV_RVC 0x00000001 #define EF_RISCV_FLOAT_ABI_MASK 0x00000006 #define EF_RISCV_FLOAT_ABI_SOFT 0x00000000 #define EF_RISCV_FLOAT_ABI_SINGLE 0x000002 #define EF_RISCV_FLOAT_ABI_DOUBLE 0x000004 #define EF_RISCV_FLOAT_ABI_QUAD 0x00000006 #define EF_RISCV_RVE 0x00000008 #define EF_RISCV_TSO 0x00000010 #define EF_SPARC_EXT_MASK 0x00ffff00 #define EF_SPARC_32PLUS 0x00000100 #define EF_SPARC_SUN_US1 0x00000200 #define EF_SPARC_HAL_R1 0x00000200 #define EF_SPARC_SUN_US3 0x00000800 #define EF_SPARCV9_MM 0x00000003 #define EF_SPARCV9_TSO 0x00000000 #define EF_SPARCV9_PSO 0x00000001 #define EF_SPARCV9_RMO 0x00000002 /* Special section indexes. */ #define SHN_UNDEF 0 /* Undefined, missing, irrelevant. */ #define SHN_LORESERVE 0xff00 /* First of reserved range. */ #define SHN_LOPROC 0xff00 /* First processor-specific. */ #define SHN_HIPROC 0xff1f /* Last processor-specific. */ #define SHN_LOOS 0xff20 /* First operating system-specific. */ #define SHN_FBSD_CACHED SHN_LOOS /* Transient, for sys/kern/link_elf_obj linker only: Cached global in local symtab. */ #define SHN_HIOS 0xff3f /* Last operating system-specific. */ #define SHN_ABS 0xfff1 /* Absolute values. 
*/ #define SHN_COMMON 0xfff2 /* Common data. */ #define SHN_XINDEX 0xffff /* Escape -- index stored elsewhere. */ #define SHN_HIRESERVE 0xffff /* Last of reserved range. */ /* sh_type */ #define SHT_NULL 0 /* inactive */ #define SHT_PROGBITS 1 /* program defined information */ #define SHT_SYMTAB 2 /* symbol table section */ #define SHT_STRTAB 3 /* string table section */ #define SHT_RELA 4 /* relocation section with addends */ #define SHT_HASH 5 /* symbol hash table section */ #define SHT_DYNAMIC 6 /* dynamic section */ #define SHT_NOTE 7 /* note section */ #define SHT_NOBITS 8 /* no space section */ #define SHT_REL 9 /* relocation section - no addends */ #define SHT_SHLIB 10 /* reserved - purpose unknown */ #define SHT_DYNSYM 11 /* dynamic symbol table section */ #define SHT_INIT_ARRAY 14 /* Initialization function pointers. */ #define SHT_FINI_ARRAY 15 /* Termination function pointers. */ #define SHT_PREINIT_ARRAY 16 /* Pre-initialization function ptrs. */ #define SHT_GROUP 17 /* Section group. */ #define SHT_SYMTAB_SHNDX 18 /* Section indexes (see SHN_XINDEX). */ #define SHT_LOOS 0x60000000 /* First of OS specific semantics */ #define SHT_LOSUNW 0x6ffffff4 #define SHT_SUNW_dof 0x6ffffff4 #define SHT_SUNW_cap 0x6ffffff5 #define SHT_GNU_ATTRIBUTES 0x6ffffff5 #define SHT_SUNW_SIGNATURE 0x6ffffff6 #define SHT_GNU_HASH 0x6ffffff6 #define SHT_GNU_LIBLIST 0x6ffffff7 #define SHT_SUNW_ANNOTATE 0x6ffffff7 #define SHT_SUNW_DEBUGSTR 0x6ffffff8 #define SHT_SUNW_DEBUG 0x6ffffff9 #define SHT_SUNW_move 0x6ffffffa #define SHT_SUNW_COMDAT 0x6ffffffb #define SHT_SUNW_syminfo 0x6ffffffc #define SHT_SUNW_verdef 0x6ffffffd #define SHT_GNU_verdef 0x6ffffffd /* Symbol versions provided */ #define SHT_SUNW_verneed 0x6ffffffe #define SHT_GNU_verneed 0x6ffffffe /* Symbol versions required */ #define SHT_SUNW_versym 0x6fffffff #define SHT_GNU_versym 0x6fffffff /* Symbol version table */ #define SHT_HISUNW 0x6fffffff #define SHT_HIOS 0x6fffffff /* Last of OS specific semantics */ #define SHT_LOPROC 0x70000000 /* reserved range for processor */ #define SHT_X86_64_UNWIND 0x70000001 /* unwind information */ #define SHT_AMD64_UNWIND SHT_X86_64_UNWIND #define SHT_ARM_EXIDX 0x70000001 /* Exception index table. */ #define SHT_ARM_PREEMPTMAP 0x70000002 /* BPABI DLL dynamic linking pre-emption map. */ #define SHT_ARM_ATTRIBUTES 0x70000003 /* Object file compatibility attributes. */ #define SHT_ARM_DEBUGOVERLAY 0x70000004 /* See DBGOVL for details. */ #define SHT_ARM_OVERLAYSECTION 0x70000005 /* See DBGOVL for details. 
*/ #define SHT_MIPS_LIBLIST 0x70000000 #define SHT_MIPS_MSYM 0x70000001 #define SHT_MIPS_CONFLICT 0x70000002 #define SHT_MIPS_GPTAB 0x70000003 #define SHT_MIPS_UCODE 0x70000004 #define SHT_MIPS_DEBUG 0x70000005 #define SHT_MIPS_REGINFO 0x70000006 #define SHT_MIPS_PACKAGE 0x70000007 #define SHT_MIPS_PACKSYM 0x70000008 #define SHT_MIPS_RELD 0x70000009 #define SHT_MIPS_IFACE 0x7000000b #define SHT_MIPS_CONTENT 0x7000000c #define SHT_MIPS_OPTIONS 0x7000000d #define SHT_MIPS_DELTASYM 0x7000001b #define SHT_MIPS_DELTAINST 0x7000001c #define SHT_MIPS_DELTACLASS 0x7000001d #define SHT_MIPS_DWARF 0x7000001e /* MIPS gcc uses MIPS_DWARF */ #define SHT_MIPS_DELTADECL 0x7000001f #define SHT_MIPS_SYMBOL_LIB 0x70000020 #define SHT_MIPS_EVENTS 0x70000021 #define SHT_MIPS_TRANSLATE 0x70000022 #define SHT_MIPS_PIXIE 0x70000023 #define SHT_MIPS_XLATE 0x70000024 #define SHT_MIPS_XLATE_DEBUG 0x70000025 #define SHT_MIPS_WHIRL 0x70000026 #define SHT_MIPS_EH_REGION 0x70000027 #define SHT_MIPS_XLATE_OLD 0x70000028 #define SHT_MIPS_PDR_EXCEPTION 0x70000029 #define SHT_MIPS_ABIFLAGS 0x7000002a #define SHT_SPARC_GOTDATA 0x70000000 #define SHTORDERED #define SHT_HIPROC 0x7fffffff /* specific section header types */ #define SHT_LOUSER 0x80000000 /* reserved range for application */ #define SHT_HIUSER 0xffffffff /* specific indexes */ /* Flags for sh_flags. */ #define SHF_WRITE 0x1 /* Section contains writable data. */ #define SHF_ALLOC 0x2 /* Section occupies memory. */ #define SHF_EXECINSTR 0x4 /* Section contains instructions. */ #define SHF_MERGE 0x10 /* Section may be merged. */ #define SHF_STRINGS 0x20 /* Section contains strings. */ #define SHF_INFO_LINK 0x40 /* sh_info holds section index. */ #define SHF_LINK_ORDER 0x80 /* Special ordering requirements. */ #define SHF_OS_NONCONFORMING 0x100 /* OS-specific processing required. */ #define SHF_GROUP 0x200 /* Member of section group. */ #define SHF_TLS 0x400 /* Section contains TLS data. */ #define SHF_COMPRESSED 0x800 /* Section contains compressed data. */ #define SHF_MASKOS 0x0ff00000 /* OS-specific semantics. */ #define SHF_MASKPROC 0xf0000000 /* Processor-specific semantics. */ /* Flags for section groups. */ #define GRP_COMDAT 0x1 /* COMDAT semantics. */ /* * Flags / mask for .gnu.versym sections. */ #define VERSYM_VERSION 0x7fff #define VERSYM_HIDDEN 0x8000 /* Values for p_type. */ #define PT_NULL 0 /* Unused entry. */ #define PT_LOAD 1 /* Loadable segment. */ #define PT_DYNAMIC 2 /* Dynamic linking information segment. */ #define PT_INTERP 3 /* Pathname of interpreter. */ #define PT_NOTE 4 /* Auxiliary information. */ #define PT_SHLIB 5 /* Reserved (not used). */ #define PT_PHDR 6 /* Location of program header itself. */ #define PT_TLS 7 /* Thread local storage segment */ #define PT_LOOS 0x60000000 /* First OS-specific. */ #define PT_SUNW_UNWIND 0x6464e550 /* amd64 UNWIND program header */ #define PT_GNU_EH_FRAME 0x6474e550 #define PT_GNU_STACK 0x6474e551 #define PT_GNU_RELRO 0x6474e552 #define PT_DUMP_DELTA 0x6fb5d000 /* va->pa map for kernel dumps (currently arm). */ #define PT_LOSUNW 0x6ffffffa #define PT_SUNWBSS 0x6ffffffa /* Sun Specific segment */ #define PT_SUNWSTACK 0x6ffffffb /* describes the stack segment */ #define PT_SUNWDTRACE 0x6ffffffc /* private */ #define PT_SUNWCAP 0x6ffffffd /* hard/soft capabilities segment */ #define PT_HISUNW 0x6fffffff #define PT_HIOS 0x6fffffff /* Last OS-specific. */ #define PT_LOPROC 0x70000000 /* First processor-specific type. */ #define PT_ARM_ARCHEXT 0x70000000 /* ARM arch compat information. 
*/ #define PT_ARM_EXIDX 0x70000001 /* ARM exception unwind tables. */ #define PT_HIPROC 0x7fffffff /* Last processor-specific type. */ #define PT_OPENBSD_RANDOMIZE 0x65A3DBE6 /* OpenBSD random data segment */ #define PT_OPENBSD_WXNEEDED 0x65A3DBE7 /* OpenBSD EXEC/WRITE pages needed */ #define PT_OPENBSD_BOOTDATA 0x65A41BE6 /* OpenBSD section for boot args */ /* Values for p_flags. */ #define PF_X 0x1 /* Executable. */ #define PF_W 0x2 /* Writable. */ #define PF_R 0x4 /* Readable. */ #define PF_MASKOS 0x0ff00000 /* Operating system-specific. */ #define PF_MASKPROC 0xf0000000 /* Processor-specific. */ /* Extended program header index. */ #define PN_XNUM 0xffff /* Values for d_tag. */ #define DT_NULL 0 /* Terminating entry. */ #define DT_NEEDED 1 /* String table offset of a needed shared library. */ #define DT_PLTRELSZ 2 /* Total size in bytes of PLT relocations. */ #define DT_PLTGOT 3 /* Processor-dependent address. */ #define DT_HASH 4 /* Address of symbol hash table. */ #define DT_STRTAB 5 /* Address of string table. */ #define DT_SYMTAB 6 /* Address of symbol table. */ #define DT_RELA 7 /* Address of ElfNN_Rela relocations. */ #define DT_RELASZ 8 /* Total size of ElfNN_Rela relocations. */ #define DT_RELAENT 9 /* Size of each ElfNN_Rela relocation entry. */ #define DT_STRSZ 10 /* Size of string table. */ #define DT_SYMENT 11 /* Size of each symbol table entry. */ #define DT_INIT 12 /* Address of initialization function. */ #define DT_FINI 13 /* Address of finalization function. */ #define DT_SONAME 14 /* String table offset of shared object name. */ #define DT_RPATH 15 /* String table offset of library path. [sup] */ #define DT_SYMBOLIC 16 /* Indicates "symbolic" linking. [sup] */ #define DT_REL 17 /* Address of ElfNN_Rel relocations. */ #define DT_RELSZ 18 /* Total size of ElfNN_Rel relocations. */ #define DT_RELENT 19 /* Size of each ElfNN_Rel relocation. */ #define DT_PLTREL 20 /* Type of relocation used for PLT. */ #define DT_DEBUG 21 /* Reserved (not used). */ #define DT_TEXTREL 22 /* Indicates there may be relocations in non-writable segments. [sup] */ #define DT_JMPREL 23 /* Address of PLT relocations. */ #define DT_BIND_NOW 24 /* [sup] */ #define DT_INIT_ARRAY 25 /* Address of the array of pointers to initialization functions */ #define DT_FINI_ARRAY 26 /* Address of the array of pointers to termination functions */ #define DT_INIT_ARRAYSZ 27 /* Size in bytes of the array of initialization functions. */ #define DT_FINI_ARRAYSZ 28 /* Size in bytes of the array of termination functions. */ #define DT_RUNPATH 29 /* String table offset of a null-terminated library search path string. */ #define DT_FLAGS 30 /* Object specific flag values. */ #define DT_ENCODING 32 /* Values greater than or equal to DT_ENCODING and less than DT_LOOS follow the rules for the interpretation of the d_un union as follows: even == 'd_ptr', odd == 'd_val' or none */ #define DT_PREINIT_ARRAY 32 /* Address of the array of pointers to pre-initialization functions. */ #define DT_PREINIT_ARRAYSZ 33 /* Size in bytes of the array of pre-initialization functions. 
*/ #define DT_MAXPOSTAGS 34 /* number of positive tags */ #define DT_LOOS 0x6000000d /* First OS-specific */ #define DT_SUNW_AUXILIARY 0x6000000d /* symbol auxiliary name */ #define DT_SUNW_RTLDINF 0x6000000e /* ld.so.1 info (private) */ #define DT_SUNW_FILTER 0x6000000f /* symbol filter name */ #define DT_SUNW_CAP 0x60000010 /* hardware/software */ #define DT_SUNW_ASLR 0x60000023 /* ASLR control */ #define DT_HIOS 0x6ffff000 /* Last OS-specific */ /* * DT_* entries which fall between DT_VALRNGHI & DT_VALRNGLO use the * Dyn.d_un.d_val field of the Elf*_Dyn structure. */ #define DT_VALRNGLO 0x6ffffd00 #define DT_GNU_PRELINKED 0x6ffffdf5 /* prelinking timestamp */ #define DT_GNU_CONFLICTSZ 0x6ffffdf6 /* size of conflict section */ #define DT_GNU_LIBLISTSZ 0x6ffffdf7 /* size of library list */ #define DT_CHECKSUM 0x6ffffdf8 /* elf checksum */ #define DT_PLTPADSZ 0x6ffffdf9 /* pltpadding size */ #define DT_MOVEENT 0x6ffffdfa /* move table entry size */ #define DT_MOVESZ 0x6ffffdfb /* move table size */ #define DT_FEATURE 0x6ffffdfc /* feature holder */ #define DT_FEATURE_1 DT_FEATURE #define DT_POSFLAG_1 0x6ffffdfd /* flags for DT_* entries, effecting */ /* the following DT_* entry. */ /* See DF_P1_* definitions */ #define DT_SYMINSZ 0x6ffffdfe /* syminfo table size (in bytes) */ #define DT_SYMINENT 0x6ffffdff /* syminfo entry size (in bytes) */ #define DT_VALRNGHI 0x6ffffdff /* * DT_* entries which fall between DT_ADDRRNGHI & DT_ADDRRNGLO use the * Dyn.d_un.d_ptr field of the Elf*_Dyn structure. * * If any adjustment is made to the ELF object after it has been * built, these entries will need to be adjusted. */ #define DT_ADDRRNGLO 0x6ffffe00 #define DT_GNU_HASH 0x6ffffef5 /* GNU-style hash table */ #define DT_TLSDESC_PLT 0x6ffffef6 /* loc. of PLT for tlsdesc resolver */ #define DT_TLSDESC_GOT 0x6ffffef7 /* loc. of GOT for tlsdesc resolver */ #define DT_GNU_CONFLICT 0x6ffffef8 /* address of conflict section */ #define DT_GNU_LIBLIST 0x6ffffef9 /* address of library list */ #define DT_CONFIG 0x6ffffefa /* configuration information */ #define DT_DEPAUDIT 0x6ffffefb /* dependency auditing */ #define DT_AUDIT 0x6ffffefc /* object auditing */ #define DT_PLTPAD 0x6ffffefd /* pltpadding (sparcv9) */ #define DT_MOVETAB 0x6ffffefe /* move table */ #define DT_SYMINFO 0x6ffffeff /* syminfo table */ #define DT_ADDRRNGHI 0x6ffffeff #define DT_VERSYM 0x6ffffff0 /* Address of versym section. */ #define DT_RELACOUNT 0x6ffffff9 /* number of RELATIVE relocations */ #define DT_RELCOUNT 0x6ffffffa /* number of RELATIVE relocations */ #define DT_FLAGS_1 0x6ffffffb /* state flags - see DF_1_* defs */ #define DT_VERDEF 0x6ffffffc /* Address of verdef section. */ #define DT_VERDEFNUM 0x6ffffffd /* Number of elems in verdef section */ #define DT_VERNEED 0x6ffffffe /* Address of verneed section. */ #define DT_VERNEEDNUM 0x6fffffff /* Number of elems in verneed section */ #define DT_LOPROC 0x70000000 /* First processor-specific type. 
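The interpretation rules spelled out in the DT_ENCODING, DT_VALRNG* and DT_ADDRRNG* comments above can be folded into a single helper. A sketch under the assumption that tags below DT_ENCODING keep their fixed per-tag definitions and are handled elsewhere:

	#include <elf.h>		/* DT_* values as defined above */
	#include <stdbool.h>
	#include <stdint.h>

	static bool
	dyn_tag_uses_d_ptr(uint64_t tag)
	{
		if (tag >= DT_VALRNGLO && tag <= DT_VALRNGHI)
			return (false);		/* range reserved for d_val */
		if (tag >= DT_ADDRRNGLO && tag <= DT_ADDRRNGHI)
			return (true);		/* range reserved for d_ptr */
		if (tag >= DT_ENCODING && tag < DT_LOOS)
			return ((tag & 1) == 0);	/* even == d_ptr, odd == d_val */
		return (false);			/* fixed per-tag rules apply below DT_ENCODING */
	}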
*/ #define DT_ARM_SYMTABSZ 0x70000001 #define DT_ARM_PREEMPTMAP 0x70000002 #define DT_SPARC_REGISTER 0x70000001 #define DT_DEPRECATED_SPARC_REGISTER 0x7000001 #define DT_MIPS_RLD_VERSION 0x70000001 #define DT_MIPS_TIME_STAMP 0x70000002 #define DT_MIPS_ICHECKSUM 0x70000003 #define DT_MIPS_IVERSION 0x70000004 #define DT_MIPS_FLAGS 0x70000005 #define DT_MIPS_BASE_ADDRESS 0x70000006 #define DT_MIPS_CONFLICT 0x70000008 #define DT_MIPS_LIBLIST 0x70000009 #define DT_MIPS_LOCAL_GOTNO 0x7000000a #define DT_MIPS_CONFLICTNO 0x7000000b #define DT_MIPS_LIBLISTNO 0x70000010 #define DT_MIPS_SYMTABNO 0x70000011 #define DT_MIPS_UNREFEXTNO 0x70000012 #define DT_MIPS_GOTSYM 0x70000013 #define DT_MIPS_HIPAGENO 0x70000014 #define DT_MIPS_RLD_MAP 0x70000016 #define DT_MIPS_DELTA_CLASS 0x70000017 #define DT_MIPS_DELTA_CLASS_NO 0x70000018 #define DT_MIPS_DELTA_INSTANCE 0x70000019 #define DT_MIPS_DELTA_INSTANCE_NO 0x7000001A #define DT_MIPS_DELTA_RELOC 0x7000001B #define DT_MIPS_DELTA_RELOC_NO 0x7000001C #define DT_MIPS_DELTA_SYM 0x7000001D #define DT_MIPS_DELTA_SYM_NO 0x7000001E #define DT_MIPS_DELTA_CLASSSYM 0x70000020 #define DT_MIPS_DELTA_CLASSSYM_NO 0x70000021 #define DT_MIPS_CXX_FLAGS 0x70000022 #define DT_MIPS_PIXIE_INIT 0x70000023 #define DT_MIPS_SYMBOL_LIB 0x70000024 #define DT_MIPS_LOCALPAGE_GOTIDX 0x70000025 #define DT_MIPS_LOCAL_GOTIDX 0x70000026 #define DT_MIPS_HIDDEN_GOTIDX 0x70000027 #define DT_MIPS_PROTECTED_GOTIDX 0x70000028 #define DT_MIPS_OPTIONS 0x70000029 #define DT_MIPS_INTERFACE 0x7000002A #define DT_MIPS_DYNSTR_ALIGN 0x7000002B #define DT_MIPS_INTERFACE_SIZE 0x7000002C #define DT_MIPS_RLD_TEXT_RESOLVE_ADDR 0x7000002D #define DT_MIPS_PERF_SUFFIX 0x7000002E #define DT_MIPS_COMPACT_SIZE 0x7000002F #define DT_MIPS_GP_VALUE 0x70000030 #define DT_MIPS_AUX_DYNAMIC 0x70000031 #define DT_MIPS_PLTGOT 0x70000032 #define DT_MIPS_RLD_OBJ_UPDATE 0x70000033 #define DT_MIPS_RWPLT 0x70000034 #define DT_MIPS_RLD_MAP_REL 0x70000035 #define DT_PPC_GOT 0x70000000 #define DT_PPC_TLSOPT 0x70000001 #define DT_PPC64_GLINK 0x70000000 #define DT_PPC64_OPD 0x70000001 #define DT_PPC64_OPDSZ 0x70000002 #define DT_PPC64_TLSOPT 0x70000003 #define DT_AUXILIARY 0x7ffffffd /* shared library auxiliary name */ #define DT_USED 0x7ffffffe /* ignored - same as needed */ #define DT_FILTER 0x7fffffff /* shared library filter name */ #define DT_HIPROC 0x7fffffff /* Last processor-specific type. */ /* Values for DT_FLAGS */ #define DF_ORIGIN 0x0001 /* Indicates that the object being loaded may make reference to the $ORIGIN substitution string */ #define DF_SYMBOLIC 0x0002 /* Indicates "symbolic" linking. */ #define DF_TEXTREL 0x0004 /* Indicates there may be relocations in non-writable segments. */ #define DF_BIND_NOW 0x0008 /* Indicates that the dynamic linker should process all relocations for the object containing this entry before transferring control to the program. */ #define DF_STATIC_TLS 0x0010 /* Indicates that the shared object or executable contains code using a static thread-local storage scheme. 
*/ /* Values for DT_FLAGS_1 */ #define DF_1_BIND_NOW 0x00000001 /* Same as DF_BIND_NOW */ #define DF_1_GLOBAL 0x00000002 /* Set the RTLD_GLOBAL for object */ #define DF_1_NODELETE 0x00000008 /* Set the RTLD_NODELETE for object */ #define DF_1_LOADFLTR 0x00000010 /* Immediate loading of filtees */ #define DF_1_NOOPEN 0x00000040 /* Do not allow loading on dlopen() */ #define DF_1_ORIGIN 0x00000080 /* Process $ORIGIN */ #define DF_1_INTERPOSE 0x00000400 /* Interpose all objects but main */ #define DF_1_NODEFLIB 0x00000800 /* Do not search default paths */ /* Values for l_flags. */ #define LL_NONE 0x0 /* no flags */ #define LL_EXACT_MATCH 0x1 /* require an exact match */ #define LL_IGNORE_INT_VER 0x2 /* ignore version incompatibilities */ #define LL_REQUIRE_MINOR 0x4 #define LL_EXPORTS 0x8 #define LL_DELAY_LOAD 0x10 #define LL_DELTA 0x20 +/* Note section names */ +#define ELF_NOTE_FREEBSD "FreeBSD" +#define ELF_NOTE_NETBSD "NetBSD" +#define ELF_NOTE_SOLARIS "SUNW Solaris" +#define ELF_NOTE_GNU "GNU" + /* Values for n_type used in executables. */ #define NT_FREEBSD_ABI_TAG 1 #define NT_FREEBSD_NOINIT_TAG 2 #define NT_FREEBSD_ARCH_TAG 3 #define NT_FREEBSD_FEATURE_CTL 4 /* NT_FREEBSD_FEATURE_CTL desc[0] bits */ #define NT_FREEBSD_FCTL_ASLR_DISABLE 0x00000001 #define NT_FREEBSD_FCTL_PROTMAX_DISABLE 0x00000002 /* Values for n_type. Used in core files. */ #define NT_PRSTATUS 1 /* Process status. */ #define NT_FPREGSET 2 /* Floating point registers. */ #define NT_PRPSINFO 3 /* Process state info. */ #define NT_THRMISC 7 /* Thread miscellaneous info. */ #define NT_PROCSTAT_PROC 8 /* Procstat proc data. */ #define NT_PROCSTAT_FILES 9 /* Procstat files data. */ #define NT_PROCSTAT_VMMAP 10 /* Procstat vmmap data. */ #define NT_PROCSTAT_GROUPS 11 /* Procstat groups data. */ #define NT_PROCSTAT_UMASK 12 /* Procstat umask data. */ #define NT_PROCSTAT_RLIMIT 13 /* Procstat rlimit data. */ #define NT_PROCSTAT_OSREL 14 /* Procstat osreldate data. */ #define NT_PROCSTAT_PSSTRINGS 15 /* Procstat ps_strings data. */ #define NT_PROCSTAT_AUXV 16 /* Procstat auxv data. */ #define NT_PTLWPINFO 17 /* Thread ptrace miscellaneous info. */ #define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ #define NT_X86_XSTATE 0x202 /* x86 XSAVE extended state. */ #define NT_ARM_VFP 0x400 /* ARM VFP registers */ /* GNU note types. */ #define NT_GNU_ABI_TAG 1 #define NT_GNU_HWCAP 2 #define NT_GNU_BUILD_ID 3 #define NT_GNU_GOLD_VERSION 4 #define NT_GNU_PROPERTY_TYPE_0 5 #define GNU_PROPERTY_LOPROC 0xc0000000 #define GNU_PROPERTY_HIPROC 0xdfffffff #define GNU_PROPERTY_X86_FEATURE_1_AND 0xc0000002 #define GNU_PROPERTY_X86_FEATURE_1_IBT 0x00000001 #define GNU_PROPERTY_X86_FEATURE_1_SHSTK 0x00000002 /* Symbol Binding - ELFNN_ST_BIND - st_info */ #define STB_LOCAL 0 /* Local symbol */ #define STB_GLOBAL 1 /* Global symbol */ #define STB_WEAK 2 /* like global - lower precedence */ #define STB_LOOS 10 /* Start of operating system reserved range. */ #define STB_GNU_UNIQUE 10 /* Unique symbol (GNU) */ #define STB_HIOS 12 /* End of operating system reserved range. */ #define STB_LOPROC 13 /* reserved range for processor */ #define STB_HIPROC 15 /* specific semantics. */ /* Symbol type - ELFNN_ST_TYPE - st_info */ #define STT_NOTYPE 0 /* Unspecified type. */ #define STT_OBJECT 1 /* Data object. */ #define STT_FUNC 2 /* Function. */ #define STT_SECTION 3 /* Section. */ #define STT_FILE 4 /* Source file. */ #define STT_COMMON 5 /* Uninitialized common block. 
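The newly added ELF_NOTE_FREEBSD name pairs with NT_FREEBSD_ABI_TAG above when a branded object emits its ABI-tag note. A hedged sketch in the style of the csu startup code; the ".note.tag" section name and the use of __FreeBSD_version are conventions assumed for this example, not definitions from this header:

	#include <sys/param.h>		/* __FreeBSD_version */
	#include <elf.h>		/* ELF_NOTE_FREEBSD, NT_FREEBSD_ABI_TAG */
	#include <stdint.h>

	static const struct {
		uint32_t namesz;	/* sizeof("FreeBSD"), NUL included */
		uint32_t descsz;	/* one 32-bit word of descriptor */
		uint32_t type;
		char	 name[sizeof(ELF_NOTE_FREEBSD)];	/* 8 bytes, already word-aligned */
		uint32_t desc;		/* __FreeBSD_version of the build */
	} abitag __attribute__((section(".note.tag"), aligned(4), used)) = {
		sizeof(ELF_NOTE_FREEBSD),
		sizeof(uint32_t),
		NT_FREEBSD_ABI_TAG,
		ELF_NOTE_FREEBSD,
		__FreeBSD_version
	};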
*/ #define STT_TLS 6 /* TLS object. */ #define STT_NUM 7 #define STT_LOOS 10 /* Reserved range for operating system */ #define STT_GNU_IFUNC 10 #define STT_HIOS 12 /* specific semantics. */ #define STT_LOPROC 13 /* Start of processor reserved range. */ #define STT_SPARC_REGISTER 13 /* SPARC register information. */ #define STT_HIPROC 15 /* End of processor reserved range. */ /* Symbol visibility - ELFNN_ST_VISIBILITY - st_other */ #define STV_DEFAULT 0x0 /* Default visibility (see binding). */ #define STV_INTERNAL 0x1 /* Special meaning in relocatable objects. */ #define STV_HIDDEN 0x2 /* Not visible. */ #define STV_PROTECTED 0x3 /* Visible but not preemptible. */ #define STV_EXPORTED 0x4 #define STV_SINGLETON 0x5 #define STV_ELIMINATE 0x6 /* Special symbol table indexes. */ #define STN_UNDEF 0 /* Undefined symbol index. */ /* Symbol versioning flags. */ #define VER_DEF_CURRENT 1 #define VER_DEF_IDX(x) VER_NDX(x) #define VER_FLG_BASE 0x01 #define VER_FLG_WEAK 0x02 #define VER_NEED_CURRENT 1 #define VER_NEED_WEAK (1u << 15) #define VER_NEED_HIDDEN VER_NDX_HIDDEN #define VER_NEED_IDX(x) VER_NDX(x) #define VER_NDX_LOCAL 0 #define VER_NDX_GLOBAL 1 #define VER_NDX_GIVEN 2 #define VER_NDX_HIDDEN (1u << 15) #define VER_NDX(x) ((x) & ~(1u << 15)) #define CA_SUNW_NULL 0 #define CA_SUNW_HW_1 1 /* first hardware capabilities entry */ #define CA_SUNW_SF_1 2 /* first software capabilities entry */ /* * Syminfo flag values */ #define SYMINFO_FLG_DIRECT 0x0001 /* symbol ref has direct association */ /* to object containing defn. */ #define SYMINFO_FLG_PASSTHRU 0x0002 /* ignored - see SYMINFO_FLG_FILTER */ #define SYMINFO_FLG_COPY 0x0004 /* symbol is a copy-reloc */ #define SYMINFO_FLG_LAZYLOAD 0x0008 /* object containing defn should be */ /* lazily-loaded */ #define SYMINFO_FLG_DIRECTBIND 0x0010 /* ref should be bound directly to */ /* object containing defn. */ #define SYMINFO_FLG_NOEXTDIRECT 0x0020 /* don't let an external reference */ /* directly bind to this symbol */ #define SYMINFO_FLG_FILTER 0x0002 /* symbol ref is associated to a */ #define SYMINFO_FLG_AUXILIARY 0x0040 /* standard or auxiliary filter */ /* * Syminfo.si_boundto values. */ #define SYMINFO_BT_SELF 0xffff /* symbol bound to self */ #define SYMINFO_BT_PARENT 0xfffe /* symbol bound to parent */ #define SYMINFO_BT_NONE 0xfffd /* no special symbol binding */ #define SYMINFO_BT_EXTERN 0xfffc /* symbol defined as external */ #define SYMINFO_BT_LOWRESERVE 0xff00 /* beginning of reserved entries */ /* * Syminfo version values. */ #define SYMINFO_NONE 0 /* Syminfo version */ #define SYMINFO_CURRENT 1 #define SYMINFO_NUM 2 /* Values for ch_type (compressed section headers). */ #define ELFCOMPRESS_ZLIB 1 /* ZLIB/DEFLATE */ #define ELFCOMPRESS_LOOS 0x60000000 /* OS-specific */ #define ELFCOMPRESS_HIOS 0x6fffffff #define ELFCOMPRESS_LOPROC 0x70000000 /* Processor-specific */ #define ELFCOMPRESS_HIPROC 0x7fffffff /* Values for a_type. */ #define AT_NULL 0 /* Terminates the vector. */ #define AT_IGNORE 1 /* Ignored entry. */ #define AT_EXECFD 2 /* File descriptor of program to load. */ #define AT_PHDR 3 /* Program header of program already loaded. */ #define AT_PHENT 4 /* Size of each program header entry. */ #define AT_PHNUM 5 /* Number of program header entries. */ #define AT_PAGESZ 6 /* Page size in bytes. */ #define AT_BASE 7 /* Interpreter's base address. */ #define AT_FLAGS 8 /* Flags. */ #define AT_ENTRY 9 /* Where interpreter should transfer control. */ #define AT_NOTELF 10 /* Program is not ELF ?? */ #define AT_UID 11 /* Real uid. 
*/ #define AT_EUID 12 /* Effective uid. */ #ifndef __powerpc__ #define AT_GID 13 /* Real gid. */ #define AT_EGID 14 /* Effective gid. */ #define AT_EXECPATH 15 /* Path to the executable. */ #define AT_CANARY 16 /* Canary for SSP. */ #define AT_CANARYLEN 17 /* Length of the canary. */ #define AT_OSRELDATE 18 /* OSRELDATE. */ #define AT_NCPUS 19 /* Number of CPUs. */ #define AT_PAGESIZES 20 /* Pagesizes. */ #define AT_PAGESIZESLEN 21 /* Number of pagesizes. */ #else /* defined(__powerpc__) */ #define AT_EXECPATH 13 #define AT_CANARY 14 #define AT_CANARYLEN 15 #define AT_OSRELDATE 16 #define AT_NCPUS 17 #define AT_PAGESIZES 18 #define AT_PAGESIZESLEN 19 #define AT_STACKPROT 21 #endif /* defined(__powerpc__) */ #define AT_TIMEKEEP 22 /* Pointer to timehands. */ #ifndef __powerpc__ #define AT_STACKPROT 23 /* Initial stack protection. */ #endif #define AT_EHDRFLAGS 24 /* e_flags field from elf hdr */ #define AT_HWCAP 25 /* CPU feature flags. */ #define AT_HWCAP2 26 /* CPU feature flags 2. */ #define AT_COUNT 27 /* Count of defined aux entry types. */ /* * Relocation types. * * All machine architectures are defined here to allow tools on one to * handle others. */ #define R_386_NONE 0 /* No relocation. */ #define R_386_32 1 /* Add symbol value. */ #define R_386_PC32 2 /* Add PC-relative symbol value. */ #define R_386_GOT32 3 /* Add PC-relative GOT offset. */ #define R_386_PLT32 4 /* Add PC-relative PLT offset. */ #define R_386_COPY 5 /* Copy data from shared object. */ #define R_386_GLOB_DAT 6 /* Set GOT entry to data address. */ #define R_386_JMP_SLOT 7 /* Set GOT entry to code address. */ #define R_386_RELATIVE 8 /* Add load address of shared object. */ #define R_386_GOTOFF 9 /* Add GOT-relative symbol address. */ #define R_386_GOTPC 10 /* Add PC-relative GOT table address. 
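The AT_* entries above index the ELF auxiliary vector handed to a new process. On FreeBSD they are normally queried through elf_aux_info(3) rather than by walking the vector directly; a small usage sketch (assuming both entries are present, as they are on current kernels):

	#include <sys/auxv.h>
	#include <elf.h>
	#include <stdio.h>

	int
	main(void)
	{
		int pagesz, ncpus;

		if (elf_aux_info(AT_PAGESZ, &pagesz, sizeof(pagesz)) == 0)
			printf("AT_PAGESZ: %d\n", pagesz);
		if (elf_aux_info(AT_NCPUS, &ncpus, sizeof(ncpus)) == 0)
			printf("AT_NCPUS:  %d\n", ncpus);
		return (0);
	}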
*/ #define R_386_TLS_TPOFF 14 /* Negative offset in static TLS block */ #define R_386_TLS_IE 15 /* Absolute address of GOT for -ve static TLS */ #define R_386_TLS_GOTIE 16 /* GOT entry for negative static TLS block */ #define R_386_TLS_LE 17 /* Negative offset relative to static TLS */ #define R_386_TLS_GD 18 /* 32 bit offset to GOT (index,off) pair */ #define R_386_TLS_LDM 19 /* 32 bit offset to GOT (index,zero) pair */ #define R_386_TLS_GD_32 24 /* 32 bit offset to GOT (index,off) pair */ #define R_386_TLS_GD_PUSH 25 /* pushl instruction for Sun ABI GD sequence */ #define R_386_TLS_GD_CALL 26 /* call instruction for Sun ABI GD sequence */ #define R_386_TLS_GD_POP 27 /* popl instruction for Sun ABI GD sequence */ #define R_386_TLS_LDM_32 28 /* 32 bit offset to GOT (index,zero) pair */ #define R_386_TLS_LDM_PUSH 29 /* pushl instruction for Sun ABI LD sequence */ #define R_386_TLS_LDM_CALL 30 /* call instruction for Sun ABI LD sequence */ #define R_386_TLS_LDM_POP 31 /* popl instruction for Sun ABI LD sequence */ #define R_386_TLS_LDO_32 32 /* 32 bit offset from start of TLS block */ #define R_386_TLS_IE_32 33 /* 32 bit offset to GOT static TLS offset entry */ #define R_386_TLS_LE_32 34 /* 32 bit offset within static TLS block */ #define R_386_TLS_DTPMOD32 35 /* GOT entry containing TLS index */ #define R_386_TLS_DTPOFF32 36 /* GOT entry containing TLS offset */ #define R_386_TLS_TPOFF32 37 /* GOT entry of -ve static TLS offset */ #define R_386_IRELATIVE 42 /* PLT entry resolved indirectly at runtime */ #define R_AARCH64_NONE 0 /* No relocation */ #define R_AARCH64_ABS64 257 /* Absolute offset */ #define R_AARCH64_ABS32 258 /* Absolute, 32-bit overflow check */ #define R_AARCH64_ABS16 259 /* Absolute, 16-bit overflow check */ #define R_AARCH64_PREL64 260 /* PC relative */ #define R_AARCH64_PREL32 261 /* PC relative, 32-bit overflow check */ #define R_AARCH64_PREL16 262 /* PC relative, 16-bit overflow check */ #define R_AARCH64_COPY 1024 /* Copy data from shared object */ #define R_AARCH64_GLOB_DAT 1025 /* Set GOT entry to data address */ #define R_AARCH64_JUMP_SLOT 1026 /* Set GOT entry to code address */ #define R_AARCH64_RELATIVE 1027 /* Add load address of shared object */ #define R_AARCH64_TLS_DTPREL64 1028 #define R_AARCH64_TLS_DTPMOD64 1029 #define R_AARCH64_TLS_TPREL64 1030 #define R_AARCH64_TLSDESC 1031 /* Identify the TLS descriptor */ #define R_AARCH64_IRELATIVE 1032 #define R_ARM_NONE 0 /* No relocation. */ #define R_ARM_PC24 1 #define R_ARM_ABS32 2 #define R_ARM_REL32 3 #define R_ARM_PC13 4 #define R_ARM_ABS16 5 #define R_ARM_ABS12 6 #define R_ARM_THM_ABS5 7 #define R_ARM_ABS8 8 #define R_ARM_SBREL32 9 #define R_ARM_THM_PC22 10 #define R_ARM_THM_PC8 11 #define R_ARM_AMP_VCALL9 12 #define R_ARM_SWI24 13 #define R_ARM_THM_SWI8 14 #define R_ARM_XPC25 15 #define R_ARM_THM_XPC22 16 /* TLS relocations */ #define R_ARM_TLS_DTPMOD32 17 /* ID of module containing symbol */ #define R_ARM_TLS_DTPOFF32 18 /* Offset in TLS block */ #define R_ARM_TLS_TPOFF32 19 /* Offset in static TLS block */ #define R_ARM_COPY 20 /* Copy data from shared object. */ #define R_ARM_GLOB_DAT 21 /* Set GOT entry to data address. */ #define R_ARM_JUMP_SLOT 22 /* Set GOT entry to code address. */ #define R_ARM_RELATIVE 23 /* Add load address of shared object. */ #define R_ARM_GOTOFF 24 /* Add GOT-relative symbol address. */ #define R_ARM_GOTPC 25 /* Add PC-relative GOT table address. */ #define R_ARM_GOT32 26 /* Add PC-relative GOT offset. */ #define R_ARM_PLT32 27 /* Add PC-relative PLT offset. 
*/ #define R_ARM_GNU_VTENTRY 100 #define R_ARM_GNU_VTINHERIT 101 #define R_ARM_RSBREL32 250 #define R_ARM_THM_RPC22 251 #define R_ARM_RREL32 252 #define R_ARM_RABS32 253 #define R_ARM_RPC24 254 #define R_ARM_RBASE 255 /* Name Value Field Calculation */ #define R_IA_64_NONE 0 /* None */ #define R_IA_64_IMM14 0x21 /* immediate14 S + A */ #define R_IA_64_IMM22 0x22 /* immediate22 S + A */ #define R_IA_64_IMM64 0x23 /* immediate64 S + A */ #define R_IA_64_DIR32MSB 0x24 /* word32 MSB S + A */ #define R_IA_64_DIR32LSB 0x25 /* word32 LSB S + A */ #define R_IA_64_DIR64MSB 0x26 /* word64 MSB S + A */ #define R_IA_64_DIR64LSB 0x27 /* word64 LSB S + A */ #define R_IA_64_GPREL22 0x2a /* immediate22 @gprel(S + A) */ #define R_IA_64_GPREL64I 0x2b /* immediate64 @gprel(S + A) */ #define R_IA_64_GPREL32MSB 0x2c /* word32 MSB @gprel(S + A) */ #define R_IA_64_GPREL32LSB 0x2d /* word32 LSB @gprel(S + A) */ #define R_IA_64_GPREL64MSB 0x2e /* word64 MSB @gprel(S + A) */ #define R_IA_64_GPREL64LSB 0x2f /* word64 LSB @gprel(S + A) */ #define R_IA_64_LTOFF22 0x32 /* immediate22 @ltoff(S + A) */ #define R_IA_64_LTOFF64I 0x33 /* immediate64 @ltoff(S + A) */ #define R_IA_64_PLTOFF22 0x3a /* immediate22 @pltoff(S + A) */ #define R_IA_64_PLTOFF64I 0x3b /* immediate64 @pltoff(S + A) */ #define R_IA_64_PLTOFF64MSB 0x3e /* word64 MSB @pltoff(S + A) */ #define R_IA_64_PLTOFF64LSB 0x3f /* word64 LSB @pltoff(S + A) */ #define R_IA_64_FPTR64I 0x43 /* immediate64 @fptr(S + A) */ #define R_IA_64_FPTR32MSB 0x44 /* word32 MSB @fptr(S + A) */ #define R_IA_64_FPTR32LSB 0x45 /* word32 LSB @fptr(S + A) */ #define R_IA_64_FPTR64MSB 0x46 /* word64 MSB @fptr(S + A) */ #define R_IA_64_FPTR64LSB 0x47 /* word64 LSB @fptr(S + A) */ #define R_IA_64_PCREL60B 0x48 /* immediate60 form1 S + A - P */ #define R_IA_64_PCREL21B 0x49 /* immediate21 form1 S + A - P */ #define R_IA_64_PCREL21M 0x4a /* immediate21 form2 S + A - P */ #define R_IA_64_PCREL21F 0x4b /* immediate21 form3 S + A - P */ #define R_IA_64_PCREL32MSB 0x4c /* word32 MSB S + A - P */ #define R_IA_64_PCREL32LSB 0x4d /* word32 LSB S + A - P */ #define R_IA_64_PCREL64MSB 0x4e /* word64 MSB S + A - P */ #define R_IA_64_PCREL64LSB 0x4f /* word64 LSB S + A - P */ #define R_IA_64_LTOFF_FPTR22 0x52 /* immediate22 @ltoff(@fptr(S + A)) */ #define R_IA_64_LTOFF_FPTR64I 0x53 /* immediate64 @ltoff(@fptr(S + A)) */ #define R_IA_64_LTOFF_FPTR32MSB 0x54 /* word32 MSB @ltoff(@fptr(S + A)) */ #define R_IA_64_LTOFF_FPTR32LSB 0x55 /* word32 LSB @ltoff(@fptr(S + A)) */ #define R_IA_64_LTOFF_FPTR64MSB 0x56 /* word64 MSB @ltoff(@fptr(S + A)) */ #define R_IA_64_LTOFF_FPTR64LSB 0x57 /* word64 LSB @ltoff(@fptr(S + A)) */ #define R_IA_64_SEGREL32MSB 0x5c /* word32 MSB @segrel(S + A) */ #define R_IA_64_SEGREL32LSB 0x5d /* word32 LSB @segrel(S + A) */ #define R_IA_64_SEGREL64MSB 0x5e /* word64 MSB @segrel(S + A) */ #define R_IA_64_SEGREL64LSB 0x5f /* word64 LSB @segrel(S + A) */ #define R_IA_64_SECREL32MSB 0x64 /* word32 MSB @secrel(S + A) */ #define R_IA_64_SECREL32LSB 0x65 /* word32 LSB @secrel(S + A) */ #define R_IA_64_SECREL64MSB 0x66 /* word64 MSB @secrel(S + A) */ #define R_IA_64_SECREL64LSB 0x67 /* word64 LSB @secrel(S + A) */ #define R_IA_64_REL32MSB 0x6c /* word32 MSB BD + A */ #define R_IA_64_REL32LSB 0x6d /* word32 LSB BD + A */ #define R_IA_64_REL64MSB 0x6e /* word64 MSB BD + A */ #define R_IA_64_REL64LSB 0x6f /* word64 LSB BD + A */ #define R_IA_64_LTV32MSB 0x74 /* word32 MSB S + A */ #define R_IA_64_LTV32LSB 0x75 /* word32 LSB S + A */ #define R_IA_64_LTV64MSB 0x76 /* word64 MSB S + A */ #define 
R_IA_64_LTV64LSB 0x77 /* word64 LSB S + A */ #define R_IA_64_PCREL21BI 0x79 /* immediate21 form1 S + A - P */ #define R_IA_64_PCREL22 0x7a /* immediate22 S + A - P */ #define R_IA_64_PCREL64I 0x7b /* immediate64 S + A - P */ #define R_IA_64_IPLTMSB 0x80 /* function descriptor MSB special */ #define R_IA_64_IPLTLSB 0x81 /* function descriptor LSB speciaal */ #define R_IA_64_SUB 0x85 /* immediate64 A - S */ #define R_IA_64_LTOFF22X 0x86 /* immediate22 special */ #define R_IA_64_LDXMOV 0x87 /* immediate22 special */ #define R_IA_64_TPREL14 0x91 /* imm14 @tprel(S + A) */ #define R_IA_64_TPREL22 0x92 /* imm22 @tprel(S + A) */ #define R_IA_64_TPREL64I 0x93 /* imm64 @tprel(S + A) */ #define R_IA_64_TPREL64MSB 0x96 /* word64 MSB @tprel(S + A) */ #define R_IA_64_TPREL64LSB 0x97 /* word64 LSB @tprel(S + A) */ #define R_IA_64_LTOFF_TPREL22 0x9a /* imm22 @ltoff(@tprel(S+A)) */ #define R_IA_64_DTPMOD64MSB 0xa6 /* word64 MSB @dtpmod(S + A) */ #define R_IA_64_DTPMOD64LSB 0xa7 /* word64 LSB @dtpmod(S + A) */ #define R_IA_64_LTOFF_DTPMOD22 0xaa /* imm22 @ltoff(@dtpmod(S+A)) */ #define R_IA_64_DTPREL14 0xb1 /* imm14 @dtprel(S + A) */ #define R_IA_64_DTPREL22 0xb2 /* imm22 @dtprel(S + A) */ #define R_IA_64_DTPREL64I 0xb3 /* imm64 @dtprel(S + A) */ #define R_IA_64_DTPREL32MSB 0xb4 /* word32 MSB @dtprel(S + A) */ #define R_IA_64_DTPREL32LSB 0xb5 /* word32 LSB @dtprel(S + A) */ #define R_IA_64_DTPREL64MSB 0xb6 /* word64 MSB @dtprel(S + A) */ #define R_IA_64_DTPREL64LSB 0xb7 /* word64 LSB @dtprel(S + A) */ #define R_IA_64_LTOFF_DTPREL22 0xba /* imm22 @ltoff(@dtprel(S+A)) */ #define R_MIPS_NONE 0 /* No reloc */ #define R_MIPS_16 1 /* Direct 16 bit */ #define R_MIPS_32 2 /* Direct 32 bit */ #define R_MIPS_REL32 3 /* PC relative 32 bit */ #define R_MIPS_26 4 /* Direct 26 bit shifted */ #define R_MIPS_HI16 5 /* High 16 bit */ #define R_MIPS_LO16 6 /* Low 16 bit */ #define R_MIPS_GPREL16 7 /* GP relative 16 bit */ #define R_MIPS_LITERAL 8 /* 16 bit literal entry */ #define R_MIPS_GOT16 9 /* 16 bit GOT entry */ #define R_MIPS_PC16 10 /* PC relative 16 bit */ #define R_MIPS_CALL16 11 /* 16 bit GOT entry for function */ #define R_MIPS_GPREL32 12 /* GP relative 32 bit */ #define R_MIPS_64 18 /* Direct 64 bit */ #define R_MIPS_GOT_DISP 19 #define R_MIPS_GOT_PAGE 20 #define R_MIPS_GOT_OFST 21 #define R_MIPS_GOT_HI16 22 /* GOT HI 16 bit */ #define R_MIPS_GOT_LO16 23 /* GOT LO 16 bit */ #define R_MIPS_SUB 24 #define R_MIPS_CALLHI16 30 /* upper 16 bit GOT entry for function */ #define R_MIPS_CALLLO16 31 /* lower 16 bit GOT entry for function */ #define R_MIPS_JALR 37 #define R_MIPS_TLS_GD 42 #define R_MIPS_COPY 126 #define R_MIPS_JUMP_SLOT 127 #define R_PPC_NONE 0 /* No relocation. 
*/ #define R_PPC_ADDR32 1 #define R_PPC_ADDR24 2 #define R_PPC_ADDR16 3 #define R_PPC_ADDR16_LO 4 #define R_PPC_ADDR16_HI 5 #define R_PPC_ADDR16_HA 6 #define R_PPC_ADDR14 7 #define R_PPC_ADDR14_BRTAKEN 8 #define R_PPC_ADDR14_BRNTAKEN 9 #define R_PPC_REL24 10 #define R_PPC_REL14 11 #define R_PPC_REL14_BRTAKEN 12 #define R_PPC_REL14_BRNTAKEN 13 #define R_PPC_GOT16 14 #define R_PPC_GOT16_LO 15 #define R_PPC_GOT16_HI 16 #define R_PPC_GOT16_HA 17 #define R_PPC_PLTREL24 18 #define R_PPC_COPY 19 #define R_PPC_GLOB_DAT 20 #define R_PPC_JMP_SLOT 21 #define R_PPC_RELATIVE 22 #define R_PPC_LOCAL24PC 23 #define R_PPC_UADDR32 24 #define R_PPC_UADDR16 25 #define R_PPC_REL32 26 #define R_PPC_PLT32 27 #define R_PPC_PLTREL32 28 #define R_PPC_PLT16_LO 29 #define R_PPC_PLT16_HI 30 #define R_PPC_PLT16_HA 31 #define R_PPC_SDAREL16 32 #define R_PPC_SECTOFF 33 #define R_PPC_SECTOFF_LO 34 #define R_PPC_SECTOFF_HI 35 #define R_PPC_SECTOFF_HA 36 #define R_PPC_IRELATIVE 248 /* * 64-bit relocations */ #define R_PPC64_ADDR64 38 #define R_PPC64_ADDR16_HIGHER 39 #define R_PPC64_ADDR16_HIGHERA 40 #define R_PPC64_ADDR16_HIGHEST 41 #define R_PPC64_ADDR16_HIGHESTA 42 #define R_PPC64_UADDR64 43 #define R_PPC64_REL64 44 #define R_PPC64_PLT64 45 #define R_PPC64_PLTREL64 46 #define R_PPC64_TOC16 47 #define R_PPC64_TOC16_LO 48 #define R_PPC64_TOC16_HI 49 #define R_PPC64_TOC16_HA 50 #define R_PPC64_TOC 51 #define R_PPC64_DTPMOD64 68 #define R_PPC64_TPREL64 73 #define R_PPC64_DTPREL64 78 /* * TLS relocations */ #define R_PPC_TLS 67 #define R_PPC_DTPMOD32 68 #define R_PPC_TPREL16 69 #define R_PPC_TPREL16_LO 70 #define R_PPC_TPREL16_HI 71 #define R_PPC_TPREL16_HA 72 #define R_PPC_TPREL32 73 #define R_PPC_DTPREL16 74 #define R_PPC_DTPREL16_LO 75 #define R_PPC_DTPREL16_HI 76 #define R_PPC_DTPREL16_HA 77 #define R_PPC_DTPREL32 78 #define R_PPC_GOT_TLSGD16 79 #define R_PPC_GOT_TLSGD16_LO 80 #define R_PPC_GOT_TLSGD16_HI 81 #define R_PPC_GOT_TLSGD16_HA 82 #define R_PPC_GOT_TLSLD16 83 #define R_PPC_GOT_TLSLD16_LO 84 #define R_PPC_GOT_TLSLD16_HI 85 #define R_PPC_GOT_TLSLD16_HA 86 #define R_PPC_GOT_TPREL16 87 #define R_PPC_GOT_TPREL16_LO 88 #define R_PPC_GOT_TPREL16_HI 89 #define R_PPC_GOT_TPREL16_HA 90 /* * The remaining relocs are from the Embedded ELF ABI, and are not in the * SVR4 ELF ABI. */ #define R_PPC_EMB_NADDR32 101 #define R_PPC_EMB_NADDR16 102 #define R_PPC_EMB_NADDR16_LO 103 #define R_PPC_EMB_NADDR16_HI 104 #define R_PPC_EMB_NADDR16_HA 105 #define R_PPC_EMB_SDAI16 106 #define R_PPC_EMB_SDA2I16 107 #define R_PPC_EMB_SDA2REL 108 #define R_PPC_EMB_SDA21 109 #define R_PPC_EMB_MRKREF 110 #define R_PPC_EMB_RELSEC16 111 #define R_PPC_EMB_RELST_LO 112 #define R_PPC_EMB_RELST_HI 113 #define R_PPC_EMB_RELST_HA 114 #define R_PPC_EMB_BIT_FLD 115 #define R_PPC_EMB_RELSDA 116 /* * RISC-V relocation types. */ /* Relocation types used by the dynamic linker. */ #define R_RISCV_NONE 0 #define R_RISCV_32 1 #define R_RISCV_64 2 #define R_RISCV_RELATIVE 3 #define R_RISCV_COPY 4 #define R_RISCV_JUMP_SLOT 5 #define R_RISCV_TLS_DTPMOD32 6 #define R_RISCV_TLS_DTPMOD64 7 #define R_RISCV_TLS_DTPREL32 8 #define R_RISCV_TLS_DTPREL64 9 #define R_RISCV_TLS_TPREL32 10 #define R_RISCV_TLS_TPREL64 11 /* Relocation types not used by the dynamic linker. 
*/ #define R_RISCV_BRANCH 16 #define R_RISCV_JAL 17 #define R_RISCV_CALL 18 #define R_RISCV_CALL_PLT 19 #define R_RISCV_GOT_HI20 20 #define R_RISCV_TLS_GOT_HI20 21 #define R_RISCV_TLS_GD_HI20 22 #define R_RISCV_PCREL_HI20 23 #define R_RISCV_PCREL_LO12_I 24 #define R_RISCV_PCREL_LO12_S 25 #define R_RISCV_HI20 26 #define R_RISCV_LO12_I 27 #define R_RISCV_LO12_S 28 #define R_RISCV_TPREL_HI20 29 #define R_RISCV_TPREL_LO12_I 30 #define R_RISCV_TPREL_LO12_S 31 #define R_RISCV_TPREL_ADD 32 #define R_RISCV_ADD8 33 #define R_RISCV_ADD16 34 #define R_RISCV_ADD32 35 #define R_RISCV_ADD64 36 #define R_RISCV_SUB8 37 #define R_RISCV_SUB16 38 #define R_RISCV_SUB32 39 #define R_RISCV_SUB64 40 #define R_RISCV_GNU_VTINHERIT 41 #define R_RISCV_GNU_VTENTRY 42 #define R_RISCV_ALIGN 43 #define R_RISCV_RVC_BRANCH 44 #define R_RISCV_RVC_JUMP 45 #define R_RISCV_RVC_LUI 46 #define R_RISCV_GPREL_I 47 #define R_RISCV_GPREL_S 48 #define R_RISCV_TPREL_I 49 #define R_RISCV_TPREL_S 50 #define R_RISCV_RELAX 51 #define R_RISCV_SUB6 52 #define R_RISCV_SET6 53 #define R_RISCV_SET8 54 #define R_RISCV_SET16 55 #define R_RISCV_SET32 56 #define R_SPARC_NONE 0 #define R_SPARC_8 1 #define R_SPARC_16 2 #define R_SPARC_32 3 #define R_SPARC_DISP8 4 #define R_SPARC_DISP16 5 #define R_SPARC_DISP32 6 #define R_SPARC_WDISP30 7 #define R_SPARC_WDISP22 8 #define R_SPARC_HI22 9 #define R_SPARC_22 10 #define R_SPARC_13 11 #define R_SPARC_LO10 12 #define R_SPARC_GOT10 13 #define R_SPARC_GOT13 14 #define R_SPARC_GOT22 15 #define R_SPARC_PC10 16 #define R_SPARC_PC22 17 #define R_SPARC_WPLT30 18 #define R_SPARC_COPY 19 #define R_SPARC_GLOB_DAT 20 #define R_SPARC_JMP_SLOT 21 #define R_SPARC_RELATIVE 22 #define R_SPARC_UA32 23 #define R_SPARC_PLT32 24 #define R_SPARC_HIPLT22 25 #define R_SPARC_LOPLT10 26 #define R_SPARC_PCPLT32 27 #define R_SPARC_PCPLT22 28 #define R_SPARC_PCPLT10 29 #define R_SPARC_10 30 #define R_SPARC_11 31 #define R_SPARC_64 32 #define R_SPARC_OLO10 33 #define R_SPARC_HH22 34 #define R_SPARC_HM10 35 #define R_SPARC_LM22 36 #define R_SPARC_PC_HH22 37 #define R_SPARC_PC_HM10 38 #define R_SPARC_PC_LM22 39 #define R_SPARC_WDISP16 40 #define R_SPARC_WDISP19 41 #define R_SPARC_GLOB_JMP 42 #define R_SPARC_7 43 #define R_SPARC_5 44 #define R_SPARC_6 45 #define R_SPARC_DISP64 46 #define R_SPARC_PLT64 47 #define R_SPARC_HIX22 48 #define R_SPARC_LOX10 49 #define R_SPARC_H44 50 #define R_SPARC_M44 51 #define R_SPARC_L44 52 #define R_SPARC_REGISTER 53 #define R_SPARC_UA64 54 #define R_SPARC_UA16 55 #define R_SPARC_TLS_GD_HI22 56 #define R_SPARC_TLS_GD_LO10 57 #define R_SPARC_TLS_GD_ADD 58 #define R_SPARC_TLS_GD_CALL 59 #define R_SPARC_TLS_LDM_HI22 60 #define R_SPARC_TLS_LDM_LO10 61 #define R_SPARC_TLS_LDM_ADD 62 #define R_SPARC_TLS_LDM_CALL 63 #define R_SPARC_TLS_LDO_HIX22 64 #define R_SPARC_TLS_LDO_LOX10 65 #define R_SPARC_TLS_LDO_ADD 66 #define R_SPARC_TLS_IE_HI22 67 #define R_SPARC_TLS_IE_LO10 68 #define R_SPARC_TLS_IE_LD 69 #define R_SPARC_TLS_IE_LDX 70 #define R_SPARC_TLS_IE_ADD 71 #define R_SPARC_TLS_LE_HIX22 72 #define R_SPARC_TLS_LE_LOX10 73 #define R_SPARC_TLS_DTPMOD32 74 #define R_SPARC_TLS_DTPMOD64 75 #define R_SPARC_TLS_DTPOFF32 76 #define R_SPARC_TLS_DTPOFF64 77 #define R_SPARC_TLS_TPOFF32 78 #define R_SPARC_TLS_TPOFF64 79 #define R_X86_64_NONE 0 /* No relocation. */ #define R_X86_64_64 1 /* Add 64 bit symbol value. */ #define R_X86_64_PC32 2 /* PC-relative 32 bit signed sym value. */ #define R_X86_64_GOT32 3 /* PC-relative 32 bit GOT offset. */ #define R_X86_64_PLT32 4 /* PC-relative 32 bit PLT offset. 
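The relocation tables use the conventional notation S (symbol value), A (addend) and P (place being relocated), as spelled out in the IA-64 calculation column above. A hedged sketch of applying the two simplest x86-64 relocations defined just above; the remaining types are omitted, and memcpy is used to sidestep alignment concerns:

	#include <elf.h>
	#include <stdint.h>
	#include <string.h>

	static void
	apply_x86_64_reloc(uint32_t type, uint64_t s, int64_t a, uint64_t p, void *where)
	{
		uint64_t v64;
		int32_t v32;

		switch (type) {
		case R_X86_64_64:		/* word64: S + A */
			v64 = s + a;
			memcpy(where, &v64, sizeof(v64));
			break;
		case R_X86_64_PC32:		/* word32: S + A - P */
			v32 = (int32_t)(s + a - p);
			memcpy(where, &v32, sizeof(v32));
			break;
		default:
			break;			/* other relocation types not shown */
		}
	}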
*/ #define R_X86_64_COPY 5 /* Copy data from shared object. */ #define R_X86_64_GLOB_DAT 6 /* Set GOT entry to data address. */ #define R_X86_64_JMP_SLOT 7 /* Set GOT entry to code address. */ #define R_X86_64_RELATIVE 8 /* Add load address of shared object. */ #define R_X86_64_GOTPCREL 9 /* Add 32 bit signed pcrel offset to GOT. */ #define R_X86_64_32 10 /* Add 32 bit zero extended symbol value */ #define R_X86_64_32S 11 /* Add 32 bit sign extended symbol value */ #define R_X86_64_16 12 /* Add 16 bit zero extended symbol value */ #define R_X86_64_PC16 13 /* Add 16 bit signed extended pc relative symbol value */ #define R_X86_64_8 14 /* Add 8 bit zero extended symbol value */ #define R_X86_64_PC8 15 /* Add 8 bit signed extended pc relative symbol value */ #define R_X86_64_DTPMOD64 16 /* ID of module containing symbol */ #define R_X86_64_DTPOFF64 17 /* Offset in TLS block */ #define R_X86_64_TPOFF64 18 /* Offset in static TLS block */ #define R_X86_64_TLSGD 19 /* PC relative offset to GD GOT entry */ #define R_X86_64_TLSLD 20 /* PC relative offset to LD GOT entry */ #define R_X86_64_DTPOFF32 21 /* Offset in TLS block */ #define R_X86_64_GOTTPOFF 22 /* PC relative offset to IE GOT entry */ #define R_X86_64_TPOFF32 23 /* Offset in static TLS block */ #define R_X86_64_PC64 24 /* PC-relative 64 bit signed sym value. */ #define R_X86_64_GOTOFF64 25 #define R_X86_64_GOTPC32 26 #define R_X86_64_GOT64 27 #define R_X86_64_GOTPCREL64 28 #define R_X86_64_GOTPC64 29 #define R_X86_64_GOTPLT64 30 #define R_X86_64_PLTOFF64 31 #define R_X86_64_SIZE32 32 #define R_X86_64_SIZE64 33 #define R_X86_64_GOTPC32_TLSDESC 34 #define R_X86_64_TLSDESC_CALL 35 #define R_X86_64_TLSDESC 36 #define R_X86_64_IRELATIVE 37 #endif /* !_SYS_ELF_COMMON_H_ */ Index: projects/clang900-import/sys/sys/mount.h =================================================================== --- projects/clang900-import/sys/sys/mount.h (revision 352586) +++ projects/clang900-import/sys/sys/mount.h (revision 352587) @@ -1,1049 +1,1049 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mount.h 8.21 (Berkeley) 5/20/95 * $FreeBSD$ */ #ifndef _SYS_MOUNT_H_ #define _SYS_MOUNT_H_ #include #include #ifdef _KERNEL #include #include #include #include #include #endif /* * NOTE: When changing statfs structure, mount structure, MNT_* flags or * MNTK_* flags also update DDB show mount command in vfs_subr.c. */ typedef struct fsid { int32_t val[2]; } fsid_t; /* filesystem id type */ /* * File identifier. * These are unique per filesystem on a single machine. */ #define MAXFIDSZ 16 struct fid { u_short fid_len; /* length of data in bytes */ u_short fid_data0; /* force longword alignment */ char fid_data[MAXFIDSZ]; /* data (variable length) */ }; /* * filesystem statistics */ #define MFSNAMELEN 16 /* length of type name including null */ #define MNAMELEN 1024 /* size of on/from name bufs */ #define STATFS_VERSION 0x20140518 /* current version number */ struct statfs { uint32_t f_version; /* structure version number */ uint32_t f_type; /* type of filesystem */ uint64_t f_flags; /* copy of mount exported flags */ uint64_t f_bsize; /* filesystem fragment size */ uint64_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in filesystem */ uint64_t f_bfree; /* free blocks in filesystem */ int64_t f_bavail; /* free blocks avail to non-superuser */ uint64_t f_files; /* total file nodes in filesystem */ int64_t f_ffree; /* free nodes avail to non-superuser */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_spare[10]; /* unused spare */ uint32_t f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the filesystem */ fsid_t f_fsid; /* filesystem id */ char f_charspare[80]; /* spare string space */ char f_fstypename[MFSNAMELEN]; /* filesystem type name */ char f_mntfromname[MNAMELEN]; /* mounted filesystem */ char f_mntonname[MNAMELEN]; /* directory on which mounted */ }; #if defined(_WANT_FREEBSD11_STATFS) || defined(_KERNEL) #define FREEBSD11_STATFS_VERSION 0x20030518 /* current version number */ struct freebsd11_statfs { uint32_t f_version; /* structure version number */ uint32_t f_type; /* type of filesystem */ uint64_t f_flags; /* copy of mount exported flags */ uint64_t f_bsize; /* filesystem fragment size */ uint64_t f_iosize; /* optimal transfer block size */ uint64_t f_blocks; /* total data blocks in filesystem */ uint64_t f_bfree; /* free blocks in filesystem */ int64_t f_bavail; /* free blocks avail to non-superuser */ uint64_t f_files; /* total file nodes in filesystem */ int64_t f_ffree; /* free nodes avail to non-superuser */ uint64_t f_syncwrites; /* count of sync writes since mount */ uint64_t f_asyncwrites; /* count of async writes since mount */ uint64_t f_syncreads; /* count of sync reads since mount */ uint64_t f_asyncreads; /* count of async reads since mount */ uint64_t f_spare[10]; /* unused 
spare */ uint32_t f_namemax; /* maximum filename length */ uid_t f_owner; /* user that mounted the filesystem */ fsid_t f_fsid; /* filesystem id */ char f_charspare[80]; /* spare string space */ char f_fstypename[16]; /* filesystem type name */ char f_mntfromname[88]; /* mounted filesystem */ char f_mntonname[88]; /* directory on which mounted */ }; #endif /* _WANT_FREEBSD11_STATFS || _KERNEL */ #ifdef _KERNEL #define OMFSNAMELEN 16 /* length of fs type name, including null */ #define OMNAMELEN (88 - 2 * sizeof(long)) /* size of on/from name bufs */ /* XXX getfsstat.2 is out of date with write and read counter changes here. */ /* XXX statfs.2 is out of date with read counter changes here. */ struct ostatfs { long f_spare2; /* placeholder */ long f_bsize; /* fundamental filesystem block size */ long f_iosize; /* optimal transfer block size */ long f_blocks; /* total data blocks in filesystem */ long f_bfree; /* free blocks in fs */ long f_bavail; /* free blocks avail to non-superuser */ long f_files; /* total file nodes in filesystem */ long f_ffree; /* free file nodes in fs */ fsid_t f_fsid; /* filesystem id */ uid_t f_owner; /* user that mounted the filesystem */ int f_type; /* type of filesystem */ int f_flags; /* copy of mount exported flags */ long f_syncwrites; /* count of sync writes since mount */ long f_asyncwrites; /* count of async writes since mount */ char f_fstypename[OMFSNAMELEN]; /* fs type name */ char f_mntonname[OMNAMELEN]; /* directory on which mounted */ long f_syncreads; /* count of sync reads since mount */ long f_asyncreads; /* count of async reads since mount */ short f_spares1; /* unused spare */ char f_mntfromname[OMNAMELEN];/* mounted filesystem */ short f_spares2; /* unused spare */ /* * XXX on machines where longs are aligned to 8-byte boundaries, there * is an unnamed int32_t here. This spare was after the apparent end * of the struct until we bit off the read counters from f_mntonname. */ long f_spare[2]; /* unused spare */ }; TAILQ_HEAD(vnodelst, vnode); /* Mount options list */ TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; int pos; int seen; }; /* * Structure per mounted filesystem. Each mounted filesystem has an * array of operations and an instance record. The filesystems are * put on a doubly linked list. * * Lock reference: * l - mnt_listmtx * m - mountlist_mtx * i - interlock * v - vnode freelist mutex * * Unmarked fields are considered stable as long as a ref is held. 
* */ struct mount { struct mtx mnt_mtx; /* mount structure interlock */ int mnt_gen; /* struct mount generation */ #define mnt_startzero mnt_list TAILQ_ENTRY(mount) mnt_list; /* (m) mount list */ struct vfsops *mnt_op; /* operations on fs */ struct vfsconf *mnt_vfc; /* configuration info */ struct vnode *mnt_vnodecovered; /* vnode we mounted on */ struct vnode *mnt_syncer; /* syncer vnode */ int mnt_ref; /* (i) Reference count */ struct vnodelst mnt_nvnodelist; /* (i) list of vnodes */ int mnt_nvnodelistsize; /* (i) # of vnodes */ int mnt_writeopcount; /* (i) write syscalls pending */ int mnt_kern_flag; /* (i) kernel only flags */ uint64_t mnt_flag; /* (i) flags shared with user */ struct vfsoptlist *mnt_opt; /* current mount options */ struct vfsoptlist *mnt_optnew; /* new options passed to fs */ int mnt_maxsymlinklen; /* max size of short symlink */ struct statfs mnt_stat; /* cache of filesystem stats */ struct ucred *mnt_cred; /* credentials of mounter */ void * mnt_data; /* private data */ time_t mnt_time; /* last time written*/ int mnt_iosize_max; /* max size for clusters, etc */ struct netexport *mnt_export; /* export list */ struct label *mnt_label; /* MAC label for the fs */ u_int mnt_hashseed; /* Random seed for vfs_hash */ int mnt_lockref; /* (i) Lock reference count */ int mnt_secondary_writes; /* (i) # of secondary writes */ int mnt_secondary_accwrites;/* (i) secondary wr. starts */ struct thread *mnt_susp_owner; /* (i) thread owning suspension */ #define mnt_endzero mnt_gjprovider char *mnt_gjprovider; /* gjournal provider name */ struct mtx mnt_listmtx; struct vnodelst mnt_activevnodelist; /* (l) list of active vnodes */ int mnt_activevnodelistsize;/* (l) # of active vnodes */ struct vnodelst mnt_tmpfreevnodelist; /* (l) list of free vnodes */ int mnt_tmpfreevnodelistsize;/* (l) # of free vnodes */ struct lock mnt_explock; /* vfs_export walkers lock */ TAILQ_ENTRY(mount) mnt_upper_link; /* (m) we in the all uppers */ TAILQ_HEAD(, mount) mnt_uppers; /* (m) upper mounts over us*/ - int mnt_vfs_ops; /* (i) pending vfs ops */ + int __aligned(CACHE_LINE_SIZE) mnt_vfs_ops;/* (i) pending vfs ops */ int *mnt_thread_in_ops_pcpu; int *mnt_ref_pcpu; int *mnt_lockref_pcpu; int *mnt_writeopcount_pcpu; }; /* * Definitions for MNT_VNODE_FOREACH_ALL. */ struct vnode *__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp); struct vnode *__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp); void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp); #define MNT_VNODE_FOREACH_ALL(vp, mp, mvp) \ for (vp = __mnt_vnode_first_all(&(mvp), (mp)); \ (vp) != NULL; vp = __mnt_vnode_next_all(&(mvp), (mp))) #define MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp) \ do { \ MNT_ILOCK(mp); \ __mnt_vnode_markerfree_all(&(mvp), (mp)); \ /* MNT_IUNLOCK(mp); -- done in above function */ \ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); \ } while (0) /* * Definitions for MNT_VNODE_FOREACH_ACTIVE. 
*/ struct vnode *__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp); struct vnode *__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp); void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *); #define MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) \ for (vp = __mnt_vnode_first_active(&(mvp), (mp)); \ (vp) != NULL; vp = __mnt_vnode_next_active(&(mvp), (mp))) #define MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp) \ __mnt_vnode_markerfree_active(&(mvp), (mp)) #define MNT_ILOCK(mp) mtx_lock(&(mp)->mnt_mtx) #define MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx) #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #define MNT_MTX(mp) (&(mp)->mnt_mtx) #define MNT_REF(mp) do { \ mtx_assert(MNT_MTX(mp), MA_OWNED); \ mp->mnt_ref++; \ } while (0) #define MNT_REL(mp) do { \ mtx_assert(MNT_MTX(mp), MA_OWNED); \ (mp)->mnt_ref--; \ if ((mp)->mnt_vfs_ops && (mp)->mnt_ref < 0) \ vfs_dump_mount_counters(mp); \ if ((mp)->mnt_ref == 0 && (mp)->mnt_vfs_ops) \ wakeup((mp)); \ } while (0) #endif /* _KERNEL */ /* * User specifiable flags, stored in mnt_flag. */ #define MNT_RDONLY 0x0000000000000001ULL /* read only filesystem */ #define MNT_SYNCHRONOUS 0x0000000000000002ULL /* fs written synchronously */ #define MNT_NOEXEC 0x0000000000000004ULL /* can't exec from filesystem */ #define MNT_NOSUID 0x0000000000000008ULL /* don't honor setuid fs bits */ #define MNT_NFS4ACLS 0x0000000000000010ULL /* enable NFS version 4 ACLs */ #define MNT_UNION 0x0000000000000020ULL /* union with underlying fs */ #define MNT_ASYNC 0x0000000000000040ULL /* fs written asynchronously */ #define MNT_SUIDDIR 0x0000000000100000ULL /* special SUID dir handling */ #define MNT_SOFTDEP 0x0000000000200000ULL /* using soft updates */ #define MNT_NOSYMFOLLOW 0x0000000000400000ULL /* do not follow symlinks */ #define MNT_GJOURNAL 0x0000000002000000ULL /* GEOM journal support enabled */ #define MNT_MULTILABEL 0x0000000004000000ULL /* MAC support for objects */ #define MNT_ACLS 0x0000000008000000ULL /* ACL support enabled */ #define MNT_NOATIME 0x0000000010000000ULL /* dont update file access time */ #define MNT_NOCLUSTERR 0x0000000040000000ULL /* disable cluster read */ #define MNT_NOCLUSTERW 0x0000000080000000ULL /* disable cluster write */ #define MNT_SUJ 0x0000000100000000ULL /* using journaled soft updates */ #define MNT_AUTOMOUNTED 0x0000000200000000ULL /* mounted by automountd(8) */ #define MNT_UNTRUSTED 0x0000000800000000ULL /* filesys metadata untrusted */ /* * NFS export related mount flags. */ #define MNT_EXRDONLY 0x0000000000000080ULL /* exported read only */ #define MNT_EXPORTED 0x0000000000000100ULL /* filesystem is exported */ #define MNT_DEFEXPORTED 0x0000000000000200ULL /* exported to the world */ #define MNT_EXPORTANON 0x0000000000000400ULL /* anon uid mapping for all */ #define MNT_EXKERB 0x0000000000000800ULL /* exported with Kerberos */ #define MNT_EXPUBLIC 0x0000000020000000ULL /* public export (WebNFS) */ /* * Flags set by internal operations, * but visible to the user. * XXX some of these are not quite right.. (I've never seen the root flag set) */ #define MNT_LOCAL 0x0000000000001000ULL /* filesystem is stored locally */ #define MNT_QUOTA 0x0000000000002000ULL /* quotas are enabled on fs */ #define MNT_ROOTFS 0x0000000000004000ULL /* identifies the root fs */ #define MNT_USER 0x0000000000008000ULL /* mounted by a user */ #define MNT_IGNORE 0x0000000000800000ULL /* do not show entry in df */ #define MNT_VERIFIED 0x0000000400000000ULL /* filesystem is verified */ /* * Mask of flags that are visible to statfs(). 
* XXX I think that this could now become (~(MNT_CMDFLAGS)) * but the 'mount' program may need changing to handle this. */ #define MNT_VISFLAGMASK (MNT_RDONLY | MNT_SYNCHRONOUS | MNT_NOEXEC | \ MNT_NOSUID | MNT_UNION | MNT_SUJ | \ MNT_ASYNC | MNT_EXRDONLY | MNT_EXPORTED | \ MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_USER | MNT_QUOTA | \ MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | \ MNT_NFS4ACLS | MNT_AUTOMOUNTED | MNT_VERIFIED | \ MNT_UNTRUSTED) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | \ MNT_NOATIME | \ MNT_NOSYMFOLLOW | MNT_IGNORE | \ MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR | \ MNT_ACLS | MNT_USER | MNT_NFS4ACLS | \ MNT_AUTOMOUNTED | MNT_UNTRUSTED) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. * XXX: These are not STATES and really should be somewhere else. * XXX: MNT_BYFSID and MNT_NONBUSY collide with MNT_ACLS and MNT_MULTILABEL, * but because MNT_ACLS and MNT_MULTILABEL are only used for mount(2), * and MNT_BYFSID and MNT_NONBUSY are only used for unmount(2), * it's harmless. */ #define MNT_UPDATE 0x0000000000010000ULL /* not real mount, just update */ #define MNT_DELEXPORT 0x0000000000020000ULL /* delete export host lists */ #define MNT_RELOAD 0x0000000000040000ULL /* reload filesystem data */ #define MNT_FORCE 0x0000000000080000ULL /* force unmount or readonly */ #define MNT_SNAPSHOT 0x0000000001000000ULL /* snapshot the filesystem */ #define MNT_NONBUSY 0x0000000004000000ULL /* check vnode use counts. */ #define MNT_BYFSID 0x0000000008000000ULL /* specify filesystem by ID. */ #define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_NONBUSY | \ MNT_BYFSID) /* * Internal filesystem control flags stored in mnt_kern_flag. * * MNTK_UNMOUNT locks the mount entry so that name lookup cannot * proceed past the mount point. This keeps the subtree stable during * mounts and unmounts. When non-forced unmount flushes all vnodes * from the mp queue, the MNTK_UNMOUNT flag prevents insmntque() from * queueing new vnodes. * * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while * dounmount() is still waiting to lock the mountpoint. This allows * the filesystem to cancel operations that might otherwise deadlock * with the unmount attempt (used by NFS). */ #define MNTK_UNMOUNTF 0x00000001 /* forced unmount in progress */ #define MNTK_ASYNC 0x00000002 /* filtered async flag */ #define MNTK_SOFTDEP 0x00000004 /* async disabled by softdep */ #define MNTK_DRAINING 0x00000010 /* lock draining is happening */ #define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */ #define MNTK_EXTENDED_SHARED 0x00000040 /* Allow shared locking for more ops */ #define MNTK_SHARED_WRITES 0x00000080 /* Allow shared locking for writes */ #define MNTK_NO_IOPF 0x00000100 /* Disallow page faults during reads and writes. Filesystem shall properly handle i/o state on EFAULT. */ #define MNTK_VGONE_UPPER 0x00000200 #define MNTK_VGONE_WAITER 0x00000400 #define MNTK_LOOKUP_EXCL_DOTDOT 0x00000800 #define MNTK_MARKER 0x00001000 #define MNTK_UNMAPPED_BUFS 0x00002000 #define MNTK_USES_BCACHE 0x00004000 /* FS uses the buffer cache. 
*/ #define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */ #define MNTK_NOASYNC 0x00800000 /* disable async */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPEND2 0x04000000 /* block secondary writes */ #define MNTK_SUSPENDED 0x10000000 /* write operations are suspended */ #define MNTK_NULL_NOCACHE 0x20000000 /* auto disable cache for nullfs mounts over this fs */ #define MNTK_LOOKUP_SHARED 0x40000000 /* FS supports shared lock lookups */ #define MNTK_NOKNOTE 0x80000000 /* Don't send KNOTEs from VOP hooks */ #ifdef _KERNEL static inline int MNT_SHARED_WRITES(struct mount *mp) { return (mp != NULL && (mp->mnt_kern_flag & MNTK_SHARED_WRITES) != 0); } static inline int MNT_EXTENDED_SHARED(struct mount *mp) { return (mp != NULL && (mp->mnt_kern_flag & MNTK_EXTENDED_SHARED) != 0); } #endif /* * Sysctl CTL_VFS definitions. * * Second level identifier specifies which filesystem. Second level * identifier VFS_VFSCONF returns information about all filesystems. * Second level identifier VFS_GENERIC is non-terminal. */ #define VFS_VFSCONF 0 /* get configured filesystems */ #define VFS_GENERIC 0 /* generic filesystem information */ /* * Third level identifiers for VFS_GENERIC are given below; third * level identifiers for specific filesystems are given in their * mount specific header files. */ #define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */ #define VFS_CONF 2 /* struct: vfsconf for filesystem given as next argument */ /* * Flags for various system call interfaces. * * waitfor flags to vfs_sync() and getfsstat() */ #define MNT_WAIT 1 /* synchronously wait for I/O to complete */ #define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ #define MNT_LAZY 3 /* push data not written by filesystem syncer */ #define MNT_SUSPEND 4 /* Suspend file system after sync */ /* * Generic file handle */ struct fhandle { fsid_t fh_fsid; /* Filesystem id of mount point */ struct fid fh_fid; /* Filesys specific id */ }; typedef struct fhandle fhandle_t; /* * Old export arguments without security flavor list */ struct oexport_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ }; /* * Export arguments for local filesystem mount calls. */ #define MAXSECFLAVORS 5 struct export_args { int ex_flags; /* export related flags */ uid_t ex_root; /* mapping for root uid */ struct xucred ex_anon; /* mapping for anonymous user */ struct sockaddr *ex_addr; /* net address to which exported */ u_char ex_addrlen; /* and the net address length */ struct sockaddr *ex_mask; /* mask of valid bits in saddr */ u_char ex_masklen; /* and the smask length */ char *ex_indexfile; /* index file for WebNFS URLs */ int ex_numsecflavors; /* security flavor count */ int ex_secflavors[MAXSECFLAVORS]; /* list of security flavors */ }; /* * Structure holding information for a publicly exported filesystem * (WebNFS). Currently the specs allow just for one such filesystem. 
*/ struct nfs_public { int np_valid; /* Do we hold valid information */ fhandle_t np_handle; /* Filehandle for pub fs (internal) */ struct mount *np_mount; /* Mountpoint of exported fs */ char *np_index; /* Index file */ }; /* * Filesystem configuration information. One of these exists for each * type of filesystem supported by the kernel. These are searched at * mount time to identify the requested filesystem. * * XXX: Never change the first two arguments! */ struct vfsconf { u_int vfc_version; /* ABI version number */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ struct vfsops *vfc_vfsops; /* filesystem operations vector */ struct vfsops *vfc_vfsops_sd; /* ... signal-deferred */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ int vfc_prison_flag; /* prison allow.mount.* flag */ struct vfsoptdecl *vfc_opts; /* mount options */ TAILQ_ENTRY(vfsconf) vfc_list; /* list of vfscons */ }; /* Userland version of the struct vfsconf. */ struct xvfsconf { struct vfsops *vfc_vfsops; /* filesystem operations vector */ char vfc_name[MFSNAMELEN]; /* filesystem type name */ int vfc_typenum; /* historic filesystem type number */ int vfc_refcount; /* number mounted of this type */ int vfc_flags; /* permanent flags */ struct vfsconf *vfc_next; /* next in list */ }; #ifndef BURN_BRIDGES struct ovfsconf { void *vfc_vfsops; char vfc_name[32]; int vfc_index; int vfc_refcount; int vfc_flags; }; #endif /* * NB: these flags refer to IMPLEMENTATION properties, not properties of * any actual mounts; i.e., it does not make sense to change the flags. */ #define VFCF_STATIC 0x00010000 /* statically compiled into kernel */ #define VFCF_NETWORK 0x00020000 /* may get data over the network */ #define VFCF_READONLY 0x00040000 /* writes are not implemented */ #define VFCF_SYNTHETIC 0x00080000 /* data does not represent real files */ #define VFCF_LOOPBACK 0x00100000 /* aliases some other mounted FS */ #define VFCF_UNICODE 0x00200000 /* stores file names as Unicode */ #define VFCF_JAIL 0x00400000 /* can be mounted from within a jail */ #define VFCF_DELEGADMIN 0x00800000 /* supports delegated administration */ #define VFCF_SBDRY 0x01000000 /* Stop at Boundary: defer stop requests to kernel->user (AST) transition */ typedef uint32_t fsctlop_t; struct vfsidctl { int vc_vers; /* should be VFSIDCTL_VERS1 (below) */ fsid_t vc_fsid; /* fsid to operate on */ char vc_fstypename[MFSNAMELEN]; /* type of fs 'nfs' or '*' */ fsctlop_t vc_op; /* operation VFS_CTL_* (below) */ void *vc_ptr; /* pointer to data structure */ size_t vc_len; /* sizeof said structure */ u_int32_t vc_spare[12]; /* spare (must be zero) */ }; /* vfsidctl API version. */ #define VFS_CTL_VERS1 0x01 /* * New style VFS sysctls, do not reuse/conflict with the namespace for * private sysctls. * All "global" sysctl ops have the 33rd bit set: * 0x...1.... * Private sysctl ops should have the 33rd bit unset. */ #define VFS_CTL_QUERY 0x00010001 /* anything wrong? 
(vfsquery) */ #define VFS_CTL_TIMEO 0x00010002 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010003 /* disable file locking */ struct vfsquery { u_int32_t vq_flags; u_int32_t vq_spare[31]; }; /* vfsquery flags */ #define VQ_NOTRESP 0x0001 /* server down */ #define VQ_NEEDAUTH 0x0002 /* server bad auth */ #define VQ_LOWDISK 0x0004 /* we're low on space */ #define VQ_MOUNT 0x0008 /* new filesystem arrived */ #define VQ_UNMOUNT 0x0010 /* filesystem has left */ #define VQ_DEAD 0x0020 /* filesystem is dead, needs force unmount */ #define VQ_ASSIST 0x0040 /* filesystem needs assistance from external program */ #define VQ_NOTRESPLOCK 0x0080 /* server lockd down */ #define VQ_FLAG0100 0x0100 /* placeholder */ #define VQ_FLAG0200 0x0200 /* placeholder */ #define VQ_FLAG0400 0x0400 /* placeholder */ #define VQ_FLAG0800 0x0800 /* placeholder */ #define VQ_FLAG1000 0x1000 /* placeholder */ #define VQ_FLAG2000 0x2000 /* placeholder */ #define VQ_FLAG4000 0x4000 /* placeholder */ #define VQ_FLAG8000 0x8000 /* placeholder */ #ifdef _KERNEL /* Point a sysctl request at a vfsidctl's data. */ #define VCTLTOREQ(vc, req) \ do { \ (req)->newptr = (vc)->vc_ptr; \ (req)->newlen = (vc)->vc_len; \ (req)->newidx = 0; \ } while (0) #endif struct iovec; struct uio; #ifdef _KERNEL /* * vfs_busy specific flags and mask. */ #define MBF_NOWAIT 0x01 #define MBF_MNTLSTLOCK 0x02 #define MBF_MASK (MBF_NOWAIT | MBF_MNTLSTLOCK) #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_MOUNT); MALLOC_DECLARE(M_STATFS); #endif extern int maxvfsconf; /* highest defined filesystem type */ TAILQ_HEAD(vfsconfhead, vfsconf); extern struct vfsconfhead vfsconf; /* * Operations supported on mounted filesystem. */ struct mount_args; struct nameidata; struct sysctl_req; struct mntarg; /* * N.B., vfs_cmount is the ancient vfsop invoked by the old mount(2) syscall. * The new way is vfs_mount. * * vfs_cmount implementations typically translate arguments from their * respective old per-FS structures into the key-value list supported by * nmount(2), then use kernel_mount(9) to mimic nmount(2) from kernelspace. * * Filesystems with mounters that use nmount(2) do not need to and should not * implement vfs_cmount. Hopefully a future cleanup can remove vfs_cmount and * mount(2) entirely. 
*/ typedef int vfs_cmount_t(struct mntarg *ma, void *data, uint64_t flags); typedef int vfs_unmount_t(struct mount *mp, int mntflags); typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp); typedef int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid, void *arg); typedef int vfs_statfs_t(struct mount *mp, struct statfs *sbp); typedef int vfs_sync_t(struct mount *mp, int waitfor); typedef int vfs_vget_t(struct mount *mp, ino_t ino, int flags, struct vnode **vpp); typedef int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp); typedef int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); typedef int vfs_init_t(struct vfsconf *); typedef int vfs_uninit_t(struct vfsconf *); typedef int vfs_extattrctl_t(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname); typedef int vfs_mount_t(struct mount *mp); typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op, struct sysctl_req *req); typedef void vfs_susp_clean_t(struct mount *mp); typedef void vfs_notify_lowervp_t(struct mount *mp, struct vnode *lowervp); typedef void vfs_purge_t(struct mount *mp); struct vfsops { vfs_mount_t *vfs_mount; vfs_cmount_t *vfs_cmount; vfs_unmount_t *vfs_unmount; vfs_root_t *vfs_root; vfs_quotactl_t *vfs_quotactl; vfs_statfs_t *vfs_statfs; vfs_sync_t *vfs_sync; vfs_vget_t *vfs_vget; vfs_fhtovp_t *vfs_fhtovp; vfs_checkexp_t *vfs_checkexp; vfs_init_t *vfs_init; vfs_uninit_t *vfs_uninit; vfs_extattrctl_t *vfs_extattrctl; vfs_sysctl_t *vfs_sysctl; vfs_susp_clean_t *vfs_susp_clean; vfs_notify_lowervp_t *vfs_reclaim_lowervp; vfs_notify_lowervp_t *vfs_unlink_lowervp; vfs_purge_t *vfs_purge; vfs_mount_t *vfs_spare[6]; /* spares for ABI compat */ }; vfs_statfs_t __vfs_statfs; #define VFS_MOUNT(MP) ({ \ int _rc; \ \ TSRAW(curthread, TS_ENTER, "VFS_MOUNT", (MP)->mnt_vfc->vfc_name);\ _rc = (*(MP)->mnt_op->vfs_mount)(MP); \ TSRAW(curthread, TS_EXIT, "VFS_MOUNT", (MP)->mnt_vfc->vfc_name);\ _rc; }) #define VFS_UNMOUNT(MP, FORCE) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_unmount)(MP, FORCE); \ _rc; }) #define VFS_ROOT(MP, FLAGS, VPP) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_root)(MP, FLAGS, VPP); \ _rc; }) #define VFS_QUOTACTL(MP, C, U, A) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A); \ _rc; }) #define VFS_STATFS(MP, SBP) ({ \ int _rc; \ \ _rc = __vfs_statfs((MP), (SBP)); \ _rc; }) #define VFS_SYNC(MP, WAIT) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_sync)(MP, WAIT); \ _rc; }) #define VFS_VGET(MP, INO, FLAGS, VPP) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP); \ _rc; }) #define VFS_FHTOVP(MP, FIDP, FLAGS, VPP) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, FLAGS, VPP); \ _rc; }) #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED, NUMSEC, SEC) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED, NUMSEC,\ SEC); \ _rc; }) #define VFS_EXTATTRCTL(MP, C, FN, NS, N) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N); \ _rc; }) #define VFS_SYSCTL(MP, OP, REQ) ({ \ int _rc; \ \ _rc = (*(MP)->mnt_op->vfs_sysctl)(MP, OP, REQ); \ _rc; }) #define VFS_SUSP_CLEAN(MP) do { \ if (*(MP)->mnt_op->vfs_susp_clean != NULL) { \ (*(MP)->mnt_op->vfs_susp_clean)(MP); \ } \ } while (0) #define VFS_RECLAIM_LOWERVP(MP, VP) do { \ if (*(MP)->mnt_op->vfs_reclaim_lowervp != NULL) { \ (*(MP)->mnt_op->vfs_reclaim_lowervp)((MP), (VP)); \ } \ } while (0) #define VFS_UNLINK_LOWERVP(MP, VP) do { 
\ if (*(MP)->mnt_op->vfs_unlink_lowervp != NULL) { \ (*(MP)->mnt_op->vfs_unlink_lowervp)((MP), (VP)); \ } \ } while (0) #define VFS_PURGE(MP) do { \ if (*(MP)->mnt_op->vfs_purge != NULL) { \ (*(MP)->mnt_op->vfs_purge)(MP); \ } \ } while (0) #define VFS_KNOTE_LOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), KNF_LISTLOCKED); \ } while (0) #define VFS_KNOTE_UNLOCKED(vp, hint) do \ { \ if (((vp)->v_vflag & VV_NOKNOTE) == 0) \ VN_KNOTE((vp), (hint), 0); \ } while (0) #define VFS_NOTIFY_UPPER_RECLAIM 1 #define VFS_NOTIFY_UPPER_UNLINK 2 #include /* * Version numbers. */ #define VFS_VERSION_00 0x19660120 #define VFS_VERSION_01 0x20121030 #define VFS_VERSION_02 0x20180504 #define VFS_VERSION VFS_VERSION_02 #define VFS_SET(vfsops, fsname, flags) \ static struct vfsconf fsname ## _vfsconf = { \ .vfc_version = VFS_VERSION, \ .vfc_name = #fsname, \ .vfc_vfsops = &vfsops, \ .vfc_typenum = -1, \ .vfc_flags = flags, \ }; \ static moduledata_t fsname ## _mod = { \ #fsname, \ vfs_modevent, \ & fsname ## _vfsconf \ }; \ DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE) /* * exported vnode operations */ int dounmount(struct mount *, int, struct thread *); int kernel_mount(struct mntarg *ma, uint64_t flags); int kernel_vmount(int flags, ...); struct mntarg *mount_arg(struct mntarg *ma, const char *name, const void *val, int len); struct mntarg *mount_argb(struct mntarg *ma, int flag, const char *name); struct mntarg *mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...); struct mntarg *mount_argsu(struct mntarg *ma, const char *name, const void *val, int len); void statfs_scale_blocks(struct statfs *sf, long max_size); struct vfsconf *vfs_byname(const char *); struct vfsconf *vfs_byname_kld(const char *, struct thread *td, int *); void vfs_mount_destroy(struct mount *); void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); void vfs_freeopts(struct vfsoptlist *opts); void vfs_deleteopt(struct vfsoptlist *opts, const char *name); int vfs_buildopts(struct uio *auio, struct vfsoptlist **options); int vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, uint64_t val); int vfs_getopt(struct vfsoptlist *, const char *, void **, int *); int vfs_getopt_pos(struct vfsoptlist *opts, const char *name); int vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value); char *vfs_getopts(struct vfsoptlist *, const char *, int *error); int vfs_copyopt(struct vfsoptlist *, const char *, void *, int); int vfs_filteropt(struct vfsoptlist *, const char **legal); void vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...); int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...); int vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len); int vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len); int vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value); int vfs_setpublicfs /* set publicly exported fs */ (struct mount *, struct netexport *, struct export_args *); void vfs_msync(struct mount *, int); int vfs_busy(struct mount *, int); int vfs_export /* process mount export info */ (struct mount *, struct export_args *); void vfs_allocate_syncvnode(struct mount *); void vfs_deallocate_syncvnode(struct mount *); int vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions); void vfs_getnewfsid(struct mount *); struct cdev *vfs_getrootfsid(struct mount *); struct mount *vfs_getvfs(fsid_t *); /* return vfs given fsid */ 
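/*
 * Illustrative sketch (not part of sys/mount.h): one plausible shape for a
 * vfs_cmount hook, following the comment above -- translate the filesystem's
 * legacy mount(2) argument structure into the nmount(2)-style key/value list
 * with the mount_arg*() helpers declared above, then hand it to
 * kernel_mount(9).  The "examplefs" name and its argument structure are
 * hypothetical; real filesystems pass their own option names.
 */
struct examplefs_args {			/* hypothetical old-style arguments */
	char	*fspec;			/* block device or remote path */
	int	 fsflags;		/* filesystem-specific flags */
};

static int
examplefs_cmount(struct mntarg *ma, void *data, uint64_t flags)
{
	struct examplefs_args args;
	int error;

	/* Copy the legacy argument block in from userspace. */
	error = copyin(data, &args, sizeof(args));
	if (error != 0)
		return (error);

	/* Re-express it as nmount(2) key/value options... */
	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
	ma = mount_argf(ma, "fsflags", "%d", args.fsflags);

	/* ...and let the generic mount path invoke this fs's vfs_mount. */
	return (kernel_mount(ma, flags));
}
/*
 * With such a hook installed in the filesystem's struct vfsops, the old
 * mount(2) entry point and nmount(2) converge on the same vfs_mount
 * implementation.
 */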
struct mount *vfs_busyfs(fsid_t *); int vfs_modevent(module_t, int, void *); void vfs_mount_error(struct mount *, const char *, ...); void vfs_mountroot(void); /* mount our root filesystem */ void vfs_mountedfrom(struct mount *, const char *from); void vfs_notify_upper(struct vnode *, int); void vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp); void vfs_ref(struct mount *); void vfs_rel(struct mount *); struct mount *vfs_mount_alloc(struct vnode *, struct vfsconf *, const char *, struct ucred *); int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *); void vfs_unmountall(void); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx mountlist_mtx; extern struct nfs_public nfs_pub; extern struct sx vfsconf_sx; #define vfsconf_lock() sx_xlock(&vfsconf_sx) #define vfsconf_unlock() sx_xunlock(&vfsconf_sx) #define vfsconf_slock() sx_slock(&vfsconf_sx) #define vfsconf_sunlock() sx_sunlock(&vfsconf_sx) /* * Declarations for these vfs default operations are located in * kern/vfs_default.c. They will be automatically used to replace * null entries in VFS ops tables when registering a new filesystem * type in the global table. */ vfs_root_t vfs_stdroot; vfs_quotactl_t vfs_stdquotactl; vfs_statfs_t vfs_stdstatfs; vfs_sync_t vfs_stdsync; vfs_sync_t vfs_stdnosync; vfs_vget_t vfs_stdvget; vfs_fhtovp_t vfs_stdfhtovp; vfs_checkexp_t vfs_stdcheckexp; vfs_init_t vfs_stdinit; vfs_uninit_t vfs_stduninit; vfs_extattrctl_t vfs_stdextattrctl; vfs_sysctl_t vfs_stdsysctl; void syncer_suspend(void); void syncer_resume(void); void vfs_op_barrier_wait(struct mount *); void vfs_op_enter(struct mount *); void vfs_op_exit_locked(struct mount *); void vfs_op_exit(struct mount *); #ifdef DIAGNOSTIC void vfs_assert_mount_counters(struct mount *); void vfs_dump_mount_counters(struct mount *); #else #define vfs_assert_mount_counters(mp) do { } while (0) #define vfs_dump_mount_counters(mp) do { } while (0) #endif enum mount_counter { MNT_COUNT_REF, MNT_COUNT_LOCKREF, MNT_COUNT_WRITEOPCOUNT }; int vfs_mount_fetch_counter(struct mount *, enum mount_counter); /* * We mark ourselves as entering the section and post a sequentially consistent * fence, meaning the store is completed before we get into the section and * mnt_vfs_ops is only read afterwards. * * Any thread transitioning the ops counter 0->1 does things in the opposite * order - first bumps the count, posts a sequentially consistent fence and * observes all CPUs not executing within the section. * * This provides an invariant that by the time the last CPU is observed not * executing, everyone else entering will see the counter > 0 and exit. * * Note there is no barrier between vfs_ops and the rest of the code in the * section. It is not necessary as the writer has to wait for everyone to drain * before making any changes or only make changes safe while the section is * executed. 
*/ #define vfs_op_thread_entered(mp) ({ \ MPASS(curthread->td_critnest > 0); \ *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) == 1; \ }) #define vfs_op_thread_enter(mp) ({ \ bool _retval = true; \ critical_enter(); \ MPASS(!vfs_op_thread_entered(mp)); \ *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 1; \ atomic_thread_fence_seq_cst(); \ if (__predict_false(mp->mnt_vfs_ops > 0)) { \ vfs_op_thread_exit(mp); \ _retval = false; \ } \ _retval; \ }) #define vfs_op_thread_exit(mp) do { \ MPASS(vfs_op_thread_entered(mp)); \ atomic_thread_fence_rel(); \ *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 0; \ critical_exit(); \ } while (0) #define vfs_mp_count_add_pcpu(mp, count, val) do { \ MPASS(vfs_op_thread_entered(mp)); \ (*(int *)zpcpu_get(mp->mnt_##count##_pcpu)) += val; \ } while (0) #define vfs_mp_count_sub_pcpu(mp, count, val) do { \ MPASS(vfs_op_thread_entered(mp)); \ (*(int *)zpcpu_get(mp->mnt_##count##_pcpu)) -= val; \ } while (0) #else /* !_KERNEL */ #include struct stat; __BEGIN_DECLS int fhlink(struct fhandle *, const char *); int fhlinkat(struct fhandle *, int, const char *); int fhopen(const struct fhandle *, int); int fhreadlink(struct fhandle *, char *, size_t); int fhstat(const struct fhandle *, struct stat *); int fhstatfs(const struct fhandle *, struct statfs *); int fstatfs(int, struct statfs *); int getfh(const char *, fhandle_t *); int getfhat(int, char *, struct fhandle *, int); int getfsstat(struct statfs *, long, int); int getmntinfo(struct statfs **, int); int lgetfh(const char *, fhandle_t *); int mount(const char *, const char *, int, void *); int nmount(struct iovec *, unsigned int, int); int statfs(const char *, struct statfs *); int unmount(const char *, int); /* C library stuff */ int getvfsbyname(const char *, struct xvfsconf *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_MOUNT_H_ */ Index: projects/clang900-import/usr.bin/jot/jot.1 =================================================================== --- projects/clang900-import/usr.bin/jot/jot.1 (revision 352586) +++ projects/clang900-import/usr.bin/jot/jot.1 (revision 352587) @@ -1,328 +1,330 @@ .\" Copyright (c) 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)jot.1 8.1 (Berkeley) 6/6/93 .\" $FreeBSD$ .\" -.Dd April 7, 2015 +.Dd September 21, 2019 .Dt JOT 1 .Os .Sh NAME .Nm jot .Nd print sequential or random data .Sh SYNOPSIS .Nm .Op Fl cnr .Op Fl b Ar word .Op Fl w Ar word .Op Fl s Ar string .Op Fl p Ar precision .Op Ar reps Op Ar begin Op Ar end Op Ar s .Sh DESCRIPTION The .Nm utility is used to print out increasing, decreasing, random, or redundant data, usually numbers, one per line. .Pp The following options are available: .Bl -tag -width indent .It Fl r Generate random data instead of the default sequential data. .It Fl b Ar word Just print .Ar word repetitively. .It Fl w Ar word Print .Ar word with the generated data appended to it. Octal, hexadecimal, exponential, .Tn ASCII , zero padded, and right-adjusted representations are possible by using the appropriate .Xr printf 3 conversion specification inside .Ar word , in which case the data are inserted rather than appended. .It Fl c This is an abbreviation for .Fl w Ar %c . .It Fl s Ar string Print data separated by .Ar string . Normally, newlines separate data. .It Fl n Do not print the final newline normally appended to the output. .It Fl p Ar precision Print only as many digits or characters of the data as indicated by the integer .Ar precision . In the absence of .Fl p , the precision is the greater of the precisions of .Ar begin and .Ar end . The .Fl p option is overridden by whatever appears in a .Xr printf 3 conversion following .Fl w . .El .Pp The last four arguments indicate, respectively, the number of data, the lower bound, the upper bound, and the step size or, for random data, the seed. While at least one of them must appear, any of the other three may be omitted, and will be considered as such if given as .Fl "" or as an empty string. Any three of these arguments determines the fourth. If four are specified and the given and computed values of .Ar reps conflict, the lower value is used. -If fewer than three are specified, defaults are assigned -left to right, except for +If one or two are specified, defaults are assigned +starting with .Ar s , -which assumes a default of 1 or -1 if both +which assumes a default of 1 (or -1 if .Ar begin and .Ar end -are given. +specify a descending range). +Then the default values are assigned to the leftmost omitted arguments until +three arguments are set. .Pp Defaults for the four arguments are, respectively, 100, 1, 100, and 1, except that when random data are requested, the seed, .Ar s , is picked randomly. The .Ar reps argument is expected to be an unsigned integer, and if given as zero is taken to be infinite. The .Ar begin and .Ar end arguments may be given as real numbers or as characters representing the corresponding value in .Tn ASCII . The last argument must be a real number. .Pp Random numbers are obtained through .Xr arc4random 3 when no seed is specified, and through .Xr random 3 when a seed is given. 
When .Nm is asked to generate random integers or characters with begin and end values in the range of the random number generator function and no format is specified with one of the .Fl w , .Fl b , or .Fl p options, .Nm will arrange for all the values in the range to appear in the output with an equal probability. In all other cases be careful to ensure that the output format's rounding or truncation will not skew the distribution of output values in an unintended way. .Pp The name .Nm derives in part from .Nm iota , a function in APL. .Ss Rounding and truncation The .Nm utility uses double precision floating point arithmetic internally. Before printing a number, it is converted depending on the output format used. .Pp If no output format is specified or the output format is a floating point format .Po .Sq E , .Sq G , .Sq e , .Sq f , or .Sq g .Pc , the value is rounded using the .Xr printf 3 function, taking into account the requested precision. .Pp If the output format is an integer format .Po .Sq D , .Sq O , .Sq U , .Sq X , .Sq c , .Sq d , .Sq i , .Sq o , .Sq u , or .Sq x .Pc , the value is converted to an integer value by truncation. .Pp As an illustration, consider the following command: .Bd -literal -offset indent $ jot 6 1 10 0.5 1 2 2 2 3 4 .Ed .Pp By requesting an explicit precision of 1, the values generated before rounding can be seen. The .5 values are rounded down if the integer part is even, up otherwise. .Bd -literal -offset indent $ jot -p 1 6 1 10 0.5 1.0 1.5 2.0 2.5 3.0 3.5 .Ed .Pp By offsetting the values slightly, the values generated by the following command are always rounded down: .Bd -literal -offset indent $ jot -p 0 6 .9999999999 10 0.5 1 1 2 2 3 3 .Ed .Pp Another way of achieving the same result is to force truncation by specifying an integer format: .Bd -literal -offset indent $ jot -w %d 6 1 10 0.5 .Ed .Sh EXIT STATUS .Ex -std .Sh EXAMPLES The command .Dl jot - 1 10 .Pp prints the integers from 1 to 10, while the command .Dl jot 21 -1 1.00 .Pp prints 21 evenly spaced numbers increasing from -1 to 1. The .Tn ASCII character set is generated with .Dl jot -c 128 0 .Pp and the strings xaa through xaz with .Dl jot -w xa%c 26 a .Pp while 20 random 8-letter strings are produced with .Dl "jot -r -c 160 a z | rs -g 0 8" .Pp Infinitely many .Em yes Ns 's may be obtained through .Dl jot -b yes 0 .Pp and thirty .Xr ed 1 substitution commands applying to lines 2, 7, 12, etc.\& is the result of .Dl jot -w %ds/old/new/ 30 2 - 5 .Pp The stuttering sequence 9, 9, 8, 8, 7, etc.\& can be produced by truncating the output precision and a suitable choice of step size, as in .Dl jot -w %d - 9.5 0 -.5 .Pp and a file containing exactly 1024 bytes is created with .Dl jot -b x 512 > block .Pp Finally, to set tabs four spaces apart starting from column 10 and ending in column 132, use .Dl expand -`jot -s, - 10 132 4` .Pp and to print all lines 80 characters or longer, .Dl grep `jot -s \&"\&" -b \&. 80` .Sh DIAGNOSTICS The following diagnostic messages deserve special explanation: .Bl -diag .It "illegal or unsupported format '%s'" The requested conversion format specifier for .Xr printf 3 was not of the form .Dl %[#][ ][{+,-}][0-9]*[.[0-9]*]? where .Dq ?\& must be one of .Dl [l]{d,i,o,u,x} or .Dl {c,e,f,g,D,E,G,O,U,X} .It "range error in conversion" A value to be printed fell outside the range of the data type associated with the requested output format. .It "too many conversions" More than one conversion format specifier has been supplied, but only one is allowed. 
.El .Sh SEE ALSO .Xr ed 1 , .Xr expand 1 , .Xr rs 1 , .Xr seq 1 , .Xr yes 1 , .Xr arc4random 3 , .Xr printf 3 , .Xr random 3 .Sh HISTORY The .Nm utility first appeared in .Bx 4.2 . .Sh AUTHORS .An John A. Kunze Index: projects/clang900-import/usr.bin/quota/quota.c =================================================================== --- projects/clang900-import/usr.bin/quota/quota.c (revision 352586) +++ projects/clang900-import/usr.bin/quota/quota.c (revision 352587) @@ -1,699 +1,699 @@ /* * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1980, 1990, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif #ifndef lint static const char sccsid[] = "from: @(#)quota.c 8.1 (Berkeley) 6/6/93"; #endif /* not lint */ /* * Disk quota reporting program. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static const char *qfextension[] = INITQFNAMES; struct quotause { struct quotause *next; long flags; struct dqblk dqblk; char fsname[MAXPATHLEN + 1]; }; static char *timeprt(int64_t seconds); static struct quotause *getprivs(long id, int quotatype); static void usage(void); static int showuid(u_long uid); static int showgid(u_long gid); static int showusrname(char *name); static int showgrpname(char *name); static int showquotas(int type, u_long id, const char *name); static void showrawquotas(int type, u_long id, struct quotause *qup); static void heading(int type, u_long id, const char *name, const char *tag); static int getufsquota(struct fstab *fs, struct quotause *qup, long id, int quotatype); static int getnfsquota(struct statfs *fst, struct quotause *qup, long id, int quotatype); static enum clnt_stat callaurpc(char *host, int prognum, int versnum, int procnum, xdrproc_t inproc, char *in, xdrproc_t outproc, char *out); static int alldigits(char *s); static int hflag; static int lflag; static int rflag; static int qflag; static int vflag; static char *filename = NULL; int main(int argc, char *argv[]) { int ngroups; gid_t mygid, gidset[NGROUPS]; int i, ch, gflag = 0, uflag = 0, errflag = 0; while ((ch = getopt(argc, argv, "f:ghlrquv")) != -1) { switch(ch) { case 'f': filename = optarg; break; case 'g': gflag++; break; case 'h': hflag++; break; case 'l': lflag++; break; case 'q': qflag++; break; case 'r': rflag++; break; case 'u': uflag++; break; case 'v': vflag++; break; default: usage(); } } argc -= optind; argv += optind; if (!uflag && !gflag) uflag++; if (argc == 0) { if (uflag) errflag += showuid(getuid()); if (gflag) { mygid = getgid(); ngroups = getgroups(NGROUPS, gidset); if (ngroups < 0) err(1, "getgroups"); errflag += showgid(mygid); for (i = 0; i < ngroups; i++) if (gidset[i] != mygid) errflag += showgid(gidset[i]); } return(errflag); } if (uflag && gflag) usage(); if (uflag) { for (; argc > 0; argc--, argv++) { if (alldigits(*argv)) errflag += showuid(atoi(*argv)); else errflag += showusrname(*argv); } return(errflag); } if (gflag) { for (; argc > 0; argc--, argv++) { if (alldigits(*argv)) errflag += showgid(atoi(*argv)); else errflag += showgrpname(*argv); } } return(errflag); } static void usage(void) { fprintf(stderr, "%s\n%s\n%s\n", "usage: quota [-ghlu] [-f path] [-v | -q | -r]", " quota [-hlu] [-f path] [-v | -q | -r] user ...", " quota -g [-hl] [-f path] [-v | -q | -r] group ..."); exit(1); } /* * Print out quotas for a specified user identifier. */ static int showuid(u_long uid) { struct passwd *pwd = getpwuid(uid); const char *name; if (pwd == NULL) name = "(no account)"; else name = pwd->pw_name; return(showquotas(USRQUOTA, uid, name)); } /* * Print out quotas for a specifed user name. */ static int showusrname(char *name) { struct passwd *pwd = getpwnam(name); if (pwd == NULL) { warnx("%s: unknown user", name); return(1); } return(showquotas(USRQUOTA, pwd->pw_uid, name)); } /* * Print out quotas for a specified group identifier. */ static int showgid(u_long gid) { struct group *grp = getgrgid(gid); const char *name; if (grp == NULL) name = "(no entry)"; else name = grp->gr_name; return(showquotas(GRPQUOTA, gid, name)); } /* * Print out quotas for a specifed group name. 
*/ static int showgrpname(char *name) { struct group *grp = getgrnam(name); if (grp == NULL) { warnx("%s: unknown group", name); return(1); } return(showquotas(GRPQUOTA, grp->gr_gid, name)); } static void prthumanval(int len, u_int64_t bytes) { char buf[len + 1]; /* * Limit the width to 5 bytes as that is what users expect. */ humanize_number(buf, MIN(sizeof(buf), 5), bytes, "", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); (void)printf(" %*s", len, buf); } static int showquotas(int type, u_long id, const char *name) { struct quotause *qup; struct quotause *quplist; const char *msgi, *msgb; const char *nam; char *bgrace = NULL, *igrace = NULL; int lines = 0, overquota = 0; static time_t now; if (now == 0) time(&now); quplist = getprivs(id, type); for (qup = quplist; qup; qup = qup->next) { msgi = NULL; if (qup->dqblk.dqb_ihardlimit && qup->dqblk.dqb_curinodes >= qup->dqblk.dqb_ihardlimit) { overquota++; msgi = "File limit reached on"; } else if (qup->dqblk.dqb_isoftlimit && qup->dqblk.dqb_curinodes >= qup->dqblk.dqb_isoftlimit) { overquota++; if (qup->dqblk.dqb_itime > now) msgi = "In file grace period on"; else msgi = "Over file quota on"; } msgb = NULL; if (qup->dqblk.dqb_bhardlimit && qup->dqblk.dqb_curblocks >= qup->dqblk.dqb_bhardlimit) { overquota++; msgb = "Block limit reached on"; } else if (qup->dqblk.dqb_bsoftlimit && qup->dqblk.dqb_curblocks >= qup->dqblk.dqb_bsoftlimit) { overquota++; if (qup->dqblk.dqb_btime > now) msgb = "In block grace period on"; else msgb = "Over block quota on"; } if (rflag) { showrawquotas(type, id, qup); continue; } if (!vflag && qup->dqblk.dqb_isoftlimit == 0 && qup->dqblk.dqb_ihardlimit == 0 && qup->dqblk.dqb_bsoftlimit == 0 && qup->dqblk.dqb_bhardlimit == 0) continue; if (qflag) { if ((msgi != NULL || msgb != NULL) && lines++ == 0) heading(type, id, name, ""); if (msgi != NULL) printf("\t%s %s\n", msgi, qup->fsname); if (msgb != NULL) printf("\t%s %s\n", msgb, qup->fsname); continue; } if (!vflag && qup->dqblk.dqb_curblocks == 0 && qup->dqblk.dqb_curinodes == 0) continue; if (lines++ == 0) heading(type, id, name, ""); nam = qup->fsname; if (strlen(qup->fsname) > 15) { printf("%s\n", qup->fsname); nam = ""; } printf("%-15s", nam); if (hflag) { prthumanval(7, dbtob(qup->dqblk.dqb_curblocks)); printf("%c", (msgb == NULL) ? ' ' : '*'); prthumanval(7, dbtob(qup->dqblk.dqb_bsoftlimit)); prthumanval(7, dbtob(qup->dqblk.dqb_bhardlimit)); } else { printf(" %7ju%c %7ju %7ju", (uintmax_t)dbtob(qup->dqblk.dqb_curblocks) / 1024, (msgb == NULL) ? ' ' : '*', (uintmax_t)dbtob(qup->dqblk.dqb_bsoftlimit) / 1024, (uintmax_t)dbtob(qup->dqblk.dqb_bhardlimit) / 1024); } if (msgb != NULL) bgrace = timeprt(qup->dqblk.dqb_btime); if (msgi != NULL) igrace = timeprt(qup->dqblk.dqb_itime); printf("%8s %6ju%c %6ju %6ju%8s\n" , (msgb == NULL) ? "" : bgrace , (uintmax_t)qup->dqblk.dqb_curinodes , (msgi == NULL) ? ' ' : '*' , (uintmax_t)qup->dqblk.dqb_isoftlimit , (uintmax_t)qup->dqblk.dqb_ihardlimit , (msgi == NULL) ? "" : igrace ); if (msgb != NULL) free(bgrace); if (msgi != NULL) free(igrace); } if (!qflag && !rflag && lines == 0) heading(type, id, name, "none"); return (overquota); } static void showrawquotas(int type, u_long id, struct quotause *qup) { time_t t; printf("Raw %s quota information for id %lu on %s\n", type == USRQUOTA ? 
"user" : "group", id, qup->fsname); printf("block hard limit: %ju\n", (uintmax_t)qup->dqblk.dqb_bhardlimit); printf("block soft limit: %ju\n", (uintmax_t)qup->dqblk.dqb_bsoftlimit); printf("current block count: %ju\n", (uintmax_t)qup->dqblk.dqb_curblocks); printf("i-node hard limit: %ju\n", (uintmax_t)qup->dqblk.dqb_ihardlimit); printf("i-node soft limit: %ju\n", (uintmax_t)qup->dqblk.dqb_isoftlimit); printf("current i-node count: %ju\n", (uintmax_t)qup->dqblk.dqb_curinodes); printf("block grace time: %jd", (intmax_t)qup->dqblk.dqb_btime); if (qup->dqblk.dqb_btime != 0) { t = qup->dqblk.dqb_btime; printf(" %s", ctime(&t)); } else { printf("\n"); } printf("i-node grace time: %jd", (intmax_t)qup->dqblk.dqb_itime); if (qup->dqblk.dqb_itime != 0) { t = qup->dqblk.dqb_itime; printf(" %s", ctime(&t)); } else { printf("\n"); } } static void heading(int type, u_long id, const char *name, const char *tag) { printf("Disk quotas for %s %s (%cid %lu): %s\n", qfextension[type], name, *qfextension[type], id, tag); if (!qflag && tag[0] == '\0') { printf("%-15s %7s %8s %7s %7s %6s %7s %6s%8s\n" , "Filesystem" , "usage" , "quota" , "limit" , "grace" , "files" , "quota" , "limit" , "grace" ); } } /* * Calculate the grace period and return a printable string for it. */ static char * timeprt(int64_t seconds) { time_t hours, minutes; char *buf; static time_t now; if (now == 0) time(&now); if (now > seconds) { if ((buf = strdup("none")) == NULL) errx(1, "strdup() failed in timeprt()"); return (buf); } seconds -= now; minutes = (seconds + 30) / 60; hours = (minutes + 30) / 60; if (hours >= 36) { if (asprintf(&buf, "%lddays", ((long)hours + 12) / 24) < 0) errx(1, "asprintf() failed in timeprt(1)"); return (buf); } if (minutes >= 60) { if (asprintf(&buf, "%2ld:%ld", (long)minutes / 60, (long)minutes % 60) < 0) errx(1, "asprintf() failed in timeprt(2)"); return (buf); } if (asprintf(&buf, "%2ld", (long)minutes) < 0) errx(1, "asprintf() failed in timeprt(3)"); return (buf); } /* * Collect the requested quota information. */ static struct quotause * getprivs(long id, int quotatype) { struct quotause *qup, *quptail = NULL; struct fstab *fs; struct quotause *quphead; struct statfs *fst; int nfst, i; struct statfs sfb; qup = quphead = (struct quotause *)0; if (filename != NULL && statfs(filename, &sfb) != 0) err(1, "cannot statfs %s", filename); nfst = getmntinfo(&fst, MNT_NOWAIT); if (nfst == 0) errx(2, "no filesystems mounted!"); setfsent(); for (i = 0; i < nfst; i++) { if (qup == NULL) { if ((qup = (struct quotause *)malloc(sizeof *qup)) == NULL) errx(2, "out of memory"); } /* * See if the user requested a specific file system * or specified a file inside a mounted file system. */ if (filename != NULL && strcmp(sfb.f_mntonname, fst[i].f_mntonname) != 0) continue; if (strcmp(fst[i].f_fstypename, "nfs") == 0) { if (lflag) continue; if (getnfsquota(&fst[i], qup, id, quotatype) == 0) continue; } else if (strcmp(fst[i].f_fstypename, "ufs") == 0) { /* * XXX * UFS filesystems must be in /etc/fstab, and must * indicate that they have quotas on (?!) This is quite * unlike SunOS where quotas can be enabled/disabled * on a filesystem independent of /etc/fstab, and it * will still print quotas for them. 
*/ if ((fs = getfsspec(fst[i].f_mntfromname)) == NULL) continue; if (getufsquota(fs, qup, id, quotatype) == 0) continue; } else continue; strcpy(qup->fsname, fst[i].f_mntonname); if (quphead == NULL) quphead = qup; else quptail->next = qup; quptail = qup; quptail->next = 0; qup = NULL; } if (qup) free(qup); endfsent(); return (quphead); } /* * Check to see if a particular quota is available. */ static int getufsquota(struct fstab *fs, struct quotause *qup, long id, int quotatype) { struct quotafile *qf; if ((qf = quota_open(fs, quotatype, O_RDONLY)) == NULL) return (0); if (quota_read(qf, &qup->dqblk, id) != 0) return (0); quota_close(qf); return (1); } static int getnfsquota(struct statfs *fst, struct quotause *qup, long id, int quotatype) { struct ext_getquota_args gq_args; struct getquota_args old_gq_args; struct getquota_rslt gq_rslt; struct dqblk *dqp = &qup->dqblk; struct timeval tv; char *cp, host[NI_MAXHOST]; enum clnt_stat call_stat; if (fst->f_flags & MNT_LOCAL) return (0); /* * must be some form of "hostname:/path" */ cp = fst->f_mntfromname; do { cp = strrchr(cp, ':'); } while (cp != NULL && *(cp + 1) != '/'); if (cp == NULL) { warnx("cannot find hostname for %s", fst->f_mntfromname); return (0); } memset(host, 0, sizeof(host)); memcpy(host, fst->f_mntfromname, cp - fst->f_mntfromname); host[sizeof(host) - 1] = '\0'; /* Avoid attempting the RPC for special amd(8) filesystems. */ if (strncmp(fst->f_mntfromname, "pid", 3) == 0 && strchr(fst->f_mntfromname, '@') != NULL) return (0); gq_args.gqa_pathp = cp + 1; gq_args.gqa_id = id; gq_args.gqa_type = quotatype; call_stat = callaurpc(host, RQUOTAPROG, EXT_RQUOTAVERS, RQUOTAPROC_GETQUOTA, (xdrproc_t)xdr_ext_getquota_args, (char *)&gq_args, (xdrproc_t)xdr_getquota_rslt, (char *)&gq_rslt); - if (call_stat == RPC_PROGVERSMISMATCH) { + if (call_stat == RPC_PROGVERSMISMATCH || call_stat == RPC_PROGNOTREGISTERED) { if (quotatype == USRQUOTA) { old_gq_args.gqa_pathp = cp + 1; old_gq_args.gqa_uid = id; call_stat = callaurpc(host, RQUOTAPROG, RQUOTAVERS, RQUOTAPROC_GETQUOTA, (xdrproc_t)xdr_getquota_args, (char *)&old_gq_args, (xdrproc_t)xdr_getquota_rslt, (char *)&gq_rslt); } else { /* Old rpc quota does not support group type */ return (0); } } if (call_stat != 0) return (call_stat); switch (gq_rslt.status) { case Q_NOQUOTA: break; case Q_EPERM: warnx("quota permission error, host: %s", fst->f_mntfromname); break; case Q_OK: gettimeofday(&tv, NULL); /* blocks*/ dqp->dqb_bhardlimit = ((uint64_t)gq_rslt.getquota_rslt_u.gqr_rquota.rq_bhardlimit * gq_rslt.getquota_rslt_u.gqr_rquota.rq_bsize) / DEV_BSIZE; dqp->dqb_bsoftlimit = ((uint64_t)gq_rslt.getquota_rslt_u.gqr_rquota.rq_bsoftlimit * gq_rslt.getquota_rslt_u.gqr_rquota.rq_bsize) / DEV_BSIZE; dqp->dqb_curblocks = ((uint64_t)gq_rslt.getquota_rslt_u.gqr_rquota.rq_curblocks * gq_rslt.getquota_rslt_u.gqr_rquota.rq_bsize) / DEV_BSIZE; /* inodes */ dqp->dqb_ihardlimit = gq_rslt.getquota_rslt_u.gqr_rquota.rq_fhardlimit; dqp->dqb_isoftlimit = gq_rslt.getquota_rslt_u.gqr_rquota.rq_fsoftlimit; dqp->dqb_curinodes = gq_rslt.getquota_rslt_u.gqr_rquota.rq_curfiles; /* grace times */ dqp->dqb_btime = tv.tv_sec + gq_rslt.getquota_rslt_u.gqr_rquota.rq_btimeleft; dqp->dqb_itime = tv.tv_sec + gq_rslt.getquota_rslt_u.gqr_rquota.rq_ftimeleft; return (1); default: warnx("bad rpc result, host: %s", fst->f_mntfromname); break; } return (0); } static enum clnt_stat callaurpc(char *host, int prognum, int versnum, int procnum, xdrproc_t inproc, char *in, xdrproc_t outproc, char *out) { enum clnt_stat clnt_stat; struct 
timeval timeout, tottimeout; CLIENT *client = NULL; client = clnt_create(host, prognum, versnum, "udp"); if (client == NULL) return ((int)rpc_createerr.cf_stat); timeout.tv_usec = 0; timeout.tv_sec = 6; CLNT_CONTROL(client, CLSET_RETRY_TIMEOUT, (char *)(void *)&timeout); client->cl_auth = authunix_create_default(); tottimeout.tv_sec = 25; tottimeout.tv_usec = 0; clnt_stat = clnt_call(client, procnum, inproc, in, outproc, out, tottimeout); return (clnt_stat); } static int alldigits(char *s) { int c; c = *s++; do { if (!isdigit(c)) return (0); } while ((c = *s++)); return (1); } Index: projects/clang900-import/usr.bin/top/display.c =================================================================== --- projects/clang900-import/usr.bin/top/display.c (revision 352586) +++ projects/clang900-import/usr.bin/top/display.c (revision 352587) @@ -1,1379 +1,1354 @@ /* * Top users/processes display for Unix * Version 3 * * This program may be freely redistributed, * but this entire comment MUST remain intact. * * Copyright (c) 1984, 1989, William LeFebvre, Rice University * Copyright (c) 1989, 1990, 1992, William LeFebvre, Northwestern University * * $FreeBSD$ */ /* * This file contains the routines that display information on the screen. * Each section of the screen has two routines: one for initially writing * all constant and dynamic text, and one for only updating the text that * changes. The prefix "i_" is used on all the "initial" routines and the * prefix "u_" is used for all the "updating" routines. * * ASSUMPTIONS: * None of the "i_" routines use any of the termcap capabilities. * In this way, those routines can be safely used on terminals that * have minimal (or nonexistant) terminal capabilities. * * The routines are called in this order: *_loadave, i_timeofday, * *_procstates, *_cpustates, *_memory, *_message, *_header, * *_process, u_endscreen. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "screen.h" /* interface to screen package */ #include "layout.h" /* defines for screen position layout */ #include "display.h" #include "top.h" #include "machine.h" /* we should eliminate this!!! 
*/ #include "utils.h" #ifdef DEBUG FILE *debug; #endif static int lmpid = 0; static int last_hi = 0; /* used in u_process and u_endscreen */ static int lastline = 0; #define lineindex(l) ((l)*screen_width) /* things initialized by display_init and used thruout */ /* buffer of proc information lines for display updating */ static char *screenbuf = NULL; static const char * const *procstate_names; static const char * const *cpustate_names; static const char * const *memory_names; static const char * const *arc_names; static const char * const *carc_names; static const char * const *swap_names; static int num_procstates; static int num_cpustates; static int num_memory; static int num_swap; static int *lprocstates; static int *lcpustates; static int *lmemory; static int *lswap; static int num_cpus; static int *cpustate_columns; static int cpustate_total_length; static int cpustates_column; static enum { OFF, ON, ERASE } header_status = ON; static void summary_format(char *, int *, const char * const *); static void line_update(char *, char *, int, int); static int setup_buffer_bufsiz = 0; static char * setup_buffer(char *, int); int x_lastpid = 10; int y_lastpid = 0; int x_loadave = 33; int x_loadave_nompid = 15; int y_loadave = 0; int x_procstate = 0; int y_procstate = 1; int x_brkdn = 15; int y_brkdn = 1; int x_mem = 5; int y_mem = 3; int x_arc = 5; int y_arc = 4; int x_carc = 5; int y_carc = 5; int x_swap = 6; int y_swap = 4; int y_message = 5; int x_header = 0; int y_header = 6; int x_idlecursor = 0; int y_idlecursor = 5; int y_procs = 7; int y_cpustates = 2; int Header_lines = 7; int display_resize(void) { int lines; /* first, deallocate any previous buffer that may have been there */ if (screenbuf != NULL) { free(screenbuf); } /* calculate the current dimensions */ /* if operating in "dumb" mode, we only need one line */ lines = smart_terminal ? screen_length - Header_lines : 1; if (lines < 0) lines = 0; /* now, allocate space for the screen buffer */ screenbuf = calloc(lines, screen_width); if (screenbuf == NULL) { /* oops! */ return(-1); } /* return number of lines available */ /* for dumb terminals, pretend like we can show any amount */ return(smart_terminal ? 
lines : Largest); } int display_updatecpus(struct statics *statics) { int lines; int i; /* call resize to do the dirty work */ lines = display_resize(); if (pcpu_stats) num_cpus = statics->ncpus; else num_cpus = 1; cpustates_column = 5; /* CPU: */ if (num_cpus > 1) { cpustates_column += 1 + digits(num_cpus); /* CPU #: */ } /* fill the "last" array with all -1s, to insure correct updating */ for (i = 0; i < num_cpustates * num_cpus; ++i) { lcpustates[i] = -1; } return(lines); } int display_init(struct statics * statics) { int lines; const char * const *pp; int *ip; int i; lines = display_updatecpus(statics); /* only do the rest if we need to */ if (lines > -1) { /* save pointers and allocate space for names */ procstate_names = statics->procstate_names; num_procstates = 8; assert(num_procstates > 0); lprocstates = calloc(num_procstates, sizeof(int)); cpustate_names = statics->cpustate_names; swap_names = statics->swap_names; num_swap = 7; assert(num_swap > 0); lswap = calloc(num_swap, sizeof(int)); num_cpustates = CPUSTATES; assert(num_cpustates > 0); lcpustates = calloc(num_cpustates * sizeof(int), statics->ncpus); cpustate_columns = calloc(num_cpustates, sizeof(int)); memory_names = statics->memory_names; num_memory = 7; assert(num_memory > 0); lmemory = calloc(num_memory, sizeof(int)); arc_names = statics->arc_names; carc_names = statics->carc_names; /* calculate starting columns where needed */ cpustate_total_length = 0; pp = cpustate_names; ip = cpustate_columns; while (*pp != NULL) { *ip++ = cpustate_total_length; if ((i = strlen(*pp++)) > 0) { cpustate_total_length += i + 8; } } } /* return number of lines available */ return(lines); } void i_loadave(int mpid, double avenrun[]) { int i; /* i_loadave also clears the screen, since it is first */ top_clear(); /* mpid == -1 implies this system doesn't have an _mpid */ if (mpid != -1) { printf("last pid: %5d; ", mpid); } printf("load averages"); for (i = 0; i < 3; i++) { printf("%c %5.2f", i == 0 ? ':' : ',', avenrun[i]); } lmpid = mpid; } void u_loadave(int mpid, double *avenrun) { int i; if (mpid != -1) { /* change screen only when value has really changed */ if (mpid != lmpid) { Move_to(x_lastpid, y_lastpid); printf("%5d", mpid); lmpid = mpid; } /* i remembers x coordinate to move to */ i = x_loadave; } else { i = x_loadave_nompid; } /* move into position for load averages */ Move_to(i, y_loadave); /* display new load averages */ /* we should optimize this and only display changes */ for (i = 0; i < 3; i++) { printf("%s%5.2f", i == 0 ? "" : ", ", avenrun[i]); } } void i_timeofday(time_t *tod) { /* * Display the current time. * "ctime" always returns a string that looks like this: * * Sun Sep 16 01:03:52 1973 * 012345678901234567890123 * 1 2 * * We want indices 11 thru 18 (length 8). */ if (smart_terminal) { Move_to(screen_width - 8, 0); } else { fputs(" ", stdout); } #ifdef DEBUG { char *foo; foo = ctime(tod); fputs(foo, stdout); } #endif printf("%-8.8s\n", &(ctime(tod)[11])); lastline = 1; } static int ltotal = 0; static char *procstates_buffer = NULL; /* * *_procstates(total, brkdn, names) - print the process summary line * * Assumptions: cursor is at the beginning of the line on entry * lastline is valid */ void i_procstates(int total, int *brkdn) { int i; procstates_buffer = setup_buffer(procstates_buffer, 0); /* write current number of processes and remember the value */ printf("%d %s:", total, ps.thread ? 
"threads" : "processes"); ltotal = total; /* put out enough spaces to get to column 15 */ i = digits(total); while (i++ < (ps.thread ? 6 : 4)) { putchar(' '); } /* format and print the process state summary */ summary_format(procstates_buffer, brkdn, procstate_names); fputs(procstates_buffer, stdout); /* save the numbers for next time */ memcpy(lprocstates, brkdn, num_procstates * sizeof(int)); } void u_procstates(int total, int *brkdn) { static char *new = NULL; int i; new = setup_buffer(new, 0); /* update number of processes only if it has changed */ if (ltotal != total) { /* move and overwrite */ if (x_procstate == 0) { Move_to(x_procstate, y_procstate); } else { /* cursor is already there...no motion needed */ assert(lastline == 1); } printf("%d", total); /* if number of digits differs, rewrite the label */ if (digits(total) != digits(ltotal)) { printf(" %s:", ps.thread ? "threads" : "processes"); /* put out enough spaces to get to column 15 */ i = digits(total); while (i++ < (ps.thread ? 6 : 4)) { putchar(' '); } /* cursor may end up right where we want it!!! */ } /* save new total */ ltotal = total; } /* see if any of the state numbers has changed */ if (memcmp(lprocstates, brkdn, num_procstates * sizeof(int)) != 0) { /* format and update the line */ summary_format(new, brkdn, procstate_names); line_update(procstates_buffer, new, x_brkdn, y_brkdn); memcpy(lprocstates, brkdn, num_procstates * sizeof(int)); } } void i_cpustates(int *states) { int i = 0; int value; const char * const *names; const char *thisname; int *hstates = states; int cpu; for (cpu = 0; cpu < num_cpus; cpu++) { names = cpustate_names; /* print tag and bump lastline */ if (num_cpus == 1) printf("\nCPU: "); else { value = printf("\nCPU %d: ", cpu); while (value++ <= cpustates_column) printf(" "); } lastline++; /* now walk thru the names and print the line */ while ((thisname = *names++) != NULL) { if (*thisname != '\0') { /* retrieve the value and remember it */ value = *states++; /* if percentage is >= 1000, print it as 100% */ printf((value >= 1000 ? "%s%4.0f%% %s" : "%s%4.1f%% %s"), (i++ % num_cpustates) == 0 ? "" : ", ", ((float)value)/10., thisname); } } } /* copy over values into "last" array */ states = hstates; memcpy(lcpustates, states, num_cpustates * sizeof(int) * num_cpus); } void u_cpustates(int *states) { int value; const char * const *names; const char *thisname; int *hstates = states; int *lp; int *colp; int cpu; for (cpu = 0; cpu < num_cpus; cpu++) { names = cpustate_names; Move_to(cpustates_column, y_cpustates + cpu); lastline = y_cpustates + cpu; lp = lcpustates + (cpu * num_cpustates); colp = cpustate_columns; /* we could be much more optimal about this */ while ((thisname = *names++) != NULL) { if (*thisname != '\0') { /* did the value change since last time? */ if (*lp != *states) { /* yes, move and change */ Move_to(cpustates_column + *colp, y_cpustates + cpu); lastline = y_cpustates + cpu; /* retrieve value and remember it */ value = *states; /* if percentage is >= 1000, print it as 100% */ printf((value >= 1000 ? 
"%4.0f" : "%4.1f"), ((double)value)/10.); /* remember it for next time */ *lp = value; } } /* increment and move on */ lp++; states++; colp++; } } states = hstates; } void z_cpustates(void) { int i = 0; const char * const *names; const char *thisname; int cpu, value; for (cpu = 0; cpu < num_cpus; cpu++) { names = cpustate_names; /* show tag and bump lastline */ if (num_cpus == 1) printf("\nCPU: "); else { value = printf("\nCPU %d: ", cpu); while (value++ <= cpustates_column) printf(" "); } lastline++; while ((thisname = *names++) != NULL) { if (*thisname != '\0') { printf("%s %% %s", (i++ % num_cpustates) == 0 ? "" : ", ", thisname); } } } /* fill the "last" array with all -1s, to insure correct updating */ for (i = 0; i < num_cpustates * num_cpus; ++i) { lcpustates[i] = -1; } } /* * *_memory(stats) - print "Memory: " followed by the memory summary string * * Assumptions: cursor is on "lastline" * for i_memory ONLY: cursor is on the previous line */ static char *memory_buffer = NULL; void i_memory(int *stats) { memory_buffer = setup_buffer(memory_buffer, 0); fputs("\nMem: ", stdout); lastline++; /* format and print the memory summary */ summary_format(memory_buffer, stats, memory_names); fputs(memory_buffer, stdout); } void u_memory(int *stats) { static char *new = NULL; new = setup_buffer(new, 0); /* format the new line */ summary_format(new, stats, memory_names); line_update(memory_buffer, new, x_mem, y_mem); } /* * *_arc(stats) - print "ARC: " followed by the ARC summary string * * Assumptions: cursor is on "lastline" * for i_arc ONLY: cursor is on the previous line */ static char *arc_buffer = NULL; void i_arc(int *stats) { arc_buffer = setup_buffer(arc_buffer, 0); if (arc_names == NULL) return; fputs("\nARC: ", stdout); lastline++; /* format and print the memory summary */ summary_format(arc_buffer, stats, arc_names); fputs(arc_buffer, stdout); } void u_arc(int *stats) { static char *new = NULL; new = setup_buffer(new, 0); if (arc_names == NULL) return; /* format the new line */ summary_format(new, stats, arc_names); line_update(arc_buffer, new, x_arc, y_arc); } /* * *_carc(stats) - print "Compressed ARC: " followed by the summary string * * Assumptions: cursor is on "lastline" * for i_carc ONLY: cursor is on the previous line */ static char *carc_buffer = NULL; void i_carc(int *stats) { carc_buffer = setup_buffer(carc_buffer, 0); if (carc_names == NULL) return; fputs("\n ", stdout); lastline++; /* format and print the memory summary */ summary_format(carc_buffer, stats, carc_names); fputs(carc_buffer, stdout); } void u_carc(int *stats) { static char *new = NULL; new = setup_buffer(new, 0); if (carc_names == NULL) return; /* format the new line */ summary_format(new, stats, carc_names); line_update(carc_buffer, new, x_carc, y_carc); } /* * *_swap(stats) - print "Swap: " followed by the swap summary string * * Assumptions: cursor is on "lastline" * for i_swap ONLY: cursor is on the previous line */ static char *swap_buffer = NULL; void i_swap(int *stats) { swap_buffer = setup_buffer(swap_buffer, 0); if (swap_names == NULL) return; fputs("\nSwap: ", stdout); lastline++; /* format and print the swap summary */ summary_format(swap_buffer, stats, swap_names); fputs(swap_buffer, stdout); } void u_swap(int *stats) { static char *new = NULL; new = setup_buffer(new, 0); if (swap_names == NULL) return; /* format the new line */ summary_format(new, stats, swap_names); line_update(swap_buffer, new, x_swap, y_swap); } /* * *_message() - print the next pending message line, or erase the one * that 
is there. * * Note that u_message is (currently) the same as i_message. * * Assumptions: lastline is consistent */ /* * i_message is funny because it gets its message asynchronously (with * respect to screen updates). */ #define NEXT_MSG_ADDLEN 5 static char *next_msg = NULL; static int msglen = 0; /* Invariant: msglen is always the length of the message currently displayed on the screen (even when next_msg doesn't contain that message). */ void i_message(void) { next_msg = setup_buffer(next_msg, NEXT_MSG_ADDLEN); while (lastline < y_message) { fputc('\n', stdout); lastline++; } if (next_msg[0] != '\0') { top_standout(next_msg); msglen = strlen(next_msg); next_msg[0] = '\0'; } else if (msglen > 0) { (void) clear_eol(msglen); msglen = 0; } } void u_message(void) { i_message(); } static int header_length; /* * Trim a header string to the current display width and return a newly * allocated area with the trimmed header. */ char * trim_header(const char *text) { char *s; int width; s = NULL; width = screen_width; header_length = strlen(text); if (header_length >= width) { s = strndup(text, width); if (s == NULL) return (NULL); } return (s); } /* * *_header(text) - print the header for the process area * * Assumptions: cursor is on the previous line and lastline is consistent */ void i_header(const char *text) { char *s; s = trim_header(text); if (s != NULL) text = s; if (header_status == ON) { putchar('\n'); fputs(text, stdout); lastline++; } else if (header_status == ERASE) { header_status = OFF; } free(s); } void u_header(const char *text __unused) { if (header_status == ERASE) { putchar('\n'); lastline++; clear_eol(header_length); header_status = OFF; } } /* * *_process(line, thisline) - print one process line * * Assumptions: lastline is consistent */ void i_process(int line, char *thisline) { char *p; char *base; /* make sure we are on the correct line */ while (lastline < y_procs + line) { putchar('\n'); lastline++; } /* truncate the line to conform to our current screen width */ int len = strlen(thisline); if (screen_width < len) { thisline[screen_width] = '\0'; } /* write the line out */ fputs(thisline, stdout); /* copy it in to our buffer */ base = smart_terminal ? screenbuf + lineindex(line) : screenbuf; p = stpcpy(base, thisline); /* zero fill the rest of it */ if (p - base < screen_width) { memset(p, 0, screen_width - (p - base)); } } void u_process(int line, char *newline) { char *optr; int screen_line = line + Header_lines; char *bufferline; /* remember a pointer to the current line in the screen buffer */ bufferline = &screenbuf[lineindex(line)]; /* truncate the line to conform to our current screen width */ int len = strlen(newline); if (screen_width < len) { newline[screen_width] = '\0'; } /* is line higher than we went on the last display? 
*/ if (line >= last_hi) { /* yes, just ignore screenbuf and write it out directly */ /* get positioned on the correct line */ if (screen_line - lastline == 1) { putchar('\n'); lastline++; } else { Move_to(0, screen_line); lastline = screen_line; } /* now write the line */ fputs(newline, stdout); /* copy it in to the buffer */ optr = stpcpy(bufferline, newline); /* zero fill the rest of it */ if (optr - bufferline < screen_width) { memset(optr, 0, screen_width - (optr - bufferline)); } } else { line_update(bufferline, newline, 0, line + Header_lines); } } void u_endscreen(int hi) { int screen_line = hi + Header_lines; int i; if (smart_terminal) { if (hi < last_hi) { /* need to blank the remainder of the screen */ /* but only if there is any screen left below this line */ if (lastline + 1 < screen_length) { /* efficiently move to the end of currently displayed info */ if (screen_line - lastline < 5) { while (lastline < screen_line) { putchar('\n'); lastline++; } } else { Move_to(0, screen_line); lastline = screen_line; } if (clear_to_end) { /* we can do this the easy way */ putcap(clear_to_end); } else { /* use clear_eol on each line */ i = hi; while ((void) clear_eol(strlen(&screenbuf[lineindex(i++)])), i < last_hi) { putchar('\n'); } } } } last_hi = hi; /* move the cursor to a pleasant place */ Move_to(x_idlecursor, y_idlecursor); lastline = y_idlecursor; } else { /* separate this display from the next with some vertical room */ fputs("\n\n", stdout); } } void display_header(int t) { if (t) { header_status = ON; } else if (header_status == ON) { header_status = ERASE; } } void new_message(int type, const char *msgfmt, ...) { va_list args; size_t i; va_start(args, msgfmt); /* first, format the message */ vsnprintf(next_msg, setup_buffer_bufsiz + NEXT_MSG_ADDLEN, msgfmt, args); va_end(args); if (msglen > 0) { /* message there already -- can we clear it? */ if (!overstrike) { /* yes -- write it and clear to end */ i = strlen(next_msg); if ((type & MT_delayed) == 0) { if (type & MT_standout) { top_standout(next_msg); } else { fputs(next_msg, stdout); } clear_eol(msglen - i); msglen = i; next_msg[0] = '\0'; } } } else { if ((type & MT_delayed) == 0) { if (type & MT_standout) { top_standout(next_msg); } else { fputs(next_msg, stdout); } msglen = strlen(next_msg); next_msg[0] = '\0'; } } } void clear_message(void) { if (clear_eol(msglen) == 1) { putchar('\r'); } } int readline(char *buffer, int size, int numeric) { char *ptr = buffer; char ch; char cnt = 0; char maxcnt = 0; /* allow room for null terminator */ size -= 1; /* read loop */ while ((fflush(stdout), read(0, ptr, 1) > 0)) { /* newline means we are done */ if ((ch = *ptr) == '\n' || ch == '\r') { break; } /* handle special editing characters */ if (ch == ch_kill) { /* kill line -- account for overstriking */ if (overstrike) { msglen += maxcnt; } /* return null string */ *buffer = '\0'; putchar('\r'); return(-1); } else if (ch == ch_erase) { /* erase previous character */ if (cnt <= 0) { /* none to erase! 
*/ putchar('\7'); } else { fputs("\b \b", stdout); ptr--; cnt--; } } /* check for character validity and buffer overflow */ else if (cnt == size || (numeric && !isdigit(ch)) || !isprint(ch)) { /* not legal */ putchar('\7'); } else { /* echo it and store it in the buffer */ putchar(ch); ptr++; cnt++; if (cnt > maxcnt) { maxcnt = cnt; } } } /* all done -- null terminate the string */ *ptr = '\0'; /* account for the extra characters in the message area */ /* (if terminal overstrikes, remember the furthest they went) */ msglen += overstrike ? maxcnt : cnt; /* return either inputted number or string length */ putchar('\r'); return(cnt == 0 ? -1 : numeric ? atoi(buffer) : cnt); } /* internal support routines */ static void summary_format(char *str, int *numbers, const char * const *names) { char *p; int num; const char *thisname; char rbuf[6]; /* format each number followed by its string */ p = str; while ((thisname = *names++) != NULL) { /* get the number to format */ num = *numbers++; /* display only non-zero numbers */ if (num > 0) { /* is this number in kilobytes? */ if (thisname[0] == 'K') { /* yes: format it as a memory value */ p = stpcpy(p, format_k(num)); /* skip over the K, since it was included by format_k */ p = stpcpy(p, thisname+1); } /* is this number a ratio? */ else if (thisname[0] == ':') { (void) snprintf(rbuf, sizeof(rbuf), "%.2f", (float)*(numbers - 2) / (float)num); p = stpcpy(p, rbuf); p = stpcpy(p, thisname); } else { p = stpcpy(p, itoa(num)); p = stpcpy(p, thisname); } } /* ignore negative numbers, but display corresponding string */ else if (num < 0) { p = stpcpy(p, thisname); } } /* if the last two characters in the string are ", ", delete them */ p -= 2; if (p >= str && p[0] == ',' && p[1] == ' ') { *p = '\0'; } } static void line_update(char *old, char *new, int start, int line) { int ch; int diff; int newcol = start + 1; int lastcol = start; char cursor_on_line = false; char *current; /* compare the two strings and only rewrite what has changed */ current = old; #ifdef DEBUG fprintf(debug, "line_update, starting at %d\n", start); fputs(old, debug); fputc('\n', debug); fputs(new, debug); fputs("\n-\n", debug); #endif /* start things off on the right foot */ /* this is to make sure the invariants get set up right */ if ((ch = *new++) != *old) { if (line - lastline == 1 && start == 0) { putchar('\n'); } else { Move_to(start, line); } cursor_on_line = true; putchar(ch); *old = ch; lastcol = start + 1; } old++; /* * main loop -- check each character. If the old and new aren't the * same, then update the display. When the distance from the * current cursor position to the new change is small enough, * the characters that belong there are written to move the * cursor over. * * Invariants: * lastcol is the column where the cursor currently is sitting * (always one beyond the end of the last mismatch). 
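line_update() below rewrites only the characters that changed since the last refresh: when the next difference is close to the cursor it re-emits the unchanged characters in between, otherwise it addresses the cursor directly. A simplified, self-contained sketch of that decision, with an ANSI cursor-position escape standing in for the termcap-driven Move_to():

#include <stdio.h>

/*
 * Differential line update, simplified.  Both strings are assumed to be
 * the same width (top zero-fills its screen buffer to screen_width); the
 * full screen-buffer bookkeeping is omitted.
 */
static void
update_line(const char *old, const char *new, int row)
{
    int col, cursor = -1;   /* column just past the last character written */

    for (col = 0; new[col] != '\0'; col++) {
        if (old[col] == new[col])
            continue;
        if (cursor >= 0 && col - cursor < 6) {
            /* close enough: overwrite the gap from the old line */
            printf("%.*s", col - cursor, old + cursor);
        } else {
            /* too far (or first change): address the cursor */
            printf("\033[%d;%dH", row + 1, col + 1);
        }
        putchar(new[col]);
        cursor = col + 1;
    }
}

int
main(void)
{
    update_line("load averages: 0.52, 0.41, 0.30",
                "load averages: 0.61, 0.43, 0.30", 0);
    putchar('\n');
    return (0);
}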
*/ do /* yes, a do...while */ { if ((ch = *new++) != *old) { /* new character is different from old */ /* make sure the cursor is on top of this character */ diff = newcol - lastcol; if (diff > 0) { /* some motion is required--figure out which is shorter */ if (diff < 6 && cursor_on_line) { /* overwrite old stuff--get it out of the old buffer */ printf("%.*s", diff, ¤t[lastcol-start]); } else { /* use cursor addressing */ Move_to(newcol, line); cursor_on_line = true; } /* remember where the cursor is */ lastcol = newcol + 1; } else { /* already there, update position */ lastcol++; } /* write what we need to */ if (ch == '\0') { /* at the end--terminate with a clear-to-end-of-line */ (void) clear_eol(strlen(old)); } else { /* write the new character */ putchar(ch); } /* put the new character in the screen buffer */ *old = ch; } /* update working column and screen buffer pointer */ newcol++; old++; } while (ch != '\0'); /* zero out the rest of the line buffer -- MUST BE DONE! */ diff = screen_width - newcol; if (diff > 0) { memset(old, 0, diff); } /* remember where the current line is */ if (cursor_on_line) { lastline = line; } } -/* - * printable(str) - make the string pointed to by "str" into one that is - * printable (i.e.: all ascii), by converting all non-printable - * characters into '?'. Replacements are done in place and a pointer - * to the original buffer is returned. - */ - -char * -printable(char str[]) -{ - char *ptr; - char ch; - - ptr = str; - while ((ch = *ptr) != '\0') - { - if (!isprint(ch)) - { - *ptr = '?'; - } - ptr++; - } - return(str); -} - void i_uptime(struct timeval *bt, time_t *tod) { time_t uptime; int days, hrs, mins, secs; if (bt->tv_sec != -1) { uptime = *tod - bt->tv_sec; days = uptime / 86400; uptime %= 86400; hrs = uptime / 3600; uptime %= 3600; mins = uptime / 60; secs = uptime % 60; /* * Display the uptime. */ if (smart_terminal) { Move_to((screen_width - 24) - (days > 9 ? 1 : 0), 0); } else { fputs(" ", stdout); } printf(" up %d+%02d:%02d:%02d", days, hrs, mins, secs); } } #define SETUPBUFFER_MIN_SCREENWIDTH 80 #define SETUPBUFFER_REQUIRED_ADDBUFSIZ 2 static char * setup_buffer(char *buffer, int addlen) { size_t len, old_len; char *new_buffer; setup_buffer_bufsiz = screen_width; if (setup_buffer_bufsiz < SETUPBUFFER_MIN_SCREENWIDTH) { setup_buffer_bufsiz = SETUPBUFFER_MIN_SCREENWIDTH; } len = setup_buffer_bufsiz + addlen + SETUPBUFFER_REQUIRED_ADDBUFSIZ; new_buffer = calloc(len, sizeof(char)); if (new_buffer == NULL) { errx(4, "can't allocate sufficient memory"); } if (buffer != NULL) { old_len = strlen(buffer); memcpy(new_buffer, buffer, old_len < len - 1 ? 
old_len : len - 1); free(buffer); } return new_buffer; } Index: projects/clang900-import/usr.bin/top/display.h =================================================================== --- projects/clang900-import/usr.bin/top/display.h (revision 352586) +++ projects/clang900-import/usr.bin/top/display.h (revision 352587) @@ -1,42 +1,41 @@ /* $FreeBSD$ */ /* constants needed for display.c */ #define MT_standout 1 #define MT_delayed 2 #include struct statics; int display_updatecpus(struct statics *statics); void clear_message(void); int display_resize(void); void i_header(const char *text); -char *printable(char *string); void display_header(int t); int display_init(struct statics *statics); void i_arc(int *stats); void i_carc(int *stats); void i_cpustates(int *states); void i_loadave(int mpid, double *avenrun); void i_memory(int *stats); void i_message(void); void i_process(int line, char *thisline); void i_procstates(int total, int *brkdn); void i_swap(int *stats); void i_timeofday(time_t *tod); void i_uptime(struct timeval *bt, time_t *tod); void new_message(int type, const char *msgfmt, ...); int readline(char *buffer, int size, int numeric); char *trim_header(const char *text); void u_arc(int *stats); void u_carc(int *stats); void u_cpustates(int *states); void u_endscreen(int hi); void u_header(const char *text); void u_loadave(int mpid, double *avenrun); void u_memory(int *stats); void u_message(void); void u_process(int line, char *newline); void u_procstates(int total, int *brkdn); void u_swap(int *stats); void z_cpustates(void); Index: projects/clang900-import/usr.bin/top/machine.c =================================================================== --- projects/clang900-import/usr.bin/top/machine.c (revision 352586) +++ projects/clang900-import/usr.bin/top/machine.c (revision 352587) @@ -1,1561 +1,1561 @@ /* * top - a top users display for Unix * * DESCRIPTION: * Originally written for BSD4.4 system by Christos Zoulas. * Ported to FreeBSD 2.x by Steven Wallace && Wolfram Schneider * Order support hacked in from top-3.5beta6/machine/m_aix41.c * by Monte Mitzelfelt (for latest top see http://www.groupsys.com/topinfo/) * * AUTHOR: Christos Zoulas * Steven Wallace * Wolfram Schneider * Thomas Moestl * Eitan Adler * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "top.h" #include "display.h" #include "machine.h" #include "loadavg.h" #include "screen.h" #include "utils.h" #include "layout.h" #define GETSYSCTL(name, var) getsysctl(name, &(var), sizeof(var)) extern struct timeval timeout; static int smpmode; enum displaymodes displaymode; static const int namelength = 10; /* TOP_JID_LEN based on max of 999999 */ #define TOP_JID_LEN 6 #define TOP_SWAP_LEN 5 /* get_process_info passes back a handle. This is what it looks like: */ struct handle { struct kinfo_proc **next_proc; /* points to next valid proc pointer */ int remaining; /* number of pointers remaining */ }; /* define what weighted cpu is. */ #define weighted_cpu(pct, pp) ((pp)->ki_swtime == 0 ? 
0.0 : \ ((pct) / (1.0 - exp((pp)->ki_swtime * logcpu)))) /* what we consider to be process size: */ #define PROCSIZE(pp) ((pp)->ki_size / 1024) #define RU(pp) (&(pp)->ki_rusage) #define PCTCPU(pp) (pcpu[pp - pbase]) /* process state names for the "STATE" column of the display */ /* the extra nulls in the string "run" are for adding a slash and the processor number when needed */ static const char *state_abbrev[] = { "", "START", "RUN\0\0\0", "SLEEP", "STOP", "ZOMB", "WAIT", "LOCK" }; static kvm_t *kd; /* values that we stash away in _init and use in later routines */ static double logcpu; /* these are retrieved from the kernel in _init */ static load_avg ccpu; /* these are used in the get_ functions */ static int lastpid; /* these are for calculating cpu state percentages */ static long cp_time[CPUSTATES]; static long cp_old[CPUSTATES]; static long cp_diff[CPUSTATES]; /* these are for detailing the process states */ static const char *procstatenames[] = { "", " starting, ", " running, ", " sleeping, ", " stopped, ", " zombie, ", " waiting, ", " lock, ", NULL }; static int process_states[nitems(procstatenames)]; /* these are for detailing the cpu states */ static int cpu_states[CPUSTATES]; static const char *cpustatenames[] = { "user", "nice", "system", "interrupt", "idle", NULL }; /* these are for detailing the memory statistics */ static const char *memorynames[] = { "K Active, ", "K Inact, ", "K Laundry, ", "K Wired, ", "K Buf, ", "K Free", NULL }; static int memory_stats[nitems(memorynames)]; static const char *arcnames[] = { "K Total, ", "K MFU, ", "K MRU, ", "K Anon, ", "K Header, ", "K Other", NULL }; static int arc_stats[nitems(arcnames)]; static const char *carcnames[] = { "K Compressed, ", "K Uncompressed, ", ":1 Ratio, ", NULL }; static int carc_stats[nitems(carcnames)]; static const char *swapnames[] = { "K Total, ", "K Used, ", "K Free, ", "% Inuse, ", "K In, ", "K Out", NULL }; static int swap_stats[nitems(swapnames)]; static int has_swap; /* these are for keeping track of the proc array */ static int nproc; static int onproc = -1; static int pref_len; static struct kinfo_proc *pbase; static struct kinfo_proc **pref; static struct kinfo_proc *previous_procs; static struct kinfo_proc **previous_pref; static int previous_proc_count = 0; static int previous_proc_count_max = 0; static int previous_thread; /* data used for recalculating pctcpu */ static double *pcpu; static struct timespec proc_uptime; static struct timeval proc_wall_time; static struct timeval previous_wall_time; static uint64_t previous_interval = 0; /* total number of io operations */ static long total_inblock; static long total_oublock; static long total_majflt; /* these are for getting the memory statistics */ static int arc_enabled; static int carc_enabled; static int pageshift; /* log base 2 of the pagesize */ /* define pagetok in terms of pageshift */ #define pagetok(size) ((size) << pageshift) /* swap usage */ #define ki_swap(kip) \ ((kip)->ki_swrss > (kip)->ki_rssize ? (kip)->ki_swrss - (kip)->ki_rssize : 0) /* * Sorting orders. The first element is the default. 
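The weighted_cpu() macro above divides the raw percentage by (1 - exp(ki_swtime * log(ccpu))), i.e. by 1 - ccpu^swtime, so recently started processes are extrapolated upward while long-resident ones converge to their raw value. A numeric illustration; 0.95 is only an assumed, typical magnitude for kern.ccpu after fixed-point conversion, not a value read from a kernel:

#include <math.h>
#include <stdio.h>

int
main(void)
{
    double ccpu = 0.95;     /* assumed magnitude of kern.ccpu, converted */
    double logcpu = log(ccpu);
    double pct = 0.10;      /* 10% raw CPU */
    int swtimes[] = { 5, 20, 60 }, i;

    for (i = 0; i < 3; i++)
        printf("swtime=%2ds  wcpu=%5.1f%%\n", swtimes[i],
            100.0 * pct / (1.0 - exp(swtimes[i] * logcpu)));
    return (0);
}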
*/ static const char *ordernames[] = { "cpu", "size", "res", "time", "pri", "threads", "total", "read", "write", "fault", "vcsw", "ivcsw", "jid", "swap", "pid", NULL }; /* Per-cpu time states */ static int maxcpu; static int maxid; static int ncpus; static unsigned long cpumask; static long *times; static long *pcpu_cp_time; static long *pcpu_cp_old; static long *pcpu_cp_diff; static int *pcpu_cpu_states; static int compare_swap(const void *a, const void *b); static int compare_jid(const void *a, const void *b); static int compare_pid(const void *a, const void *b); static int compare_tid(const void *a, const void *b); static const char *format_nice(const struct kinfo_proc *pp); static void getsysctl(const char *name, void *ptr, size_t len); static int swapmode(int *retavail, int *retfree); static void update_layout(void); static int find_uid(uid_t needle, int *haystack); static int find_uid(uid_t needle, int *haystack) { size_t i = 0; for (; i < TOP_MAX_UIDS; ++i) if ((uid_t)haystack[i] == needle) return 1; return (0); } void toggle_pcpustats(void) { if (ncpus == 1) return; update_layout(); } /* Adjust display based on ncpus and the ARC state. */ static void update_layout(void) { y_mem = 3; y_arc = 4; y_carc = 5; y_swap = 3 + arc_enabled + carc_enabled + has_swap; y_idlecursor = 4 + arc_enabled + carc_enabled + has_swap; y_message = 4 + arc_enabled + carc_enabled + has_swap; y_header = 5 + arc_enabled + carc_enabled + has_swap; y_procs = 6 + arc_enabled + carc_enabled + has_swap; Header_lines = 6 + arc_enabled + carc_enabled + has_swap; if (pcpu_stats) { y_mem += ncpus - 1; y_arc += ncpus - 1; y_carc += ncpus - 1; y_swap += ncpus - 1; y_idlecursor += ncpus - 1; y_message += ncpus - 1; y_header += ncpus - 1; y_procs += ncpus - 1; Header_lines += ncpus - 1; } } int machine_init(struct statics *statics) { int i, j, empty, pagesize; uint64_t arc_size; int carc_en, nswapdev; size_t size; size = sizeof(smpmode); if ((sysctlbyname("machdep.smp_active", &smpmode, &size, NULL, 0) != 0 && sysctlbyname("kern.smp.active", &smpmode, &size, NULL, 0) != 0) || size != sizeof(smpmode)) smpmode = 0; size = sizeof(arc_size); if (sysctlbyname("kstat.zfs.misc.arcstats.size", &arc_size, &size, NULL, 0) == 0 && arc_size != 0) arc_enabled = 1; size = sizeof(carc_en); if (arc_enabled && sysctlbyname("vfs.zfs.compressed_arc_enabled", &carc_en, &size, NULL, 0) == 0 && carc_en == 1) carc_enabled = 1; kd = kvm_open(NULL, _PATH_DEVNULL, NULL, O_RDONLY, "kvm_open"); if (kd == NULL) return (-1); size = sizeof(nswapdev); if (sysctlbyname("vm.nswapdev", &nswapdev, &size, NULL, 0) == 0 && nswapdev != 0) has_swap = 1; GETSYSCTL("kern.ccpu", ccpu); /* this is used in calculating WCPU -- calculate it ahead of time */ logcpu = log(loaddouble(ccpu)); pbase = NULL; pref = NULL; pcpu = NULL; nproc = 0; onproc = -1; /* get the page size and calculate pageshift from it */ pagesize = getpagesize(); pageshift = 0; while (pagesize > 1) { pageshift++; pagesize >>= 1; } /* we only need the amount of log(2)1024 for our conversion */ pageshift -= LOG1024; /* fill in the statics information */ statics->procstate_names = procstatenames; statics->cpustate_names = cpustatenames; statics->memory_names = memorynames; if (arc_enabled) statics->arc_names = arcnames; else statics->arc_names = NULL; if (carc_enabled) statics->carc_names = carcnames; else statics->carc_names = NULL; if (has_swap) statics->swap_names = swapnames; else statics->swap_names = NULL; statics->order_names = ordernames; /* Allocate state for per-CPU stats. 
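machine_init() above derives pageshift from getpagesize() and subtracts LOG1024 so that pagetok() turns a page count directly into kilobytes. The same derivation as a standalone sketch:

#include <stdio.h>
#include <unistd.h>

/*
 * Sketch of the pageshift/pagetok arithmetic: log2(page size) minus
 * log2(1024) gives a shift that converts a page count straight to KiB.
 */
int
main(void)
{
    int pagesize = getpagesize();
    int pageshift = 0;

    while (pagesize > 1) {
        pageshift++;
        pagesize >>= 1;
    }
    pageshift -= 10;        /* LOG1024 */

    /* e.g. with 4 KiB pages: pageshift == 2, so 25000 pages -> 100000 KiB */
    printf("pageshift=%d, 25000 pages = %ld KiB\n", pageshift,
        25000L << pageshift);
    return (0);
}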
*/ cpumask = 0; ncpus = 0; GETSYSCTL("kern.smp.maxcpus", maxcpu); times = calloc(maxcpu * CPUSTATES, sizeof(long)); if (times == NULL) err(1, "calloc for kern.smp.maxcpus"); size = sizeof(long) * maxcpu * CPUSTATES; if (sysctlbyname("kern.cp_times", times, &size, NULL, 0) == -1) err(1, "sysctlbyname kern.cp_times"); pcpu_cp_time = calloc(1, size); maxid = (size / CPUSTATES / sizeof(long)) - 1; for (i = 0; i <= maxid; i++) { empty = 1; for (j = 0; empty && j < CPUSTATES; j++) { if (times[i * CPUSTATES + j] != 0) empty = 0; } if (!empty) { cpumask |= (1ul << i); ncpus++; } } assert(ncpus > 0); pcpu_cp_old = calloc(ncpus * CPUSTATES, sizeof(long)); pcpu_cp_diff = calloc(ncpus * CPUSTATES, sizeof(long)); pcpu_cpu_states = calloc(ncpus * CPUSTATES, sizeof(int)); statics->ncpus = ncpus; update_layout(); /* all done! */ return (0); } char * format_header(const char *uname_field) { static struct sbuf* header = NULL; /* clean up from last time. */ if (header != NULL) { sbuf_clear(header); } else { header = sbuf_new_auto(); } switch (displaymode) { case DISP_CPU: { sbuf_printf(header, " %s", ps.thread_id ? " THR" : "PID"); sbuf_printf(header, "%*s", ps.jail ? TOP_JID_LEN : 0, ps.jail ? " JID" : ""); sbuf_printf(header, " %-*.*s ", namelength, namelength, uname_field); if (!ps.thread) { sbuf_cat(header, "THR "); } sbuf_cat(header, "PRI NICE SIZE RES "); if (ps.swap) { sbuf_printf(header, "%*s ", TOP_SWAP_LEN - 1, "SWAP"); } sbuf_cat(header, "STATE "); if (smpmode) { sbuf_cat(header, "C "); } sbuf_cat(header, "TIME "); sbuf_printf(header, " %6s ", ps.wcpu ? "WCPU" : "CPU"); sbuf_cat(header, "COMMAND"); sbuf_finish(header); break; } case DISP_IO: { sbuf_printf(header, " %s%*s %-*.*s", ps.thread_id ? " THR" : "PID", ps.jail ? TOP_JID_LEN : 0, ps.jail ? " JID" : "", namelength, namelength, uname_field); sbuf_cat(header, " VCSW IVCSW READ WRITE FAULT TOTAL PERCENT COMMAND"); sbuf_finish(header); break; } case DISP_MAX: assert("displaymode must not be set to DISP_MAX"); } return sbuf_data(header); } static int swappgsin = -1; static int swappgsout = -1; void get_system_info(struct system_info *si) { struct loadavg sysload; int mib[2]; struct timeval boottime; uint64_t arc_stat, arc_stat2; int i, j; size_t size; /* get the CPU stats */ size = (maxid + 1) * CPUSTATES * sizeof(long); if (sysctlbyname("kern.cp_times", pcpu_cp_time, &size, NULL, 0) == -1) err(1, "sysctlbyname kern.cp_times"); GETSYSCTL("kern.cp_time", cp_time); GETSYSCTL("vm.loadavg", sysload); GETSYSCTL("kern.lastpid", lastpid); /* convert load averages to doubles */ for (i = 0; i < 3; i++) si->load_avg[i] = (double)sysload.ldavg[i] / sysload.fscale; /* convert cp_time counts to percentages */ for (i = j = 0; i <= maxid; i++) { if ((cpumask & (1ul << i)) == 0) continue; percentages(CPUSTATES, &pcpu_cpu_states[j * CPUSTATES], &pcpu_cp_time[j * CPUSTATES], &pcpu_cp_old[j * CPUSTATES], &pcpu_cp_diff[j * CPUSTATES]); j++; } percentages(CPUSTATES, cpu_states, cp_time, cp_old, cp_diff); /* sum memory & swap statistics */ { static unsigned int swap_delay = 0; static int swapavail = 0; static int swapfree = 0; static long bufspace = 0; static uint64_t nspgsin, nspgsout; GETSYSCTL("vfs.bufspace", bufspace); GETSYSCTL("vm.stats.vm.v_active_count", memory_stats[0]); GETSYSCTL("vm.stats.vm.v_inactive_count", memory_stats[1]); GETSYSCTL("vm.stats.vm.v_laundry_count", memory_stats[2]); GETSYSCTL("vm.stats.vm.v_wire_count", memory_stats[3]); GETSYSCTL("vm.stats.vm.v_free_count", memory_stats[5]); GETSYSCTL("vm.stats.vm.v_swappgsin", nspgsin); 
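The swap "In"/"Out" figures are built from the cumulative v_swappgsin/v_swappgsout counters: the previous sample is remembered and only the difference is reported, with the first interval treated as zero. A FreeBSD-specific sketch of that snapshot/delta pattern (the 4 KiB page size in the conversion is an assumption for the example only):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    uint64_t cur, prev = 0;
    size_t len;
    int first = 1, i;

    for (i = 0; i < 3; i++) {
        len = sizeof(cur);
        if (sysctlbyname("vm.stats.vm.v_swappgsin", &cur, &len,
            NULL, 0) == -1 || len != sizeof(cur)) {
            perror("sysctlbyname");
            return (1);
        }
        /* cumulative counter: report growth since the previous sample */
        printf("pages swapped in since last sample: %llu (%llu KiB)\n",
            first ? 0ULL : (unsigned long long)(cur - prev),
            first ? 0ULL : (unsigned long long)(cur - prev) * 4);
        prev = cur;
        first = 0;
        sleep(1);
    }
    return (0);
}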
GETSYSCTL("vm.stats.vm.v_swappgsout", nspgsout); /* convert memory stats to Kbytes */ memory_stats[0] = pagetok(memory_stats[0]); memory_stats[1] = pagetok(memory_stats[1]); memory_stats[2] = pagetok(memory_stats[2]); memory_stats[3] = pagetok(memory_stats[3]); memory_stats[4] = bufspace / 1024; memory_stats[5] = pagetok(memory_stats[5]); memory_stats[6] = -1; /* first interval */ if (swappgsin < 0) { swap_stats[4] = 0; swap_stats[5] = 0; } /* compute differences between old and new swap statistic */ else { swap_stats[4] = pagetok(((nspgsin - swappgsin))); swap_stats[5] = pagetok(((nspgsout - swappgsout))); } swappgsin = nspgsin; swappgsout = nspgsout; /* call CPU heavy swapmode() only for changes */ if (swap_stats[4] > 0 || swap_stats[5] > 0 || swap_delay == 0) { swap_stats[3] = swapmode(&swapavail, &swapfree); swap_stats[0] = swapavail; swap_stats[1] = swapavail - swapfree; swap_stats[2] = swapfree; } swap_delay = 1; swap_stats[6] = -1; } if (arc_enabled) { GETSYSCTL("kstat.zfs.misc.arcstats.size", arc_stat); arc_stats[0] = arc_stat >> 10; GETSYSCTL("vfs.zfs.mfu_size", arc_stat); arc_stats[1] = arc_stat >> 10; GETSYSCTL("vfs.zfs.mru_size", arc_stat); arc_stats[2] = arc_stat >> 10; GETSYSCTL("vfs.zfs.anon_size", arc_stat); arc_stats[3] = arc_stat >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.hdr_size", arc_stat); GETSYSCTL("kstat.zfs.misc.arcstats.l2_hdr_size", arc_stat2); arc_stats[4] = (arc_stat + arc_stat2) >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.bonus_size", arc_stat); arc_stats[5] = arc_stat >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.dnode_size", arc_stat); arc_stats[5] += arc_stat >> 10; GETSYSCTL("kstat.zfs.misc.arcstats.dbuf_size", arc_stat); arc_stats[5] += arc_stat >> 10; si->arc = arc_stats; } if (carc_enabled) { GETSYSCTL("kstat.zfs.misc.arcstats.compressed_size", arc_stat); carc_stats[0] = arc_stat >> 10; carc_stats[2] = arc_stat >> 10; /* For ratio */ GETSYSCTL("kstat.zfs.misc.arcstats.uncompressed_size", arc_stat); carc_stats[1] = arc_stat >> 10; si->carc = carc_stats; } /* set arrays and strings */ if (pcpu_stats) { si->cpustates = pcpu_cpu_states; si->ncpus = ncpus; } else { si->cpustates = cpu_states; si->ncpus = 1; } si->memory = memory_stats; si->swap = swap_stats; if (lastpid > 0) { si->last_pid = lastpid; } else { si->last_pid = -1; } /* * Print how long system has been up. * (Found by looking getting "boottime" from the kernel) */ mib[0] = CTL_KERN; mib[1] = KERN_BOOTTIME; size = sizeof(boottime); if (sysctl(mib, nitems(mib), &boottime, &size, NULL, 0) != -1 && boottime.tv_sec != 0) { si->boottime = boottime; } else { si->boottime.tv_sec = -1; } } #define NOPROC ((void *)-1) /* * We need to compare data from the old process entry with the new * process entry. * To facilitate doing this quickly we stash a pointer in the kinfo_proc * structure to cache the mapping. We also use a negative cache pointer * of NOPROC to avoid duplicate lookups. * XXX: this could be done when the actual processes are fetched, we do * it here out of laziness. */ static const struct kinfo_proc * get_old_proc(struct kinfo_proc *pp) { const struct kinfo_proc * const *oldpp, *oldp; /* * If this is the first fetch of the kinfo_procs then we don't have * any previous entries. */ if (previous_proc_count == 0) return (NULL); /* negative cache? */ if (pp->ki_udata == NOPROC) return (NULL); /* cached? */ if (pp->ki_udata != NULL) return (pp->ki_udata); /* * Not cached, * 1) look up based on pid. * 2) compare process start. * If we fail here, then setup a negative cache entry, otherwise * cache it. 
*/ oldpp = bsearch(&pp, previous_pref, previous_proc_count, sizeof(*previous_pref), ps.thread ? compare_tid : compare_pid); if (oldpp == NULL) { pp->ki_udata = NOPROC; return (NULL); } oldp = *oldpp; if (memcmp(&oldp->ki_start, &pp->ki_start, sizeof(pp->ki_start)) != 0) { pp->ki_udata = NOPROC; return (NULL); } pp->ki_udata = __DECONST(void *, oldp); return (oldp); } /* * Return the total amount of IO done in blocks in/out and faults. * store the values individually in the pointers passed in. */ static long get_io_stats(const struct kinfo_proc *pp, long *inp, long *oup, long *flp, long *vcsw, long *ivcsw) { const struct kinfo_proc *oldp; static struct kinfo_proc dummy; long ret; oldp = get_old_proc(__DECONST(struct kinfo_proc *, pp)); if (oldp == NULL) { memset(&dummy, 0, sizeof(dummy)); oldp = &dummy; } *inp = RU(pp)->ru_inblock - RU(oldp)->ru_inblock; *oup = RU(pp)->ru_oublock - RU(oldp)->ru_oublock; *flp = RU(pp)->ru_majflt - RU(oldp)->ru_majflt; *vcsw = RU(pp)->ru_nvcsw - RU(oldp)->ru_nvcsw; *ivcsw = RU(pp)->ru_nivcsw - RU(oldp)->ru_nivcsw; ret = (RU(pp)->ru_inblock - RU(oldp)->ru_inblock) + (RU(pp)->ru_oublock - RU(oldp)->ru_oublock) + (RU(pp)->ru_majflt - RU(oldp)->ru_majflt); return (ret); } /* * If there was a previous update, use the delta in ki_runtime over * the previous interval to calculate pctcpu. Otherwise, fall back * to using the kernel's ki_pctcpu. */ static double proc_calc_pctcpu(struct kinfo_proc *pp) { const struct kinfo_proc *oldp; if (previous_interval != 0) { oldp = get_old_proc(pp); if (oldp != NULL) return ((double)(pp->ki_runtime - oldp->ki_runtime) / previous_interval); /* * If this process/thread was created during the previous * interval, charge it's total runtime to the previous * interval. */ else if (pp->ki_start.tv_sec > previous_wall_time.tv_sec || (pp->ki_start.tv_sec == previous_wall_time.tv_sec && pp->ki_start.tv_usec >= previous_wall_time.tv_usec)) return ((double)pp->ki_runtime / previous_interval); } return (pctdouble(pp->ki_pctcpu)); } /* * Return true if this process has used any CPU time since the * previous update. */ static int proc_used_cpu(struct kinfo_proc *pp) { const struct kinfo_proc *oldp; oldp = get_old_proc(pp); if (oldp == NULL) return (PCTCPU(pp) != 0); return (pp->ki_runtime != oldp->ki_runtime || RU(pp)->ru_nvcsw != RU(oldp)->ru_nvcsw || RU(pp)->ru_nivcsw != RU(oldp)->ru_nivcsw); } /* * Return the total number of block in/out and faults by a process. */ static long get_io_total(const struct kinfo_proc *pp) { long dummy; return (get_io_stats(pp, &dummy, &dummy, &dummy, &dummy, &dummy)); } static struct handle handle; void * get_process_info(struct system_info *si, struct process_select *sel, int (*compare)(const void *, const void *)) { int i; int total_procs; long p_io; long p_inblock, p_oublock, p_majflt, p_vcsw, p_ivcsw; long nsec; int active_procs; struct kinfo_proc **prefp; struct kinfo_proc *pp; struct timespec previous_proc_uptime; /* * If thread state was toggled, don't cache the previous processes. */ if (previous_thread != sel->thread) nproc = 0; previous_thread = sel->thread; /* * Save the previous process info. 
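proc_calc_pctcpu() divides the growth in ki_runtime by the elapsed uptime interval, both in microseconds, and falls back to the kernel's ki_pctcpu only when there is no previous sample. A small numeric illustration with made-up figures:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t runtime_prev = 12 * 1000000;           /* 12 s of CPU so far */
    uint64_t runtime_now  = 12 * 1000000 + 480000;  /* +0.48 s this pass */
    uint64_t interval_us  = 2 * 1000000;            /* 2 s between samples */

    /* fraction of one CPU used during the interval */
    printf("%.1f%% of a CPU\n",
        100.0 * (runtime_now - runtime_prev) / interval_us);
    return (0);
}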
*/ if (previous_proc_count_max < nproc) { free(previous_procs); previous_procs = calloc(nproc, sizeof(*previous_procs)); free(previous_pref); previous_pref = calloc(nproc, sizeof(*previous_pref)); if (previous_procs == NULL || previous_pref == NULL) { fprintf(stderr, "top: Out of memory.\n"); quit(TOP_EX_SYS_ERROR); } previous_proc_count_max = nproc; } if (nproc) { for (i = 0; i < nproc; i++) previous_pref[i] = &previous_procs[i]; memcpy(previous_procs, pbase, nproc * sizeof(*previous_procs)); qsort(previous_pref, nproc, sizeof(*previous_pref), ps.thread ? compare_tid : compare_pid); } previous_proc_count = nproc; previous_proc_uptime = proc_uptime; previous_wall_time = proc_wall_time; previous_interval = 0; pbase = kvm_getprocs(kd, sel->thread ? KERN_PROC_ALL : KERN_PROC_PROC, 0, &nproc); gettimeofday(&proc_wall_time, NULL); if (clock_gettime(CLOCK_UPTIME, &proc_uptime) != 0) memset(&proc_uptime, 0, sizeof(proc_uptime)); else if (previous_proc_uptime.tv_sec != 0 && previous_proc_uptime.tv_nsec != 0) { previous_interval = (proc_uptime.tv_sec - previous_proc_uptime.tv_sec) * 1000000; nsec = proc_uptime.tv_nsec - previous_proc_uptime.tv_nsec; if (nsec < 0) { previous_interval -= 1000000; nsec += 1000000000; } previous_interval += nsec / 1000; } if (nproc > onproc) { pref = realloc(pref, sizeof(*pref) * nproc); pcpu = realloc(pcpu, sizeof(*pcpu) * nproc); onproc = nproc; } if (pref == NULL || pbase == NULL || pcpu == NULL) { fprintf(stderr, "top: Out of memory.\n"); quit(TOP_EX_SYS_ERROR); } /* get a pointer to the states summary array */ si->procstates = process_states; /* count up process states and get pointers to interesting procs */ total_procs = 0; active_procs = 0; total_inblock = 0; total_oublock = 0; total_majflt = 0; memset(process_states, 0, sizeof(process_states)); prefp = pref; for (pp = pbase, i = 0; i < nproc; pp++, i++) { if (pp->ki_stat == 0) /* not in use */ continue; if (!sel->self && pp->ki_pid == mypid && sel->pid == -1) /* skip self */ continue; if (!sel->system && (pp->ki_flag & P_SYSTEM) && sel->pid == -1) /* skip system process */ continue; p_io = get_io_stats(pp, &p_inblock, &p_oublock, &p_majflt, &p_vcsw, &p_ivcsw); total_inblock += p_inblock; total_oublock += p_oublock; total_majflt += p_majflt; total_procs++; process_states[(unsigned char)pp->ki_stat]++; if (pp->ki_stat == SZOMB) /* skip zombies */ continue; if (!sel->kidle && pp->ki_tdflags & TDF_IDLETD && sel->pid == -1) /* skip kernel idle process */ continue; PCTCPU(pp) = proc_calc_pctcpu(pp); if (sel->thread && PCTCPU(pp) > 1.0) PCTCPU(pp) = 1.0; if (displaymode == DISP_CPU && !sel->idle && (!proc_used_cpu(pp) || pp->ki_stat == SSTOP || pp->ki_stat == SIDL)) /* skip idle or non-running processes */ continue; if (displaymode == DISP_IO && !sel->idle && p_io == 0) /* skip processes that aren't doing I/O */ continue; if (sel->jid != -1 && pp->ki_jid != sel->jid) /* skip proc. that don't belong to the selected JID */ continue; if (sel->uid[0] != -1 && !find_uid(pp->ki_ruid, sel->uid)) /* skip proc. 
that don't belong to the selected UID */ continue; if (sel->pid != -1 && pp->ki_pid != sel->pid) continue; *prefp++ = pp; active_procs++; } /* if requested, sort the "interesting" processes */ if (compare != NULL) qsort(pref, active_procs, sizeof(*pref), compare); /* remember active and total counts */ si->p_total = total_procs; si->p_pactive = pref_len = active_procs; /* pass back a handle */ handle.next_proc = pref; handle.remaining = active_procs; return (&handle); } char * format_next_process(struct handle * xhandle, char *(*get_userid)(int), int flags) { struct kinfo_proc *pp; const struct kinfo_proc *oldp; long cputime; char status[22]; size_t state; struct rusage ru, *rup; long p_tot, s_tot; char *cmdbuf = NULL; char **args; static struct sbuf* procbuf = NULL; /* clean up from last time. */ if (procbuf != NULL) { sbuf_clear(procbuf); } else { procbuf = sbuf_new_auto(); } /* find and remember the next proc structure */ pp = *(xhandle->next_proc++); xhandle->remaining--; /* get the process's command name */ if ((pp->ki_flag & P_INMEM) == 0) { /* * Print swapped processes as */ size_t len; len = strlen(pp->ki_comm); if (len > sizeof(pp->ki_comm) - 3) len = sizeof(pp->ki_comm) - 3; memmove(pp->ki_comm + 1, pp->ki_comm, len); pp->ki_comm[0] = '<'; pp->ki_comm[len + 1] = '>'; pp->ki_comm[len + 2] = '\0'; } /* * Convert the process's runtime from microseconds to seconds. This * time includes the interrupt time although that is not wanted here. * ps(1) is similarly sloppy. */ cputime = (pp->ki_runtime + 500000) / 1000000; /* generate "STATE" field */ switch (state = pp->ki_stat) { case SRUN: if (smpmode && pp->ki_oncpu != NOCPU) sprintf(status, "CPU%d", pp->ki_oncpu); else strcpy(status, "RUN"); break; case SLOCK: if (pp->ki_kiflag & KI_LOCKBLOCK) { sprintf(status, "*%.6s", pp->ki_lockname); break; } /* fall through */ case SSLEEP: sprintf(status, "%.6s", pp->ki_wmesg); break; default: if (state < nitems(state_abbrev)) { sprintf(status, "%.6s", state_abbrev[state]); } else { sprintf(status, "?%5zu", state); } break; } cmdbuf = calloc(screen_width + 1, 1); if (cmdbuf == NULL) { warn("calloc(%d)", screen_width + 1); return NULL; } if (!(flags & FMT_SHOWARGS)) { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) { snprintf(cmdbuf, screen_width, "%s{%s%s}", pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); } else { snprintf(cmdbuf, screen_width, "%s", pp->ki_comm); } } else { if (pp->ki_flag & P_SYSTEM || (args = kvm_getargv(kd, pp, screen_width)) == NULL || !(*args)) { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) { snprintf(cmdbuf, screen_width, "[%s{%s%s}]", pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); } else { snprintf(cmdbuf, screen_width, "[%s]", pp->ki_comm); } } else { const char *src; char *dst, *argbuf; const char *cmd; size_t argbuflen; size_t len; argbuflen = screen_width * 4; argbuf = calloc(argbuflen + 1, 1); if (argbuf == NULL) { warn("calloc(%zu)", argbuflen + 1); free(cmdbuf); return NULL; } dst = argbuf; /* Extract cmd name from argv */ cmd = basename(*args); for (; (src = *args++) != NULL; ) { if (*src == '\0') continue; len = (argbuflen - (dst - argbuf) - 1) / 4; strvisx(dst, src, MIN(strlen(src), len), - VIS_NL | VIS_CSTYLE); + VIS_NL | VIS_CSTYLE | VIS_OCTAL | VIS_SAFE); while (*dst != '\0') dst++; if ((argbuflen - (dst - argbuf) - 1) / 4 > 0) *dst++ = ' '; /* add delimiting space */ } if (dst != argbuf && dst[-1] == ' ') dst--; *dst = '\0'; if (strcmp(cmd, pp->ki_comm) != 0) { if (ps.thread && pp->ki_flag & P_HADTHREADS && 
pp->ki_tdname[0]) snprintf(cmdbuf, screen_width, "%s (%s){%s%s}", argbuf, pp->ki_comm, pp->ki_tdname, pp->ki_moretdname); else snprintf(cmdbuf, screen_width, "%s (%s)", argbuf, pp->ki_comm); } else { if (ps.thread && pp->ki_flag & P_HADTHREADS && pp->ki_tdname[0]) snprintf(cmdbuf, screen_width, "%s{%s%s}", argbuf, pp->ki_tdname, pp->ki_moretdname); else strlcpy(cmdbuf, argbuf, screen_width); } free(argbuf); } } if (displaymode == DISP_IO) { oldp = get_old_proc(pp); if (oldp != NULL) { ru.ru_inblock = RU(pp)->ru_inblock - RU(oldp)->ru_inblock; ru.ru_oublock = RU(pp)->ru_oublock - RU(oldp)->ru_oublock; ru.ru_majflt = RU(pp)->ru_majflt - RU(oldp)->ru_majflt; ru.ru_nvcsw = RU(pp)->ru_nvcsw - RU(oldp)->ru_nvcsw; ru.ru_nivcsw = RU(pp)->ru_nivcsw - RU(oldp)->ru_nivcsw; rup = &ru; } else { rup = RU(pp); } p_tot = rup->ru_inblock + rup->ru_oublock + rup->ru_majflt; s_tot = total_inblock + total_oublock + total_majflt; sbuf_printf(procbuf, "%5d ", (ps.thread_id) ? pp->ki_tid : pp->ki_pid); if (ps.jail) { sbuf_printf(procbuf, "%*d ", TOP_JID_LEN - 1, pp->ki_jid); } sbuf_printf(procbuf, "%-*.*s", namelength, namelength, (*get_userid)(pp->ki_ruid)); sbuf_printf(procbuf, "%6ld ", rup->ru_nvcsw); sbuf_printf(procbuf, "%6ld ", rup->ru_nivcsw); sbuf_printf(procbuf, "%6ld ", rup->ru_inblock); sbuf_printf(procbuf, "%6ld ", rup->ru_oublock); sbuf_printf(procbuf, "%6ld ", rup->ru_majflt); sbuf_printf(procbuf, "%6ld ", p_tot); sbuf_printf(procbuf, "%6.2f%% ", s_tot == 0 ? 0.0 : (p_tot * 100.0 / s_tot)); } else { sbuf_printf(procbuf, "%5d ", (ps.thread_id) ? pp->ki_tid : pp->ki_pid); if (ps.jail) { sbuf_printf(procbuf, "%*d ", TOP_JID_LEN - 1, pp->ki_jid); } sbuf_printf(procbuf, "%-*.*s ", namelength, namelength, (*get_userid)(pp->ki_ruid)); if (!ps.thread) { sbuf_printf(procbuf, "%4d ", pp->ki_numthreads); } else { sbuf_printf(procbuf, " "); } sbuf_printf(procbuf, "%3d ", pp->ki_pri.pri_level - PZERO); sbuf_printf(procbuf, "%4s", format_nice(pp)); sbuf_printf(procbuf, "%7s ", format_k(PROCSIZE(pp))); sbuf_printf(procbuf, "%6s ", format_k(pagetok(pp->ki_rssize))); if (ps.swap) { sbuf_printf(procbuf, "%*s ", TOP_SWAP_LEN - 1, format_k(pagetok(ki_swap(pp)))); } sbuf_printf(procbuf, "%-6.6s ", status); if (smpmode) { int cpu; if (state == SRUN && pp->ki_oncpu != NOCPU) { cpu = pp->ki_oncpu; } else { cpu = pp->ki_lastcpu; } sbuf_printf(procbuf, "%3d ", cpu); } sbuf_printf(procbuf, "%6s ", format_time(cputime)); sbuf_printf(procbuf, "%6.2f%% ", ps.wcpu ? 100.0 * weighted_cpu(PCTCPU(pp), pp) : 100.0 * PCTCPU(pp)); } - sbuf_printf(procbuf, "%s", printable(cmdbuf)); + sbuf_printf(procbuf, "%s", cmdbuf); free(cmdbuf); return (sbuf_data(procbuf)); } static void getsysctl(const char *name, void *ptr, size_t len) { size_t nlen = len; if (sysctlbyname(name, ptr, &nlen, NULL, 0) == -1) { fprintf(stderr, "top: sysctl(%s...) failed: %s\n", name, strerror(errno)); quit(TOP_EX_SYS_ERROR); } if (nlen != len) { fprintf(stderr, "top: sysctl(%s...) expected %lu, got %lu\n", name, (unsigned long)len, (unsigned long)nlen); quit(TOP_EX_SYS_ERROR); } } static const char * format_nice(const struct kinfo_proc *pp) { const char *fifo, *kproc; int rtpri; static char nicebuf[4 + 1]; fifo = PRI_NEED_RR(pp->ki_pri.pri_class) ? "" : "F"; kproc = (pp->ki_flag & P_KPROC) ? 
"k" : ""; switch (PRI_BASE(pp->ki_pri.pri_class)) { case PRI_ITHD: return ("-"); case PRI_REALTIME: /* * XXX: the kernel doesn't tell us the original rtprio and * doesn't really know what it was, so to recover it we * must be more chummy with the implementation than the * implementation is with itself. pri_user gives a * constant "base" priority, but is only initialized * properly for user threads. pri_native gives what the * kernel calls the "base" priority, but it isn't constant * since it is changed by priority propagation. pri_native * also isn't properly initialized for all threads, but it * is properly initialized for kernel realtime and idletime * threads. Thus we use pri_user for the base priority of * user threads (it is always correct) and pri_native for * the base priority of kernel realtime and idletime threads * (there is nothing better, and it is usually correct). * * The field width and thus the buffer are too small for * values like "kr31F", but such values shouldn't occur, * and if they do then the tailing "F" is not displayed. */ rtpri = ((pp->ki_flag & P_KPROC) ? pp->ki_pri.pri_native : pp->ki_pri.pri_user) - PRI_MIN_REALTIME; snprintf(nicebuf, sizeof(nicebuf), "%sr%d%s", kproc, rtpri, fifo); break; case PRI_TIMESHARE: if (pp->ki_flag & P_KPROC) return ("-"); snprintf(nicebuf, sizeof(nicebuf), "%d", pp->ki_nice - NZERO); break; case PRI_IDLE: /* XXX: as above. */ rtpri = ((pp->ki_flag & P_KPROC) ? pp->ki_pri.pri_native : pp->ki_pri.pri_user) - PRI_MIN_IDLE; snprintf(nicebuf, sizeof(nicebuf), "%si%d%s", kproc, rtpri, fifo); break; default: return ("?"); } return (nicebuf); } /* comparison routines for qsort */ static int compare_pid(const void *p1, const void *p2) { const struct kinfo_proc * const *pp1 = p1; const struct kinfo_proc * const *pp2 = p2; assert((*pp2)->ki_pid >= 0 && (*pp1)->ki_pid >= 0); return ((*pp1)->ki_pid - (*pp2)->ki_pid); } static int compare_tid(const void *p1, const void *p2) { const struct kinfo_proc * const *pp1 = p1; const struct kinfo_proc * const *pp2 = p2; assert((*pp2)->ki_tid >= 0 && (*pp1)->ki_tid >= 0); return ((*pp1)->ki_tid - (*pp2)->ki_tid); } /* * proc_compare - comparison function for "qsort" * Compares the resource consumption of two processes using five * distinct keys. The keys (in descending order of importance) are: * percent cpu, cpu ticks, state, resident set size, total virtual * memory usage. The process states are ordered as follows (from least * to most important): WAIT, zombie, sleep, stop, start, run. The * array declaration below maps a process state index into a number * that reflects this ordering. */ static int sorted_state[] = { 0, /* not used */ 3, /* sleep */ 1, /* ABANDONED (WAIT) */ 6, /* run */ 5, /* start */ 2, /* zombie */ 4 /* stop */ }; #define ORDERKEY_PCTCPU(a, b) do { \ double diff; \ if (ps.wcpu) \ diff = weighted_cpu(PCTCPU((b)), (b)) - \ weighted_cpu(PCTCPU((a)), (a)); \ else \ diff = PCTCPU((b)) - PCTCPU((a)); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_CPTICKS(a, b) do { \ int64_t diff = (int64_t)(b)->ki_runtime - (int64_t)(a)->ki_runtime; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_STATE(a, b) do { \ int diff = sorted_state[(unsigned char)(b)->ki_stat] - sorted_state[(unsigned char)(a)->ki_stat]; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_PRIO(a, b) do { \ int diff = (int)(b)->ki_pri.pri_level - (int)(a)->ki_pri.pri_level; \ if (diff != 0) \ return (diff > 0 ? 
1 : -1); \ } while (0) #define ORDERKEY_THREADS(a, b) do { \ int diff = (int)(b)->ki_numthreads - (int)(a)->ki_numthreads; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_RSSIZE(a, b) do { \ long diff = (long)(b)->ki_rssize - (long)(a)->ki_rssize; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_MEM(a, b) do { \ long diff = (long)PROCSIZE((b)) - (long)PROCSIZE((a)); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_JID(a, b) do { \ int diff = (int)(b)->ki_jid - (int)(a)->ki_jid; \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) #define ORDERKEY_SWAP(a, b) do { \ int diff = (int)ki_swap(b) - (int)ki_swap(a); \ if (diff != 0) \ return (diff > 0 ? 1 : -1); \ } while (0) /* compare_cpu - the comparison function for sorting by cpu percentage */ static int compare_cpu(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_size - the comparison function for sorting by total memory usage */ static int compare_size(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_MEM(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); return (0); } /* compare_res - the comparison function for sorting by resident set size */ static int compare_res(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); return (0); } /* compare_time - the comparison function for sorting by total cpu time */ static int compare_time(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *) arg2; ORDERKEY_CPTICKS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_prio - the comparison function for sorting by priority */ static int compare_prio(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_PRIO(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_threads - the comparison function for sorting by threads */ static int compare_threads(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_THREADS(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_jid - the comparison function for sorting by jid */ static int compare_jid(const void *arg1, const void 
*arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_JID(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* compare_swap - the comparison function for sorting by swap */ static int compare_swap(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; ORDERKEY_SWAP(p1, p2); ORDERKEY_PCTCPU(p1, p2); ORDERKEY_CPTICKS(p1, p2); ORDERKEY_STATE(p1, p2); ORDERKEY_PRIO(p1, p2); ORDERKEY_RSSIZE(p1, p2); ORDERKEY_MEM(p1, p2); return (0); } /* assorted comparison functions for sorting by i/o */ static int compare_iototal(const void *arg1, const void *arg2) { const struct kinfo_proc * const p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc * const p2 = *(const struct kinfo_proc * const *)arg2; return (get_io_total(p2) - get_io_total(p1)); } static int compare_ioread(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; long dummy, inp1, inp2; (void) get_io_stats(p1, &inp1, &dummy, &dummy, &dummy, &dummy); (void) get_io_stats(p2, &inp2, &dummy, &dummy, &dummy, &dummy); return (inp2 - inp1); } static int compare_iowrite(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; long dummy, oup1, oup2; (void) get_io_stats(p1, &dummy, &oup1, &dummy, &dummy, &dummy); (void) get_io_stats(p2, &dummy, &oup2, &dummy, &dummy, &dummy); return (oup2 - oup1); } static int compare_iofault(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &flp1, &dummy, &dummy); (void) get_io_stats(p2, &dummy, &dummy, &flp2, &dummy, &dummy); return (flp2 - flp1); } static int compare_vcsw(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &dummy, &flp1, &dummy); (void) get_io_stats(p2, &dummy, &dummy, &dummy, &flp2, &dummy); return (flp2 - flp1); } static int compare_ivcsw(const void *arg1, const void *arg2) { const struct kinfo_proc *p1 = *(const struct kinfo_proc * const *)arg1; const struct kinfo_proc *p2 = *(const struct kinfo_proc * const *)arg2; long dummy, flp1, flp2; (void) get_io_stats(p1, &dummy, &dummy, &dummy, &dummy, &flp1); (void) get_io_stats(p2, &dummy, &dummy, &dummy, &dummy, &flp2); return (flp2 - flp1); } int (*compares[])(const void *arg1, const void *arg2) = { compare_cpu, compare_size, compare_res, compare_time, compare_prio, compare_threads, compare_iototal, compare_ioread, compare_iowrite, compare_iofault, compare_vcsw, compare_ivcsw, compare_jid, compare_swap, NULL }; static int swapmode(int *retavail, int *retfree) { int n; struct kvm_swap swapary[1]; static int pagesize = 0; static unsigned long swap_maxpages = 0; *retavail = 0; *retfree = 0; #define CONVERT(v) ((quad_t)(v) * pagesize / 1024) n = 
kvm_getswapinfo(kd, swapary, 1, 0); if (n < 0 || swapary[0].ksw_total == 0) return (0); if (pagesize == 0) pagesize = getpagesize(); if (swap_maxpages == 0) GETSYSCTL("vm.swap_maxpages", swap_maxpages); /* ksw_total contains the total size of swap all devices which may exceed the maximum swap size allocatable in the system */ if ( swapary[0].ksw_total > swap_maxpages ) swapary[0].ksw_total = swap_maxpages; *retavail = CONVERT(swapary[0].ksw_total); *retfree = CONVERT(swapary[0].ksw_total - swapary[0].ksw_used); #undef CONVERT n = (int)(swapary[0].ksw_used * 100.0 / swapary[0].ksw_total); return (n); } Index: projects/clang900-import/usr.bin/top/top.1 =================================================================== --- projects/clang900-import/usr.bin/top/top.1 (revision 352586) +++ projects/clang900-import/usr.bin/top/top.1 (revision 352587) @@ -1,445 +1,454 @@ .\" $FreeBSD$ -.Dd October 2, 2018 +.Dd September 21, 2019 .Dt TOP 1 .Os .Sh NAME .Nm top .Nd display and update information about the top cpu processes .Sh SYNOPSIS .Nm .Op Fl CHIPSTabijnpqtuvxz .Op Fl J Ar jail .Op Fl U Ar uid .Op Fl d Ar count .Op Fl m Ar cpu|io .Op Fl s Ar time .Op Fl o Ar field .Op Fl p Ar pid .Op Ar count .Sh DESCRIPTION .Nm displays the top processes on the system and periodically updates this information. If standard output is an intelligent terminal (see below) then as many processes as will fit on the terminal screen are displayed by default. Otherwise, a good number of them are shown (around 20). Raw cpu percentage is used to rank the processes. If .Ar number is given, then the top .Ar number processes will be displayed instead of the default. .Pp .Nm makes a distinction between terminals that support advanced capabilities and those that do not. This distinction affects the choice of defaults for certain options. In the remainder of this document, an \*(lqintelligent\*(rq terminal is one that supports cursor addressing, clear screen, and clear to end of line. Conversely, a \*(lqdumb\*(rq terminal is one that does not support such features. If the output of .Nm is redirected to a file, it acts as if it were being run on a dumb terminal. .Bl -tag -width indent -compact .It Fl C Toggle CPU display mode. By default top displays the weighted CPU percentage in the WCPU column (this is the same value that .Xr ps 1 displays as CPU). Each time .Fl C flag is passed it toggles between \*(lqraw cpu\*(rq mode and \*(lqweighted cpu\*(rq mode, showing the \*(lqCPU\*(rq or the \*(lqWCPU\*(rq column respectively. .It Fl S Show system processes in the display. Normally, system processes such as the pager and the swapper are not shown. This option makes them visible. .It Fl a Display command names derived from the argv[] vector, rather than real executable name. It it useful when you want to watch applications, that puts their status information there. If the real name differs from argv[0], it will be displayed in parenthesis. +Non-printable characters in the command line are +encoded in C-style backslash sequences or +a three digit octal sequences. .It Fl b Use \*(lqbatch\*(rq mode. In this mode, all input from the terminal is ignored. Interrupt characters (such as ^C and ^\e) still have an effect. This is the default on a dumb terminal, or when the output is not a terminal. .It Fl H Display each thread for a multithreaded process individually. By default a single summary line is displayed for each process. .It Fl i Use \*(lqinteractive\*(rq mode. In this mode, any input is immediately read for processing. 
See the section on \*(lqInteractive Mode\*(rq for an explanation of which keys perform what functions. After the command is processed, the screen will immediately be updated, even if the command was not understood. This mode is the default when standard output is an intelligent terminal. .It Fl I Do not display idle processes. By default, top displays both active and idle processes. .It Fl j Display the .Xr jail 8 ID. .It Fl T Toggle displaying thread ID (tid) instead of process id (pid). .It Fl t Do not display the .Nm process itself. .It Fl display Display either 'cpu' or 'io' statistics. Default is 'cpu'. .It Fl n Use \*(lqnon-interactive\*(rq mode. This is identical to \*(lqbatch\*(rq mode. .It Fl P Display per-cpu CPU usage statistics. .It Fl q Renice .Nm to -20 so that it will run faster. This can be used when the system is being very sluggish to improve the possibility of discovering the problem. This option can only be used by root. .It Fl u Do not map uid numbers to usernames. Normally, .Nm will read as much of the file \*(lq/etc/passwd\*(rq as is necessary to map all the user id numbers it encounters into login names. This option disables all that, while possibly decreasing execution time. The uid numbers are displayed instead of the names. .It Fl v Write version number information to stderr then exit immediately. .It Fl w Display approximate swap usage for each process. .It Fl z Do not display the system idle process. .It Fl d Ar count Show only .Ar count displays, then exit. A display is considered to be one update of the screen. The default is 1 for dumb terminals. Note that for .Ar count = 1 no information is available about the percentage of time spent by the CPU in every state. .It Fl s Ar time Set the delay between screen updates to .Ar time seconds. The default delay between updates is 1 second. .It Fl o Ar field Sort the process display area on the specified field. The field name is the name of the column as seen in the output, but in lower case: \*(lqcpu\*(lq, \*(rqsize\*(lq, \*(rqres\*(lq, \*(rqtime\*(lq, \*(rqpri\*(lq, \*(rqthreads\*(lq, \*(lqtotal\*(lq, \*(rqread\*(lq, \*(rqwrite\*(lq, \*(rqfault\*(lq, \*(rqvcsw\*(lq, \*(rqivcsw\*(lq, \*(lqjid\*(lq, \*(rqswap\*(lq or \*(rqpid\*(lq. .It Fl p Ar pid Show only the process .Ar pid . .It Fl J Ar jail Show only those processes owned by .Ar jail . This may be either the .Ar jid or .Ar name of the jail. Use 0 to limit to host processes. Using this option implies .Fl j . .Pp .It Fl U Ar username Show only those processes owned by .Ar username . This option currently only accepts usernames and will not understand uid numbers. .El .Pp Both .Ar count and .Ar number fields can be specified as \*(lqinfinite\*(rq, indicating that they can stretch as far as possible. This is accomplished by using any proper prefix of the keywords \*(lqinfinity\*(rq, \*(lqmaximum\*(rq, or \*(lqall\*(rq. Boolean flags are toggles. A second specification of any of these options will negate the first. .Sh "INTERACTIVE MODE" When .Nm is running in \*(lqinteractive mode\*(rq, it reads commands from the terminal and acts upon them accordingly. In this mode, the terminal is put in \*(lqCBREAK\*(rq, so that a character will be processed as soon as it is typed. Almost always, a key will be pressed when .Nm is between displays; that is, while it is waiting for .Ar time seconds to elapse. If this is the case, the command will be processed and the display will be updated immediately thereafter (reflecting any changes that the command may have specified). 
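As a brief aside to the option list above, a few illustrative invocations; the jail name and the numeric values here are hypothetical examples chosen to match the flags described, not taken from the manual itself:

	# sort by resident set size, refresh every 5 seconds, show 20 processes
	top -o res -s 5 20
	# batch (non-interactive) I/O-mode output, two displays, e.g. for logging
	top -m io -b -d 2
	# show argv[]-derived command names for processes in the jail named "www"
	top -a -J www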
This happens even if the command was incorrect. If a key is pressed while .Nm is in the middle of updating the display, it will finish the update and then process the command. Some commands require additional information, and the user will be prompted accordingly. While typing this information in, the user's erase and kill keys (as set up by the command .Xr stty 1 ) are recognized, and a newline terminates the input. .Pp These commands are currently recognized (^L refers to control-L): .Bl -tag -width indent .It ^L Redraw the screen. .It h Display a summary of the commands (help screen). Version information is included in this display. .It q Quit .Nm .It d Change the number of displays to show (prompt for new number). Remember that the next display counts as one, so typing .It d1 will make .Nm show one final display and then immediately exit. .It m Toggle the display between 'cpu' and 'io' modes. .It n or # Change the number of processes to display (prompt for new number). .It s Change the number of seconds to delay between displays (prompt for new number). .It S Toggle the display of system processes. .It a Toggle the display of process titles. .It k Send a signal (\*(lqkill\*(rq by default) to a list of processes. This acts similarly to the command .Xr kill 1 . .It r Change the priority (the \*(lqnice\*(rq) of a list of processes. This acts similarly to .Xr renice 8 . .It u Display only processes owned by a specific set of usernames (prompt for username). If the username specified is simply \*(lq+\*(rq or \*(lq-\*(rq, then processes belonging to all users will be displayed. Usernames can be added to and removed from the set by prepending them with \*(lq+\*(rq and \*(lq-\*(rq, respectively. .It o Change the order in which the display is sorted. The sort key names include \*(lqcpu\*(rq, \*(lqres\*(rq, \*(lqsize\*(rq, \*(lqtime\*(rq. The default is cpu. .It p Display a specific process (prompt for pid). If the pid specified is simply \*(lq+\*(rq, then show all processes. .It e Display a list of system errors (if any) generated by the last command. .It B H Toggle the display of threads. .It i or I Toggle the display of idle processes. .It j Toggle the display of .Xr jail 8 ID. .It J Display only processes owned by a specific jail (prompt for jail). If the jail specified is simply \*(lq+\*(rq, then processes belonging to all jails and the host will be displayed. This will also enable the display of JID. .It P Toggle the display of per-CPU statistics. .It T Toggle display of TID and PID .It t Toggle the display of the .Nm process. .It w Toggle the display of swap usage. .It z Toggle the display of the system idle process. .El .Sh "THE DISPLAY" The top few lines of the display show general information about the state of the system, including the last process id assigned to a process (on most systems), the three load averages, the current time, the number of existing processes, the number of processes in each state (sleeping, running, starting, zombies, and stopped), and a percentage of time spent in each of the processor states (user, nice, system, and idle). It also includes information about physical and virtual memory allocation. .Pp The remainder of the screen displays information about individual processes. This display is similar in spirit to .Xr ps 1 but it is not exactly the same. 
PID is the process id, JID, when displayed, is the .Xr jail 8 ID corresponding to the process, USERNAME is the name of the process's owner (if .Fl u is specified, a UID column will be substituted for USERNAME), PRI is the current priority of the process, NICE is the .Xr nice 1 amount, SIZE is the total size of the process (text, data, and stack), RES is the current amount of resident memory, SWAP is the approximate amount of swap, if enabled (SIZE, RES and SWAP are given in kilobytes), STATE is the current state (one of \*(lqSTART\*(rq, \*(lqRUN\*(rq (shown as \*(lqCPUn\*(rq on SMP systems), \*(lqSLEEP\*(rq, \*(lqSTOP\*(rq, \*(lqZOMB\*(rq, \*(lqWAIT\*(rq, \*(lqLOCK\*(rq or the event on which the process waits), C is the processor number on which the process is executing (visible only on SMP systems), TIME is the number of system and user cpu seconds that the process has used, WCPU, when displayed, is the weighted cpu percentage (this is the same value that .Xr ps 1 displays as CPU), CPU is the raw percentage and is the field that is sorted to determine the order of the processes, and COMMAND is the name of the command that the process is currently running (if the process is swapped out, this column is marked \*(lq\*(rq). .Pp If a process is in the \*(lqSLEEP\*(rq or \*(lqLOCK\*(rq state, the state column will report the name of the event or lock on which the process is waiting. Lock names are prefixed with an asterisk \*(lq*\*(rq while sleep events are not. .Sh DESCRIPTION OF MEMORY .Bd -literal Mem: 61M Active, 86M Inact, 368K Laundry, 22G Wired, 102G Free ARC: 15G Total, 9303M MFU, 6155M MRU, 1464K Anon, 98M Header, 35M Other 15G Compressed, 27G Uncompressed, 1.75:1 Ratio, 174M Overhead Swap: 4096M Total, 532M Free, 13% Inuse, 80K In, 104K Out .Ed .Ss Physical Memory Stats .Bl -tag -width "Uncompressed" -compact .It Em Active number of bytes active .It Em Inact number of clean bytes inactive .It Em Laundry number of dirty bytes queued for laundering .It Em Wired number of bytes wired down, including IO-level cached file data pages .It Em Buf number of bytes used for IO-level disk caching .It Em Free number of bytes free .El .Ss ZFS ARC Stats These stats are only displayed when the ARC is in use. .Pp .Bl -tag -width "Uncompressed" -compact .It Em Total number of wired bytes used for the ZFS ARC .It Em MRU number of ARC bytes holding most recently used data .It Em MFU number of ARC bytes holding most frequently used data .It Em Anon number of ARC bytes holding in flight data .It Em Header number of ARC bytes holding headers .It Em Other miscellaneous ARC bytes .It Em Compressed bytes of memory used by ARC caches .It Em Uncompressed bytes of data stored in ARC caches before compression .It Em Ratio compression ratio of data cached in the ARC .El .Ss Swap Stats .Bl -tag -width "Uncompressed" -compact .It Em Total total available swap usage .It Em Free total free swap usage .It Em Inuse swap usage .It Em \&In bytes paged in from swap devices (last interval) .It Em Out bytes paged out to swap devices (last interval) .El .Sh ENVIRONMENT .Bl -tag -width "Uncompressed" .It Ev TOP Default set of arguments to .Nm . +.It Ev LC_CTYPE +The locale to use when displaying the +.Va argv +vector when +.Fl a +flag is specified. 
.El .Sh SEE ALSO .Xr kill 1 , .Xr ps 1 , .Xr stty 1 , .Xr getrusage 2 , .Xr humanize_number 3 , .Xr mem 4 , .Xr renice 8 .Sh AUTHORS .An William LeFebvre, EECS Department, Northwestern University .Sh BUGS The command name for swapped processes should be tracked down, but this would make the program run slower. .Pp As with .Xr ps 1 , things can change while .Nm is collecting information for an update. The picture it gives is only a close approximation to reality. Index: projects/clang900-import/usr.bin/top/top.c =================================================================== --- projects/clang900-import/usr.bin/top/top.c (revision 352586) +++ projects/clang900-import/usr.bin/top/top.c (revision 352587) @@ -1,1198 +1,1204 @@ /*- * Top users/processes display for Unix * * This program may be freely redistributed, * but this entire comment MUST remain intact. * * Copyright (c) 1984, 1989, William LeFebvre, Rice University * Copyright (c) 1989 - 1994, William LeFebvre, Northwestern University * Copyright (c) 1994, 1995, William LeFebvre, Argonne National Laboratory * Copyright (c) 1996, William LeFebvre, Group sys Consulting * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include "commands.h" #include "display.h" /* interface to display package */ #include "screen.h" /* interface to screen package */ #include "top.h" #include "machine.h" #include "utils.h" #include "username.h" /* Size of the stdio buffer given to stdout */ #define Buffersize 2048 char copyright[] = "Copyright (c) 1984 through 1996, William LeFebvre"; typedef void sigret_t; /* The buffer that stdio will use */ static char stdoutbuf[Buffersize]; static int fmt_flags = 0; int pcpu_stats = false; /* signal handling routines */ static sigret_t leave(int); static sigret_t tstop(int); static sigret_t top_winch(int); static volatile sig_atomic_t leaveflag; static volatile sig_atomic_t tstopflag; static volatile sig_atomic_t winchflag; /* values which need to be accessed by signal handlers */ static int max_topn; /* maximum displayable processes */ /* miscellaneous things */ struct process_select ps; pid_t mypid; /* pointers to display routines */ static void (*d_loadave)(int mpid, double *avenrun) = i_loadave; static void (*d_procstates)(int total, int *brkdn) = i_procstates; static void (*d_cpustates)(int *states) = i_cpustates; static void (*d_memory)(int *stats) = i_memory; static void (*d_arc)(int *stats) = i_arc; static void (*d_carc)(int *stats) = i_carc; static void (*d_swap)(int *stats) = i_swap; static void (*d_message)(void) = i_message; static void (*d_header)(const char *text) = i_header; static void (*d_process)(int line, char *thisline) = i_process; static void reset_display(void); static const struct option longopts[] = { { "cpu-display-mode", no_argument, NULL, 'C' }, /* differs from orignal */ /* D reserved */ { "thread", no_argument, NULL, 'H' }, { "idle-procs", no_argument, NULL, 'I' }, { "jail", required_argument, NULL, 'J' }, { "per-cpu", no_argument, NULL, 'P' }, { "system-procs", no_argument, NULL, 'S' }, { "thread-id", no_argument, NULL, 'T' }, /* differs from orignal */ { "user", required_argument, NULL, 'U' }, { "all", no_argument, NULL, 'a' }, { "batch", no_argument, NULL, 'b' }, /* c reserved */ { "displays", required_argument, NULL, 'd' }, { "interactive", no_argument, NULL, 'i' }, { "jail-id", no_argument, NULL, 'j' }, { "display-mode", required_argument, NULL, 
'm' }, /* n is identical to batch */ { "sort-order", required_argument, NULL, 'o' }, { "pid", required_argument, NULL, 'p' }, { "quick", no_argument, NULL, 'q' }, { "delay", required_argument, NULL, 's' }, { "threads", no_argument, NULL, 't' }, { "uids", no_argument, NULL, 'u' }, { "version", no_argument, NULL, 'v' }, { "swap", no_argument, NULL, 'w' }, { "system-idle-procs", no_argument, NULL, 'z' }, { NULL, 0, NULL, 0 } }; static void reset_uids(void) { for (size_t i = 0; i < TOP_MAX_UIDS; ++i) ps.uid[i] = -1; } static int add_uid(int uid) { size_t i = 0; /* Add the uid if there's room */ for (; i < TOP_MAX_UIDS; ++i) { if (ps.uid[i] == -1 || ps.uid[i] == uid) { ps.uid[i] = uid; break; } } return (i == TOP_MAX_UIDS); } static void rem_uid(int uid) { size_t i = 0; size_t where = TOP_MAX_UIDS; /* Look for the user to remove - no problem if it's not there */ for (; i < TOP_MAX_UIDS; ++i) { if (ps.uid[i] == -1) break; if (ps.uid[i] == uid) where = i; } /* Make sure we don't leave a hole in the middle */ if (where != TOP_MAX_UIDS) { ps.uid[where] = ps.uid[i-1]; ps.uid[i-1] = -1; } } static int handle_user(char *buf, size_t buflen) { int rc = 0; int uid = -1; char *buf2 = buf; new_message(MT_standout, "Username to show (+ for all): "); if (readline(buf, buflen, false) <= 0) { clear_message(); return (rc); } if (buf[0] == '+' || buf[0] == '-') { if (buf[1] == '\0') { reset_uids(); goto end; } else ++buf2; } if ((uid = userid(buf2)) == -1) { new_message(MT_standout, " %s: unknown user", buf2); rc = 1; goto end; } if (buf2 == buf) { reset_uids(); ps.uid[0] = uid; goto end; } if (buf[0] == '+') { if (add_uid(uid)) { new_message(MT_standout, " too many users, reset with '+'"); rc = 1; goto end; } } else rem_uid(uid); end: putchar('\r'); return (rc); } int main(int argc, const char *argv[]) { int i; int active_procs; struct system_info system_info; struct statics statics; void * processes; static char tempbuf1[50]; static char tempbuf2[50]; sigset_t old_sigmask, new_sigmask; int topn = Infinity; double delay = 2; int displays = 0; /* indicates unspecified */ int sel_ret = 0; time_t curr_time; char *(*get_userid)(int) = username; const char *uname_field = "USERNAME"; const char *header_text; char *env_top; const char **preset_argv; int preset_argc = 0; const char **av = NULL; int ac = -1; bool do_unames = true; char interactive = 2; char warnings = 0; char topn_specified = false; char ch; char no_command = 1; struct timeval timeout; char *order_name = NULL; int order_index = 0; fd_set readfds; char *nptr; /* set the buffer for stdout */ #ifdef DEBUG extern FILE *debug; debug = fopen("debug.run", "w"); setbuffer(stdout, NULL, 0); #else setbuffer(stdout, stdoutbuf, Buffersize); #endif + + if (setlocale(LC_ALL, "") == NULL) { + fprintf(stderr, "invalid locale.\n"); + exit(1); + } mypid = getpid(); /* get our name */ /* initialize some selection options */ ps.idle = true; ps.self = true; ps.system = false; reset_uids(); ps.thread = false; ps.wcpu = 1; ps.jid = -1; ps.jail = false; ps.swap = false; ps.kidle = true; ps.pid = -1; ps.command = NULL; ps.thread_id = false; /* get preset options from the environment */ if ((env_top = getenv("TOP")) != NULL) { av = preset_argv = argparse(env_top, &preset_argc); ac = preset_argc; /* set the dummy argument to an explanatory message, in case getopt encounters a bad argument */ preset_argv[0] = "while processing environment"; } /* process options */ do { /* if we're done doing the presets, then process the real arguments */ if (preset_argc == 0) { ac = argc; av = 
argv; /* this should keep getopt happy... */ optind = 1; } while ((i = getopt_long(ac, __DECONST(char * const *, av), "CSIHPabijJ:nquvzs:d:U:m:o:p:Ttw", longopts, NULL)) != EOF) { switch(i) { case 'v': /* show version number */ errx(0, "version FreeBSD"); break; case 'u': /* toggle uid/username display */ do_unames = !do_unames; break; case 'U': /* display only username's processes */ if ((ps.uid[0] = userid(optarg)) == -1) { errx(1, "%s: unknown user\n", optarg); } break; case 'S': /* show system processes */ ps.system = true; break; case 'I': /* show idle processes */ ps.idle = !ps.idle; break; case 'i': /* go interactive regardless */ interactive = 1; break; case 'n': /* batch, or non-interactive */ case 'b': interactive = 0; break; case 'a': fmt_flags ^= FMT_SHOWARGS; break; case 'd': /* number of displays to show */ if ((i = atoiwi(optarg)) == Invalid || i == 0) { warnx("warning: display count should be positive -- option ignored"); warnings++; } else { displays = i; } break; case 'p': { unsigned long long num; const char *errstr; num = strtonum(optarg, 0, INT_MAX, &errstr); if (errstr != NULL || !find_pid(num)) { fprintf(stderr, "%s: unknown pid\n", optarg); exit(1); } ps.pid = (pid_t)num; ps.system = true; break; } case 's': delay = strtod(optarg, &nptr); if (nptr == optarg) { warnx("warning: invalid delay"); delay = 2; warnings++; } if (delay < 0) { warnx("warning: seconds delay should be positive -- using default"); delay = 2; warnings++; } break; case 'q': /* be quick about it */ errno = 0; i = setpriority(PRIO_PROCESS, 0, PRIO_MIN); if (i == -1 && errno != 0) { warnx("warning: `-q' option failed (%m)"); warnings++; } break; case 'm': /* select display mode */ if (strcmp(optarg, "io") == 0) { displaymode = DISP_IO; } else if (strcmp(optarg, "cpu") == 0) { displaymode = DISP_CPU; } else { errx(1, "warning: `-m' option can only take args 'io' or 'cpu'"); } break; case 'o': /* select sort order */ order_name = optarg; break; case 't': ps.self = !ps.self; break; case 'C': ps.wcpu = !ps.wcpu; break; case 'H': ps.thread = !ps.thread; break; case 'T': ps.thread_id = !ps.thread_id; break; case 'j': ps.jail = !ps.jail; break; case 'J': /* display only jail's processes */ if ((ps.jid = jail_getid(optarg)) == -1) { fprintf(stderr, "%s: unknown jail\n", optarg); exit(1); } ps.jail = 1; break; case 'P': pcpu_stats = !pcpu_stats; break; case 'w': ps.swap = 1; break; case 'z': ps.kidle = !ps.kidle; break; default: errx(1, "[-abCHIijnPqStuvwz] [-d count] [-m io | cpu] [-o field] [-p pid]\n" " [-s time] [-J jail] [-U username] [number]"); } } /* get count of top processes to display (if any) */ if (optind < ac) { if ((topn = atoiwi(av[optind])) == Invalid) { warnx("warning: process display count should be non-negative -- using default"); warnings++; } else { topn_specified = true; } } /* tricky: remember old value of preset_argc & set preset_argc = 0 */ i = preset_argc; preset_argc = 0; /* repeat only if we really did the preset arguments */ } while (i != 0); /* set constants for username/uid display correctly */ if (!do_unames) { uname_field = " UID "; get_userid = itoa7; } /* initialize the kernel memory interface */ if (machine_init(&statics) == -1) { exit(1); } /* determine sorting order index, if necessary */ if (order_name != NULL) { if ((order_index = string_index(order_name, statics.order_names)) == -1) { const char * const *pp; warnx("'%s' is not a recognized sorting order.", order_name); fprintf(stderr, "\tTry one of these:"); pp = statics.order_names; while (*pp != NULL) { 
fprintf(stderr, " %s", *pp++); } fputc('\n', stderr); exit(1); } } /* initialize termcap */ init_termcap(interactive); /* get the string to use for the process area header */ header_text = format_header(uname_field); /* initialize display interface */ if ((max_topn = display_init(&statics)) == -1) { errx(4, "can't allocate sufficient memory"); } /* print warning if user requested more processes than we can display */ if (topn > max_topn) { warnx("warning: this terminal can only display %d processes.", max_topn); warnings++; } /* adjust for topn == Infinity */ if (topn == Infinity) { /* * For smart terminals, infinity really means everything that can * be displayed, or Largest. * On dumb terminals, infinity means every process in the system! * We only really want to do that if it was explicitly specified. * This is always the case when "Default_TOPN != Infinity". But if * topn wasn't explicitly specified and we are on a dumb terminal * and the default is Infinity, then (and only then) we use * "Nominal_TOPN" instead. */ topn = smart_terminal ? Largest : (topn_specified ? Largest : Nominal_TOPN); } /* set header display accordingly */ display_header(topn > 0); /* determine interactive state */ if (interactive == 2) { interactive = smart_terminal; } /* if # of displays not specified, fill it in */ if (displays == 0) { displays = smart_terminal ? Infinity : 1; } /* hold interrupt signals while setting up the screen and the handlers */ sigemptyset(&new_sigmask); sigaddset(&new_sigmask, SIGINT); sigaddset(&new_sigmask, SIGQUIT); sigaddset(&new_sigmask, SIGTSTP); sigprocmask(SIG_BLOCK, &new_sigmask, &old_sigmask); init_screen(); signal(SIGINT, leave); signal(SIGQUIT, leave); signal(SIGTSTP, tstop); signal(SIGWINCH, top_winch); sigprocmask(SIG_SETMASK, &old_sigmask, NULL); if (warnings) { fputs("....", stderr); fflush(stderr); sleep(3 * warnings); fputc('\n', stderr); } restart: /* * main loop -- repeat while display count is positive or while it * indicates infinity (by being -1) */ while ((displays == -1) || (displays-- > 0)) { int (*compare)(const void * const, const void * const); /* get the current stats */ get_system_info(&system_info); compare = compares[order_index]; /* get the current set of processes */ processes = get_process_info(&system_info, &ps, compare); /* display the load averages */ (*d_loadave)(system_info.last_pid, system_info.load_avg); /* display the current time */ /* this method of getting the time SHOULD be fairly portable */ time(&curr_time); i_uptime(&system_info.boottime, &curr_time); i_timeofday(&curr_time); /* display process state breakdown */ (*d_procstates)(system_info.p_total, system_info.procstates); (*d_cpustates)(system_info.cpustates); /* display memory stats */ (*d_memory)(system_info.memory); (*d_arc)(system_info.arc); (*d_carc)(system_info.carc); /* display swap stats */ (*d_swap)(system_info.swap); /* handle message area */ (*d_message)(); /* update the header area */ (*d_header)(header_text); if (topn > 0) { /* determine number of processes to actually display */ /* this number will be the smallest of: active processes, number user requested, number current screen accomodates */ active_procs = system_info.p_pactive; if (active_procs > topn) { active_procs = topn; } if (active_procs > max_topn) { active_procs = max_topn; } /* now show the top "n" processes. 
*/ for (i = 0; i < active_procs; i++) { (*d_process)(i, format_next_process(processes, get_userid, fmt_flags)); } } else { i = 0; } /* do end-screen processing */ u_endscreen(i); /* now, flush the output buffer */ if (fflush(stdout) != 0) { new_message(MT_standout, " Write error on stdout"); putchar('\r'); quit(1); } /* only do the rest if we have more displays to show */ if (displays) { /* switch out for new display on smart terminals */ if (smart_terminal) { if (overstrike) { reset_display(); } else { d_loadave = u_loadave; d_procstates = u_procstates; d_cpustates = u_cpustates; d_memory = u_memory; d_arc = u_arc; d_carc = u_carc; d_swap = u_swap; d_message = u_message; d_header = u_header; d_process = u_process; } } no_command = true; if (!interactive) { usleep(delay * 1e6); if (leaveflag) { end_screen(); exit(0); } } else while (no_command) { /* assume valid command unless told otherwise */ no_command = false; /* set up arguments for select with timeout */ FD_ZERO(&readfds); FD_SET(0, &readfds); /* for standard input */ timeout.tv_sec = delay; timeout.tv_usec = 0; if (leaveflag) { end_screen(); exit(0); } if (tstopflag) { /* move to the lower left */ end_screen(); fflush(stdout); /* default the signal handler action */ signal(SIGTSTP, SIG_DFL); /* unblock the signal and send ourselves one */ sigsetmask(sigblock(0) & ~(1 << (SIGTSTP - 1))); kill(0, SIGTSTP); /* reset the signal handler */ signal(SIGTSTP, tstop); /* reinit screen */ reinit_screen(); reset_display(); tstopflag = 0; goto restart; } if (winchflag) { /* reascertain the screen dimensions */ get_screensize(); /* tell display to resize */ max_topn = display_resize(); /* reset the signal handler */ signal(SIGWINCH, top_winch); reset_display(); winchflag = 0; goto restart; } /* wait for either input or the end of the delay period */ sel_ret = select(2, &readfds, NULL, NULL, &timeout); if (sel_ret < 0 && errno != EINTR) quit(0); if (sel_ret > 0) { int newval; const char *errmsg; const struct command *cptr; /* something to read -- clear the message area first */ clear_message(); /* now read it and convert to command strchr */ /* (use "change" as a temporary to hold strchr) */ if (read(0, &ch, 1) != 1) { /* read error: either 0 or -1 */ new_message(MT_standout, " Read error on stdin"); putchar('\r'); quit(1); } if (ch == '\r' || ch == '\n') { continue; } cptr = all_commands; while (cptr->c != '\0') { if (cptr->c == ch) { break; } cptr++; } if (cptr->c == '\0') { new_message(MT_standout, " Command not understood"); putchar('\r'); no_command = true; } if (overstrike && !cptr->available_to_dumb) { new_message(MT_standout, " Command cannot be handled by this terminal"); putchar('\r'); no_command = true; } if (!no_command) { switch(cptr->id) { case CMD_redraw: /* redraw screen */ reset_display(); break; case CMD_update: /* merely update display */ break; case CMD_quit: quit(0); break; case CMD_help: reset_display(); top_clear(); show_help(); top_standout("Hit any key to continue: "); fflush(stdout); read(0, &ch, 1); break; case CMD_errors: /* show errors */ if (error_count() == 0) { new_message(MT_standout, " Currently no errors to report."); putchar('\r'); no_command = true; } else { reset_display(); top_clear(); show_errors(); top_standout("Hit any key to continue: "); fflush(stdout); read(0, &ch, 1); } break; case CMD_number: new_message(MT_standout, "Number of processes to show: "); newval = readline(tempbuf1, 8, true); if (newval > -1) { if (newval > max_topn) { new_message(MT_standout | MT_delayed, " This terminal can only display 
%d processes.", max_topn); putchar('\r'); } if (newval == 0) { /* inhibit the header */ display_header(false); } else if (newval > topn && topn == 0) { /* redraw the header */ display_header(true); d_header = i_header; } topn = newval; } break; case CMD_delay: /* new seconds delay */ new_message(MT_standout, "Seconds to delay: "); if ((i = readline(tempbuf1, 8, true)) > -1) { if ((delay = i) == 0) { delay = 1; } } clear_message(); break; case CMD_displays: /* change display count */ new_message(MT_standout, "Displays to show (currently %s): ", displays == -1 ? "infinite" : itoa(displays)); if ((i = readline(tempbuf1, 10, true)) > 0) { displays = i; } else if (i == 0) { quit(0); } clear_message(); break; case CMD_kill: /* kill program */ new_message(0, "kill "); if (readline(tempbuf2, sizeof(tempbuf2), false) > 0) { if ((errmsg = kill_procs(tempbuf2)) != NULL) { new_message(MT_standout, "%s", errmsg); putchar('\r'); no_command = true; } } else { clear_message(); } break; case CMD_renice: /* renice program */ new_message(0, "renice "); if (readline(tempbuf2, sizeof(tempbuf2), false) > 0) { if ((errmsg = renice_procs(tempbuf2)) != NULL) { new_message(MT_standout, "%s", errmsg); putchar('\r'); no_command = true; } } else { clear_message(); } break; case CMD_idletog: ps.idle = !ps.idle; new_message(MT_standout | MT_delayed, " %sisplaying idle processes.", ps.idle ? "D" : "Not d"); putchar('\r'); break; case CMD_selftog: ps.self = !ps.self; new_message(MT_standout | MT_delayed, " %sisplaying self.", (ps.self) ? "D" : "Not d"); putchar('\r'); break; case CMD_user: if (handle_user(tempbuf2, sizeof(tempbuf2))) no_command = true; break; case CMD_thrtog: ps.thread = !ps.thread; new_message(MT_standout | MT_delayed, " Displaying threads %s", ps.thread ? "separately" : "as a count"); header_text = format_header(uname_field); reset_display(); putchar('\r'); break; case CMD_toggletid: ps.thread_id = !ps.thread_id; new_message(MT_standout | MT_delayed, " Displaying %s", ps.thread_id ? "tid" : "pid"); header_text = format_header(uname_field); reset_display(); putchar('\r'); break; case CMD_wcputog: ps.wcpu = !ps.wcpu; new_message(MT_standout | MT_delayed, " Displaying %s CPU", ps.wcpu ? "weighted" : "raw"); header_text = format_header(uname_field); reset_display(); putchar('\r'); break; case CMD_viewtog: displaymode = displaymode == DISP_IO ? DISP_CPU : DISP_IO; new_message(MT_standout | MT_delayed, " Displaying %s statistics.", displaymode == DISP_IO ? "IO" : "CPU"); header_text = format_header(uname_field); display_header(true); d_header = i_header; reset_display(); break; case CMD_viewsys: ps.system = !ps.system; new_message(MT_standout | MT_delayed, " %sisplaying system processes.", ps.system ? "D" : "Not d"); break; case CMD_showargs: fmt_flags ^= FMT_SHOWARGS; new_message(MT_standout | MT_delayed, " %sisplaying process arguments.", fmt_flags & FMT_SHOWARGS ? "D" : "Not d"); break; case CMD_order: new_message(MT_standout, "Order to sort: "); if (readline(tempbuf2, sizeof(tempbuf2), false) > 0) { if ((i = string_index(tempbuf2, statics.order_names)) == -1) { new_message(MT_standout, " %s: unrecognized sorting order", tempbuf2); no_command = true; } else { order_index = i; } putchar('\r'); } else { clear_message(); } break; case CMD_jidtog: ps.jail = !ps.jail; new_message(MT_standout | MT_delayed, " %sisplaying jail ID.", ps.jail ? 
"D" : "Not d"); header_text = format_header(uname_field); reset_display(); putchar('\r'); break; case CMD_jail: new_message(MT_standout, "Jail to show (+ for all): "); if (readline(tempbuf2, sizeof(tempbuf2), false) > 0) { if (tempbuf2[0] == '+' && tempbuf2[1] == '\0') { ps.jid = -1; } else if ((i = jail_getid(tempbuf2)) == -1) { new_message(MT_standout, " %s: unknown jail", tempbuf2); no_command = true; } else { ps.jid = i; } if (ps.jail == 0) { ps.jail = 1; new_message(MT_standout | MT_delayed, " Displaying jail " "ID."); header_text = format_header(uname_field); reset_display(); } putchar('\r'); } else { clear_message(); } break; case CMD_kidletog: ps.kidle = !ps.kidle; new_message(MT_standout | MT_delayed, " %sisplaying system idle process.", ps.kidle ? "D" : "Not d"); putchar('\r'); break; case CMD_pcputog: pcpu_stats = !pcpu_stats; new_message(MT_standout | MT_delayed, " Displaying %sCPU statistics.", pcpu_stats ? "per-" : "global "); toggle_pcpustats(); max_topn = display_updatecpus(&statics); reset_display(); putchar('\r'); break; case CMD_swaptog: ps.swap = !ps.swap; new_message(MT_standout | MT_delayed, " %sisplaying per-process swap usage.", ps.swap ? "D" : "Not d"); header_text = format_header(uname_field); reset_display(); putchar('\r'); break; case CMD_pid: new_message(MT_standout, "Process id to show (+ for all): "); if (readline(tempbuf2, sizeof(tempbuf2), false) > 0) { if (tempbuf2[0] == '+' && tempbuf2[1] == '\0') { ps.pid = (pid_t)-1; } else { unsigned long long num; const char *errstr; num = strtonum(tempbuf2, 0, INT_MAX, &errstr); if (errstr != NULL || !find_pid(num)) { new_message(MT_standout, " %s: unknown pid", tempbuf2); no_command = true; } else { ps.pid = (pid_t)num; } } putchar('\r'); } else clear_message(); break; case CMD_NONE: assert(false && "reached switch without command"); } } } /* flush out stuff that may have been written */ fflush(stdout); } } } #ifdef DEBUG fclose(debug); #endif quit(0); } /* * reset_display() - reset all the display routine pointers so that entire * screen will get redrawn. */ static void reset_display(void) { d_loadave = i_loadave; d_procstates = i_procstates; d_cpustates = i_cpustates; d_memory = i_memory; d_arc = i_arc; d_carc = i_carc; d_swap = i_swap; d_message = i_message; d_header = i_header; d_process = i_process; } /* * signal handlers */ static sigret_t leave(int i __unused) /* exit under normal conditions -- INT handler */ { leaveflag = 1; } static sigret_t tstop(int i __unused) /* SIGTSTP handler */ { tstopflag = 1; } static sigret_t top_winch(int i __unused) /* SIGWINCH handler */ { winchflag = 1; } void __dead2 quit(int status) /* exit under duress */ { end_screen(); exit(status); } Index: projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.sh =================================================================== --- projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.sh (revision 352586) +++ projects/clang900-import/usr.sbin/freebsd-update/freebsd-update.sh (revision 352587) @@ -1,3361 +1,3370 @@ #!/bin/sh #- # SPDX-License-Identifier: BSD-2-Clause-FreeBSD # # Copyright 2004-2007 Colin Percival # All rights reserved # # Redistribution and use in source and binary forms, with or without # modification, are permitted providing that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. 
Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # $FreeBSD$ #### Usage function -- called from command-line handling code. # Usage instructions. Options not listed: # --debug -- don't filter output from utilities # --no-stats -- don't show progress statistics while fetching files usage () { cat < ${LINE}" exit 1 fi done < ${CONFFILE} # Merge the settings read from the configuration file with those # provided at the command line. mergeconfig } # Provide some default parameters default_params () { # Save any parameters already configured, and clear the slate saveconfig nullconfig # Default configurations config_WorkDir /var/db/freebsd-update config_MailTo root config_AllowAdd yes config_AllowDelete yes config_KeepModifiedMetadata yes config_BaseDir / config_VerboseLevel stats config_StrictComponents no config_BackupKernel yes config_BackupKernelDir /boot/kernel.old config_BackupKernelSymbolFiles no # Merge these defaults into the earlier-configured settings mergeconfig } # Set utility output filtering options, based on ${VERBOSELEVEL} fetch_setup_verboselevel () { case ${VERBOSELEVEL} in debug) QUIETREDIR="/dev/stderr" QUIETFLAG=" " STATSREDIR="/dev/stderr" DDSTATS=".." XARGST="-t" NDEBUG=" " ;; nostats) QUIETREDIR="" QUIETFLAG="" STATSREDIR="/dev/null" DDSTATS=".." XARGST="" NDEBUG="" ;; stats) QUIETREDIR="/dev/null" QUIETFLAG="-q" STATSREDIR="/dev/stdout" DDSTATS="" XARGST="" NDEBUG="-n" ;; esac } # Perform sanity checks and set some final parameters # in preparation for fetching files. Figure out which # set of updates should be downloaded: If the user is # running *-p[0-9]+, strip off the last part; if the # user is running -SECURITY, call it -RELEASE. Chdir # into the working directory. fetchupgrade_check_params () { export HTTP_USER_AGENT="freebsd-update (${COMMAND}, `uname -r`)" _SERVERNAME_z=\ "SERVERNAME must be given via command line or configuration file." _KEYPRINT_z="Key must be given via -k option or configuration file." _KEYPRINT_bad="Invalid key fingerprint: " _WORKDIR_bad="Directory does not exist or is not writable: " _WORKDIR_bad2="Directory is not on a persistent filesystem: " if [ -z "${SERVERNAME}" ]; then echo -n "`basename $0`: " echo "${_SERVERNAME_z}" exit 1 fi if [ -z "${KEYPRINT}" ]; then echo -n "`basename $0`: " echo "${_KEYPRINT_z}" exit 1 fi if ! echo "${KEYPRINT}" | grep -qE "^[0-9a-f]{64}$"; then echo -n "`basename $0`: " echo -n "${_KEYPRINT_bad}" echo ${KEYPRINT} exit 1 fi if ! 
[ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi case `df -T ${WORKDIR}` in */dev/md[0-9]* | *tmpfs*) echo -n "`basename $0`: " echo -n "${_WORKDIR_bad2}" echo ${WORKDIR} exit 1 ;; esac chmod 700 ${WORKDIR} cd ${WORKDIR} || exit 1 # Generate release number. The s/SECURITY/RELEASE/ bit exists # to provide an upgrade path for FreeBSD Update 1.x users, since # the kernels provided by FreeBSD Update 1.x are always labelled # as X.Y-SECURITY. RELNUM=`uname -r | sed -E 's,-p[0-9]+,,' | sed -E 's,-SECURITY,-RELEASE,'` ARCH=`uname -m` FETCHDIR=${RELNUM}/${ARCH} PATCHDIR=${RELNUM}/${ARCH}/bp # Disallow upgrade from a version that is not a release case ${RELNUM} in *-RELEASE | *-ALPHA* | *-BETA* | *-RC*) ;; *) echo -n "`basename $0`: " cat <<- EOF Cannot upgrade from a version that is not a release (including alpha, beta and release candidates) using `basename $0`. Instead, FreeBSD can be directly upgraded by source or upgraded to a RELEASE/RELENG version prior to running `basename $0`. Currently running: ${RELNUM} EOF exit 1 ;; esac # Figure out what directory contains the running kernel BOOTFILE=`sysctl -n kern.bootfile` KERNELDIR=${BOOTFILE%/kernel} if ! [ -d ${KERNELDIR} ]; then echo "Cannot identify running kernel" exit 1 fi # Figure out what kernel configuration is running. We start with # the output of `uname -i`, and then make the following adjustments: # 1. Replace "SMP-GENERIC" with "SMP". Why the SMP kernel config # file says "ident SMP-GENERIC", I don't know... # 2. If the kernel claims to be GENERIC _and_ ${ARCH} is "amd64" # _and_ `sysctl kern.version` contains a line which ends "/SMP", then # we're running an SMP kernel. This mis-identification is a bug # which was fixed in 6.2-STABLE. KERNCONF=`uname -i` if [ ${KERNCONF} = "SMP-GENERIC" ]; then KERNCONF=SMP fi if [ ${KERNCONF} = "GENERIC" ] && [ ${ARCH} = "amd64" ]; then if sysctl kern.version | grep -qE '/SMP$'; then KERNCONF=SMP fi fi # Define some paths BSPATCH=/usr/bin/bspatch SHA256=/sbin/sha256 PHTTPGET=/usr/libexec/phttpget # Set up variables relating to VERBOSELEVEL fetch_setup_verboselevel # Construct a unique name from ${BASEDIR} BDHASH=`echo ${BASEDIR} | sha256 -q` } # Perform sanity checks etc. before fetching updates. fetch_check_params () { fetchupgrade_check_params if ! [ -z "${TARGETRELEASE}" ]; then echo -n "`basename $0`: " echo -n "-r option is meaningless with 'fetch' command. " echo "(Did you mean 'upgrade' instead?)" exit 1 fi # Check that we have updates ready to install if [ -f ${BDHASH}-install/kerneldone -a $FORCEFETCH -eq 0 ]; then echo "You have a partially completed upgrade pending" echo "Run '$0 install' first." echo "Run '$0 fetch -F' to proceed anyway." exit 1 fi } # Perform sanity checks etc. before fetching upgrades. upgrade_check_params () { fetchupgrade_check_params # Unless set otherwise, we're upgrading to the same kernel config. NKERNCONF=${KERNCONF} # We need TARGETRELEASE set _TARGETRELEASE_z="Release target must be specified via -r option." if [ -z "${TARGETRELEASE}" ]; then echo -n "`basename $0`: " echo "${_TARGETRELEASE_z}" exit 1 fi # The target release should be != the current release. if [ "${TARGETRELEASE}" = "${RELNUM}" ]; then echo -n "`basename $0`: " echo "Cannot upgrade from ${RELNUM} to itself" exit 1 fi # Turning off AllowAdd or AllowDelete is a bad idea for upgrades. 
if [ "${ALLOWADD}" = "no" ]; then echo -n "`basename $0`: " echo -n "WARNING: \"AllowAdd no\" is a bad idea " echo "when upgrading between releases." echo fi if [ "${ALLOWDELETE}" = "no" ]; then echo -n "`basename $0`: " echo -n "WARNING: \"AllowDelete no\" is a bad idea " echo "when upgrading between releases." echo fi # Set EDITOR to /usr/bin/vi if it isn't already set : ${EDITOR:='/usr/bin/vi'} } # Perform sanity checks and set some final parameters in # preparation for installing updates. install_check_params () { # Check that we are root. All sorts of things won't work otherwise. if [ `id -u` != 0 ]; then echo "You must be root to run this." exit 1 fi # Check that securelevel <= 0. Otherwise we can't update schg files. if [ `sysctl -n kern.securelevel` -gt 0 ]; then echo "Updates cannot be installed when the system securelevel" echo "is greater than zero." exit 1 fi # Check that we have a working directory _WORKDIR_bad="Directory does not exist or is not writable: " if ! [ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi cd ${WORKDIR} || exit 1 # Construct a unique name from ${BASEDIR} BDHASH=`echo ${BASEDIR} | sha256 -q` # Check that we have updates ready to install if ! [ -L ${BDHASH}-install ]; then echo "No updates are available to install." if [ $ISFETCHED -eq 0 ]; then echo "Run '$0 fetch' first." exit 1 fi exit 0 fi if ! [ -f ${BDHASH}-install/INDEX-OLD ] || ! [ -f ${BDHASH}-install/INDEX-NEW ]; then echo "Update manifest is corrupt -- this should never happen." echo "Re-run '$0 fetch'." exit 1 fi # Figure out what directory contains the running kernel BOOTFILE=`sysctl -n kern.bootfile` KERNELDIR=${BOOTFILE%/kernel} if ! [ -d ${KERNELDIR} ]; then echo "Cannot identify running kernel" exit 1 fi } # Perform sanity checks and set some final parameters in # preparation for UNinstalling updates. rollback_check_params () { # Check that we are root. All sorts of things won't work otherwise. if [ `id -u` != 0 ]; then echo "You must be root to run this." exit 1 fi # Check that we have a working directory _WORKDIR_bad="Directory does not exist or is not writable: " if ! [ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi cd ${WORKDIR} || exit 1 # Construct a unique name from ${BASEDIR} BDHASH=`echo ${BASEDIR} | sha256 -q` # Check that we have updates ready to rollback if ! [ -L ${BDHASH}-rollback ]; then echo "No rollback directory found." exit 1 fi if ! [ -f ${BDHASH}-rollback/INDEX-OLD ] || ! [ -f ${BDHASH}-rollback/INDEX-NEW ]; then echo "Update manifest is corrupt -- this should never happen." exit 1 fi } # Perform sanity checks and set some final parameters # in preparation for comparing the system against the # published index. Figure out which index we should # compare against: If the user is running *-p[0-9]+, # strip off the last part; if the user is running # -SECURITY, call it -RELEASE. Chdir into the working # directory. IDS_check_params () { export HTTP_USER_AGENT="freebsd-update (${COMMAND}, `uname -r`)" _SERVERNAME_z=\ "SERVERNAME must be given via command line or configuration file." _KEYPRINT_z="Key must be given via -k option or configuration file." 
_KEYPRINT_bad="Invalid key fingerprint: " _WORKDIR_bad="Directory does not exist or is not writable: " if [ -z "${SERVERNAME}" ]; then echo -n "`basename $0`: " echo "${_SERVERNAME_z}" exit 1 fi if [ -z "${KEYPRINT}" ]; then echo -n "`basename $0`: " echo "${_KEYPRINT_z}" exit 1 fi if ! echo "${KEYPRINT}" | grep -qE "^[0-9a-f]{64}$"; then echo -n "`basename $0`: " echo -n "${_KEYPRINT_bad}" echo ${KEYPRINT} exit 1 fi if ! [ -d "${WORKDIR}" -a -w "${WORKDIR}" ]; then echo -n "`basename $0`: " echo -n "${_WORKDIR_bad}" echo ${WORKDIR} exit 1 fi cd ${WORKDIR} || exit 1 # Generate release number. The s/SECURITY/RELEASE/ bit exists # to provide an upgrade path for FreeBSD Update 1.x users, since # the kernels provided by FreeBSD Update 1.x are always labelled # as X.Y-SECURITY. RELNUM=`uname -r | sed -E 's,-p[0-9]+,,' | sed -E 's,-SECURITY,-RELEASE,'` ARCH=`uname -m` FETCHDIR=${RELNUM}/${ARCH} PATCHDIR=${RELNUM}/${ARCH}/bp # Figure out what directory contains the running kernel BOOTFILE=`sysctl -n kern.bootfile` KERNELDIR=${BOOTFILE%/kernel} if ! [ -d ${KERNELDIR} ]; then echo "Cannot identify running kernel" exit 1 fi # Figure out what kernel configuration is running. We start with # the output of `uname -i`, and then make the following adjustments: # 1. Replace "SMP-GENERIC" with "SMP". Why the SMP kernel config # file says "ident SMP-GENERIC", I don't know... # 2. If the kernel claims to be GENERIC _and_ ${ARCH} is "amd64" # _and_ `sysctl kern.version` contains a line which ends "/SMP", then # we're running an SMP kernel. This mis-identification is a bug # which was fixed in 6.2-STABLE. KERNCONF=`uname -i` if [ ${KERNCONF} = "SMP-GENERIC" ]; then KERNCONF=SMP fi if [ ${KERNCONF} = "GENERIC" ] && [ ${ARCH} = "amd64" ]; then if sysctl kern.version | grep -qE '/SMP$'; then KERNCONF=SMP fi fi # Define some paths SHA256=/sbin/sha256 PHTTPGET=/usr/libexec/phttpget # Set up variables relating to VERBOSELEVEL fetch_setup_verboselevel } #### Core functionality -- the actual work gets done here # Use an SRV query to pick a server. If the SRV query doesn't provide # a useful answer, use the server name specified by the user. # Put another way... look up _http._tcp.${SERVERNAME} and pick a server # from that; or if no servers are returned, use ${SERVERNAME}. # This allows a user to specify "portsnap.freebsd.org" (in which case # portsnap will select one of the mirrors) or "portsnap5.tld.freebsd.org" # (in which case portsnap will use that particular server, since there # won't be an SRV entry for that name). # # We ignore the Port field, since we are always going to use port 80. # Fetch the mirror list, but do not pick a mirror yet. Returns 1 if # no mirrors are available for any reason. fetch_pick_server_init () { : > serverlist_tried # Check that host(1) exists (i.e., that the system wasn't built with the # WITHOUT_BIND set) and don't try to find a mirror if it doesn't exist. if ! which -s host; then : > serverlist_full return 1 fi echo -n "Looking up ${SERVERNAME} mirrors... " # Issue the SRV query and pull out the Priority, Weight, and Target fields. # BIND 9 prints "$name has SRV record ..." while BIND 8 prints # "$name server selection ..."; we allow either format. MLIST="_http._tcp.${SERVERNAME}" host -t srv "${MLIST}" | sed -nE "s/${MLIST} (has SRV record|server selection) //Ip" | cut -f 1,2,4 -d ' ' | sed -e 's/\.$//' | sort > serverlist_full # If no records, give up -- we'll just use the server name we were given. if [ `wc -l < serverlist_full` -eq 0 ]; then echo "none found." 
return 1 fi # Report how many mirrors we found. echo `wc -l < serverlist_full` "mirrors found." # Generate a random seed for use in picking mirrors. If HTTP_PROXY # is set, this will be used to generate the seed; otherwise, the seed # will be random. if [ -n "${HTTP_PROXY}${http_proxy}" ]; then RANDVALUE=`sha256 -qs "${HTTP_PROXY}${http_proxy}" | tr -d 'a-f' | cut -c 1-9` else RANDVALUE=`jot -r 1 0 999999999` fi } # Pick a mirror. Returns 1 if we have run out of mirrors to try. fetch_pick_server () { # Generate a list of not-yet-tried mirrors sort serverlist_tried | comm -23 serverlist_full - > serverlist # Have we run out of mirrors? if [ `wc -l < serverlist` -eq 0 ]; then cat <<- EOF No mirrors remaining, giving up. This may be because upgrading from this platform (${ARCH}) or release (${RELNUM}) is unsupported by `basename $0`. Only platforms with Tier 1 support can be upgraded by `basename $0`. See https://www.freebsd.org/platforms/index.html for more info. If unsupported, FreeBSD must be upgraded by source. EOF return 1 fi # Find the highest priority level (lowest numeric value). SRV_PRIORITY=`cut -f 1 -d ' ' serverlist | sort -n | head -1` # Add up the weights of the response lines at that priority level. SRV_WSUM=0; while read X; do case "$X" in ${SRV_PRIORITY}\ *) SRV_W=`echo $X | cut -f 2 -d ' '` SRV_WSUM=$(($SRV_WSUM + $SRV_W)) ;; esac done < serverlist # If all the weights are 0, pretend that they are all 1 instead. if [ ${SRV_WSUM} -eq 0 ]; then SRV_WSUM=`grep -E "^${SRV_PRIORITY} " serverlist | wc -l` SRV_W_ADD=1 else SRV_W_ADD=0 fi # Pick a value between 0 and the sum of the weights - 1 SRV_RND=`expr ${RANDVALUE} % ${SRV_WSUM}` # Read through the list of mirrors and set SERVERNAME. Write the line # corresponding to the mirror we selected into serverlist_tried so that # we won't try it again. while read X; do case "$X" in ${SRV_PRIORITY}\ *) SRV_W=`echo $X | cut -f 2 -d ' '` SRV_W=$(($SRV_W + $SRV_W_ADD)) if [ $SRV_RND -lt $SRV_W ]; then SERVERNAME=`echo $X | cut -f 3 -d ' '` echo "$X" >> serverlist_tried break else SRV_RND=$(($SRV_RND - $SRV_W)) fi ;; esac done < serverlist } # Take a list of ${oldhash}|${newhash} and output a list of needed patches, # i.e., those for which we have ${oldhash} and don't have ${newhash}. fetch_make_patchlist () { grep -vE "^([0-9a-f]{64})\|\1$" | tr '|' ' ' | while read X Y; do if [ -f "files/${Y}.gz" ] || [ ! -f "files/${X}.gz" ]; then continue fi echo "${X}|${Y}" done | sort -u } # Print user-friendly progress statistics fetch_progress () { LNC=0 while read x; do LNC=$(($LNC + 1)) if [ $(($LNC % 10)) = 0 ]; then echo -n $LNC elif [ $(($LNC % 2)) = 0 ]; then echo -n . fi done echo -n " " } # Function for asking the user if everything is ok continuep () { while read -p "Does this look reasonable (y/n)? " CONTINUE; do case "${CONTINUE}" in y*) return 0 ;; n*) return 1 ;; esac done } # Initialize the working directory workdir_init () { mkdir -p files touch tINDEX.present } # Check that we have a public key with an appropriate hash, or # fetch the key if it doesn't exist. Returns 1 if the key has # not yet been fetched. fetch_key () { if [ -r pub.ssl ] && [ `${SHA256} -q pub.ssl` = ${KEYPRINT} ]; then return 0 fi echo -n "Fetching public key from ${SERVERNAME}... " rm -f pub.ssl fetch ${QUIETFLAG} http://${SERVERNAME}/${FETCHDIR}/pub.ssl \ 2>${QUIETREDIR} || true if ! [ -r pub.ssl ]; then echo "failed." return 1 fi if ! [ `${SHA256} -q pub.ssl` = ${KEYPRINT} ]; then echo "key has incorrect hash." rm -f pub.ssl return 1 fi echo "done." 
} # Fetch metadata signature, aka "tag". fetch_tag () { echo -n "Fetching metadata signature " echo ${NDEBUG} "for ${RELNUM} from ${SERVERNAME}... " rm -f latest.ssl fetch ${QUIETFLAG} http://${SERVERNAME}/${FETCHDIR}/latest.ssl \ 2>${QUIETREDIR} || true if ! [ -r latest.ssl ]; then echo "failed." return 1 fi openssl rsautl -pubin -inkey pub.ssl -verify \ < latest.ssl > tag.new 2>${QUIETREDIR} || true rm latest.ssl if ! [ `wc -l < tag.new` = 1 ] || ! grep -qE \ "^freebsd-update\|${ARCH}\|${RELNUM}\|[0-9]+\|[0-9a-f]{64}\|[0-9]{10}" \ tag.new; then echo "invalid signature." return 1 fi echo "done." RELPATCHNUM=`cut -f 4 -d '|' < tag.new` TINDEXHASH=`cut -f 5 -d '|' < tag.new` EOLTIME=`cut -f 6 -d '|' < tag.new` } # Sanity-check the patch number in a tag, to make sure that we're not # going to "update" backwards and to prevent replay attacks. fetch_tagsanity () { # Check that we're not going to move from -pX to -pY with Y < X. RELPX=`uname -r | sed -E 's,.*-,,'` if echo ${RELPX} | grep -qE '^p[0-9]+$'; then RELPX=`echo ${RELPX} | cut -c 2-` else RELPX=0 fi if [ "${RELPATCHNUM}" -lt "${RELPX}" ]; then echo echo -n "Files on mirror (${RELNUM}-p${RELPATCHNUM})" echo " appear older than what" echo "we are currently running (`uname -r`)!" echo "Cowardly refusing to proceed any further." return 1 fi # If "tag" exists and corresponds to ${RELNUM}, make sure that # it contains a patch number <= RELPATCHNUM, in order to protect # against rollback (replay) attacks. if [ -f tag ] && grep -qE \ "^freebsd-update\|${ARCH}\|${RELNUM}\|[0-9]+\|[0-9a-f]{64}\|[0-9]{10}" \ tag; then LASTRELPATCHNUM=`cut -f 4 -d '|' < tag` if [ "${RELPATCHNUM}" -lt "${LASTRELPATCHNUM}" ]; then echo echo -n "Files on mirror (${RELNUM}-p${RELPATCHNUM})" echo " are older than the" echo -n "most recently seen updates" echo " (${RELNUM}-p${LASTRELPATCHNUM})." echo "Cowardly refusing to proceed any further." return 1 fi fi } # Fetch metadata index file fetch_metadata_index () { echo ${NDEBUG} "Fetching metadata index... " rm -f ${TINDEXHASH} fetch ${QUIETFLAG} http://${SERVERNAME}/${FETCHDIR}/t/${TINDEXHASH} 2>${QUIETREDIR} if ! [ -f ${TINDEXHASH} ]; then echo "failed." return 1 fi if [ `${SHA256} -q ${TINDEXHASH}` != ${TINDEXHASH} ]; then echo "update metadata index corrupt." return 1 fi echo "done." } # Print an error message about signed metadata being bogus. fetch_metadata_bogus () { echo echo "The update metadata$1 is correctly signed, but" echo "failed an integrity check." echo "Cowardly refusing to proceed any further." return 1 } # Construct tINDEX.new by merging the lines named in $1 from ${TINDEXHASH} # with the lines not named in $@ from tINDEX.present (if that file exists). fetch_metadata_index_merge () { for METAFILE in $@; do if [ `grep -E "^${METAFILE}\|" ${TINDEXHASH} | wc -l` \ -ne 1 ]; then fetch_metadata_bogus " index" return 1 fi grep -E "${METAFILE}\|" ${TINDEXHASH} done | sort > tINDEX.wanted if [ -f tINDEX.present ]; then join -t '|' -v 2 tINDEX.wanted tINDEX.present | sort -m - tINDEX.wanted > tINDEX.new rm tINDEX.wanted else mv tINDEX.wanted tINDEX.new fi } # Sanity check all the lines of tINDEX.new. Even if more metadata lines # are added by future versions of the server, this won't cause problems, # since the only lines which appear in tINDEX.new are the ones which we # specifically grepped out of ${TINDEXHASH}. fetch_metadata_index_sanity () { if grep -qvE '^[0-9A-Z.-]+\|[0-9a-f]{64}$' tINDEX.new; then fetch_metadata_bogus " index" return 1 fi } # Sanity check the metadata file $1. 
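# For illustration, a well-formed line for a regular file looks roughly
# like this (hypothetical values):
#     world|base|/bin/sh|f|0|0|0555|0|<64-hex sha256>|
# i.e. component, subcomponent, path and type, followed by the four
# numeric metadata fields (owner, group, permissions, flags) and the
# file hash. "d" lines carry the same leading fields but no hash, and
# "L" lines store the symlink target where the hash would be.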
fetch_metadata_sanity () { # Some aliases to save space later: ${P} is a character which can # appear in a path; ${M} is the four numeric metadata fields; and # ${H} is a sha256 hash. P="[-+./:=,%@_[~[:alnum:]]" M="[0-9]+\|[0-9]+\|[0-9]+\|[0-9]+" H="[0-9a-f]{64}" # Check that the first four fields make sense. if gunzip -c < files/$1.gz | grep -qvE "^[a-z]+\|[0-9a-z-]+\|${P}+\|[fdL-]\|"; then fetch_metadata_bogus "" return 1 fi # Remove the first three fields. gunzip -c < files/$1.gz | cut -f 4- -d '|' > sanitycheck.tmp # Sanity check entries with type 'f' if grep -E '^f' sanitycheck.tmp | grep -qvE "^f\|${M}\|${H}\|${P}*\$"; then fetch_metadata_bogus "" return 1 fi # Sanity check entries with type 'd' if grep -E '^d' sanitycheck.tmp | grep -qvE "^d\|${M}\|\|\$"; then fetch_metadata_bogus "" return 1 fi # Sanity check entries with type 'L' if grep -E '^L' sanitycheck.tmp | grep -qvE "^L\|${M}\|${P}*\|\$"; then fetch_metadata_bogus "" return 1 fi # Sanity check entries with type '-' if grep -E '^-' sanitycheck.tmp | grep -qvE "^-\|\|\|\|\|\|"; then fetch_metadata_bogus "" return 1 fi # Clean up rm sanitycheck.tmp } # Fetch the metadata index and metadata files listed in $@, # taking advantage of metadata patches where possible. fetch_metadata () { fetch_metadata_index || return 1 fetch_metadata_index_merge $@ || return 1 fetch_metadata_index_sanity || return 1 # Generate a list of wanted metadata patches join -t '|' -o 1.2,2.2 tINDEX.present tINDEX.new | fetch_make_patchlist > patchlist if [ -s patchlist ]; then # Attempt to fetch metadata patches echo -n "Fetching `wc -l < patchlist | tr -d ' '` " echo ${NDEBUG} "metadata patches.${DDSTATS}" tr '|' '-' < patchlist | lam -s "${FETCHDIR}/tp/" - -s ".gz" | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress echo "done." # Attempt to apply metadata patches echo -n "Applying metadata patches... " tr '|' ' ' < patchlist | while read X Y; do if [ ! -f "${X}-${Y}.gz" ]; then continue; fi gunzip -c < ${X}-${Y}.gz > diff gunzip -c < files/${X}.gz > diff-OLD # Figure out which lines are being added and removed grep -E '^-' diff | cut -c 2- | while read PREFIX; do look "${PREFIX}" diff-OLD done | sort > diff-rm grep -E '^\+' diff | cut -c 2- > diff-add # Generate the new file comm -23 diff-OLD diff-rm | sort - diff-add > diff-NEW if [ `${SHA256} -q diff-NEW` = ${Y} ]; then mv diff-NEW files/${Y} gzip -n files/${Y} else mv diff-NEW ${Y}.bad fi rm -f ${X}-${Y}.gz diff rm -f diff-OLD diff-NEW diff-add diff-rm done 2>${QUIETREDIR} echo "done." fi # Update metadata without patches cut -f 2 -d '|' < tINDEX.new | while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done | sort -u > filelist if [ -s filelist ]; then echo -n "Fetching `wc -l < filelist | tr -d ' '` " echo ${NDEBUG} "metadata files... " lam -s "${FETCHDIR}/m/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${QUIETREDIR} while read Y; do if ! [ -f ${Y}.gz ]; then echo "failed." return 1 fi if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "metadata is corrupt." return 1 fi done < filelist echo "done." fi # Sanity-check the metadata files. cut -f 2 -d '|' tINDEX.new > filelist while read X; do fetch_metadata_sanity ${X} || return 1 done < filelist # Remove files which are no longer needed cut -f 2 -d '|' tINDEX.present | sort > oldfiles cut -f 2 -d '|' tINDEX.new | sort | comm -13 - oldfiles | lam -s "files/" - -s ".gz" | xargs rm -f rm patchlist filelist oldfiles rm ${TINDEXHASH} # We're done! 
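# (For reference: each line of the resulting tINDEX.present names one
# metadata file and its hash, e.g. a hypothetical
#     INDEX-NEW|<64-hex sha256>
# which is what fetch_filter_metadata_components looks up below.)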
mv tINDEX.new tINDEX.present mv tag.new tag return 0 } # Extract a subset of a downloaded metadata file containing only the parts # which are listed in COMPONENTS. fetch_filter_metadata_components () { METAHASH=`look "$1|" tINDEX.present | cut -f 2 -d '|'` gunzip -c < files/${METAHASH}.gz > $1.all # Fish out the lines belonging to components we care about. for C in ${COMPONENTS}; do look "`echo ${C} | tr '/' '|'`|" $1.all done > $1 # Remove temporary file. rm $1.all } # Generate a filtered version of the metadata file $1 from the downloaded # file, by fishing out the lines corresponding to components we're trying # to keep updated, and then removing lines corresponding to paths we want # to ignore. fetch_filter_metadata () { # Fish out the lines belonging to components we care about. fetch_filter_metadata_components $1 # Canonicalize directory names by removing any trailing / in # order to avoid listing directories multiple times if they # belong to multiple components. Turning "/" into "" doesn't # matter, since we add a leading "/" when we use paths later. cut -f 3- -d '|' $1 | sed -e 's,/|d|,|d|,' | sed -e 's,/|-|,|-|,' | sort -u > $1.tmp # Figure out which lines to ignore and remove them. for X in ${IGNOREPATHS}; do grep -E "^${X}" $1.tmp done | sort -u | comm -13 - $1.tmp > $1 # Remove temporary files. rm $1.tmp } # Filter the metadata file $1 by adding lines with "/boot/$2" # replaced by ${KERNELDIR} (which is `sysctl -n kern.bootfile` minus the # trailing "/kernel"); and if "/boot/$2" does not exist, remove # the original lines which start with that. # Put another way: Deal with the fact that the FOO kernel is sometimes # installed in /boot/FOO/ and is sometimes installed elsewhere. fetch_filter_kernel_names () { grep ^/boot/$2 $1 | sed -e "s,/boot/$2,${KERNELDIR},g" | sort - $1 > $1.tmp mv $1.tmp $1 if ! [ -d /boot/$2 ]; then grep -v ^/boot/$2 $1 > $1.tmp mv $1.tmp $1 fi } # For all paths appearing in $1 or $3, inspect the system # and generate $2 describing what is currently installed. fetch_inspect_system () { # No errors yet... rm -f .err # Tell the user why his disk is suddenly making lots of noise echo -n "Inspecting system... " # Generate list of files to inspect cat $1 $3 | cut -f 1 -d '|' | sort -u > filelist # Examine each file and output lines of the form # /path/to/file|type|device-inum|user|group|perm|flags|value # sorted by device and inode number. while read F; do # If the symlink/file/directory does not exist, record this. if ! [ -e ${BASEDIR}/${F} ]; then echo "${F}|-||||||" continue fi if ! [ -r ${BASEDIR}/${F} ]; then echo "Cannot read file: ${BASEDIR}/${F}" \ >/dev/stderr touch .err return 1 fi # Otherwise, output an index line. if [ -L ${BASEDIR}/${F} ]; then echo -n "${F}|L|" stat -n -f '%d-%i|%u|%g|%Mp%Lp|%Of|' ${BASEDIR}/${F}; readlink ${BASEDIR}/${F}; elif [ -f ${BASEDIR}/${F} ]; then echo -n "${F}|f|" stat -n -f '%d-%i|%u|%g|%Mp%Lp|%Of|' ${BASEDIR}/${F}; sha256 -q ${BASEDIR}/${F}; elif [ -d ${BASEDIR}/${F} ]; then echo -n "${F}|d|" stat -f '%d-%i|%u|%g|%Mp%Lp|%Of|' ${BASEDIR}/${F}; else echo "Unknown file type: ${BASEDIR}/${F}" \ >/dev/stderr touch .err return 1 fi done < filelist | sort -k 3,3 -t '|' > $2.tmp rm filelist # Check if an error occurred during system inspection if [ -f .err ]; then return 1 fi # Convert to the form # /path/to/file|type|user|group|perm|flags|value|hlink # by resolving identical device and inode numbers into hard links. 
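# For illustration (hypothetical inode and hash values): if the scan
# above produced
#     /bin/link|f|116-42|0|0|0555|0|<hash>
#     /bin/ln|f|116-42|0|0|0555|0|<hash>
# the shared device-inum "116-42" causes the second entry to come out as
#     /bin/ln|f|0|0|0555|0|<hash>|/bin/link
# i.e. a hard link to the first path, with the device-inum field dropped
# from both lines.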
cut -f 1,3 -d '|' $2.tmp | sort -k 1,1 -t '|' | sort -s -u -k 2,2 -t '|' | join -1 2 -2 3 -t '|' - $2.tmp | awk -F \| -v OFS=\| \ '{ if (($2 == $3) || ($4 == "-")) print $3,$4,$5,$6,$7,$8,$9,"" else print $3,$4,$5,$6,$7,$8,$9,$2 }' | sort > $2 rm $2.tmp # We're finished looking around echo "done." } # For any paths matching ${MERGECHANGES}, compare $1 and $2 and find any # files which differ; generate $3 containing these paths and the old hashes. fetch_filter_mergechanges () { # Pull out the paths and hashes of the files matching ${MERGECHANGES}. for F in $1 $2; do for X in ${MERGECHANGES}; do grep -E "^${X}" ${F} done | cut -f 1,2,7 -d '|' | sort > ${F}-values done # Any line in $2-values which doesn't appear in $1-values and is a # file means that we should list the path in $3. comm -13 $1-values $2-values | fgrep '|f|' | cut -f 1 -d '|' > $2-paths # For each path, pull out one (and only one!) entry from $1-values. # Note that we cannot distinguish which "old" version the user made # changes to; but hopefully any changes which occur due to security # updates will exist in both the "new" version and the version which # the user has installed, so the merging will still work. while read X; do look "${X}|" $1-values | head -1 done < $2-paths > $3 # Clean up rm $1-values $2-values $2-paths } # For any paths matching ${UPDATEIFUNMODIFIED}, remove lines from $[123] # which correspond to lines in $2 with hashes not matching $1 or $3, unless # the paths are listed in $4. For entries in $2 marked "not present" # (aka. type -), remove lines from $[123] unless there is a corresponding # entry in $1. fetch_filter_unmodified_notpresent () { # Figure out which lines of $1 and $3 correspond to bits which # should only be updated if they haven't changed, and fish out # the (path, type, value) tuples. # NOTE: We don't consider a file to be "modified" if it matches # the hash from $3. for X in ${UPDATEIFUNMODIFIED}; do grep -E "^${X}" $1 grep -E "^${X}" $3 done | cut -f 1,2,7 -d '|' | sort > $1-values # Do the same for $2. for X in ${UPDATEIFUNMODIFIED}; do grep -E "^${X}" $2 done | cut -f 1,2,7 -d '|' | sort > $2-values # Any entry in $2-values which is not in $1-values corresponds to # a path which we need to remove from $1, $2, and $3, unless it # that path appears in $4. comm -13 $1-values $2-values | sort -t '|' -k 1,1 > mlines.tmp cut -f 1 -d '|' $4 | sort | join -v 2 -t '|' - mlines.tmp | sort > mlines rm $1-values $2-values mlines.tmp # Any lines in $2 which are not in $1 AND are "not present" lines # also belong in mlines. comm -13 $1 $2 | cut -f 1,2,7 -d '|' | fgrep '|-|' >> mlines # Remove lines from $1, $2, and $3 for X in $1 $2 $3; do sort -t '|' -k 1,1 ${X} > ${X}.tmp cut -f 1 -d '|' < mlines | sort | join -v 2 -t '|' - ${X}.tmp | sort > ${X} rm ${X}.tmp done # Store a list of the modified files, for future reference fgrep -v '|-|' mlines | cut -f 1 -d '|' > modifiedfiles rm mlines } # For each entry in $1 of type -, remove any corresponding # entry from $2 if ${ALLOWADD} != "yes". Remove all entries # of type - from $1. fetch_filter_allowadd () { cut -f 1,2 -d '|' < $1 | fgrep '|-' | cut -f 1 -d '|' > filesnotpresent if [ ${ALLOWADD} != "yes" ]; then sort < $2 | join -v 1 -t '|' - filesnotpresent | sort > $2.tmp mv $2.tmp $2 fi sort < $1 | join -v 1 -t '|' - filesnotpresent | sort > $1.tmp mv $1.tmp $1 rm filesnotpresent } # If ${ALLOWDELETE} != "yes", then remove any entries from $1 # which don't correspond to entries in $2. 
fetch_filter_allowdelete () { # Produce a lists ${PATH}|${TYPE} for X in $1 $2; do cut -f 1-2 -d '|' < ${X} | sort -u > ${X}.nodes done # Figure out which lines need to be removed from $1. if [ ${ALLOWDELETE} != "yes" ]; then comm -23 $1.nodes $2.nodes > $1.badnodes else : > $1.badnodes fi # Remove the relevant lines from $1 while read X; do look "${X}|" $1 done < $1.badnodes | comm -13 - $1 > $1.tmp mv $1.tmp $1 rm $1.badnodes $1.nodes $2.nodes } # If ${KEEPMODIFIEDMETADATA} == "yes", then for each entry in $2 # with metadata not matching any entry in $1, replace the corresponding # line of $3 with one having the same metadata as the entry in $2. fetch_filter_modified_metadata () { # Fish out the metadata from $1 and $2 for X in $1 $2; do cut -f 1-6 -d '|' < ${X} > ${X}.metadata done # Find the metadata we need to keep if [ ${KEEPMODIFIEDMETADATA} = "yes" ]; then comm -13 $1.metadata $2.metadata > keepmeta else : > keepmeta fi # Extract the lines which we need to remove from $3, and # construct the lines which we need to add to $3. : > $3.remove : > $3.add while read LINE; do NODE=`echo "${LINE}" | cut -f 1-2 -d '|'` look "${NODE}|" $3 >> $3.remove look "${NODE}|" $3 | cut -f 7- -d '|' | lam -s "${LINE}|" - >> $3.add done < keepmeta # Remove the specified lines and add the new lines. sort $3.remove | comm -13 - $3 | sort -u - $3.add > $3.tmp mv $3.tmp $3 rm keepmeta $1.metadata $2.metadata $3.add $3.remove } # Remove lines from $1 and $2 which are identical; # no need to update a file if it isn't changing. fetch_filter_uptodate () { comm -23 $1 $2 > $1.tmp comm -13 $1 $2 > $2.tmp mv $1.tmp $1 mv $2.tmp $2 } # Fetch any "clean" old versions of files we need for merging changes. fetch_files_premerge () { # We only need to do anything if $1 is non-empty. if [ -s $1 ]; then # Tell the user what we're doing echo -n "Fetching files from ${OLDRELNUM} for merging... " # List of files wanted fgrep '|f|' < $1 | cut -f 3 -d '|' | sort -u > files.wanted # Only fetch the files we don't already have while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done < files.wanted > filelist # Actually fetch them lam -s "${OLDFETCHDIR}/f/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${QUIETREDIR} # Make sure we got them all, and move them into /files/ while read Y; do if ! [ -f ${Y}.gz ]; then echo "failed." return 1 fi if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "${Y} has incorrect hash." return 1 fi done < filelist echo "done." # Clean up rm filelist files.wanted fi } # Prepare to fetch files: Generate a list of the files we need, # copy the unmodified files we have into /files/, and generate # a list of patches to download. fetch_files_prepare () { # Tell the user why his disk is suddenly making lots of noise echo -n "Preparing to download files... " # Reduce indices to ${PATH}|${HASH} pairs for X in $1 $2 $3; do cut -f 1,2,7 -d '|' < ${X} | fgrep '|f|' | cut -f 1,3 -d '|' | sort > ${X}.hashes done # List of files wanted cut -f 2 -d '|' < $3.hashes | sort -u | while read HASH; do if ! [ -f files/${HASH}.gz ]; then echo ${HASH} fi done > files.wanted # Generate a list of unmodified files comm -12 $1.hashes $2.hashes | sort -k 1,1 -t '|' > unmodified.files # Copy all files into /files/. We only need the unmodified files # for use in patching; but we'll want all of them if the user asks # to rollback the updates later. 
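# (Sketch of how the stored copies are consumed later in fetch_files,
# using hypothetical hashes OLD and NEW: a downloaded patch named
# "OLD-NEW" is applied as
#     gunzip -c < files/OLD.gz > OLD
#     bspatch OLD NEW OLD-NEW
# and NEW is kept only if its sha256 matches the expected new hash.)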
while read LINE; do F=`echo "${LINE}" | cut -f 1 -d '|'` HASH=`echo "${LINE}" | cut -f 2 -d '|'` # Skip files we already have. if [ -f files/${HASH}.gz ]; then continue fi # Make sure the file hasn't changed. cp "${BASEDIR}/${F}" tmpfile if [ `sha256 -q tmpfile` != ${HASH} ]; then echo echo "File changed while FreeBSD Update running: ${F}" return 1 fi # Place the file into storage. gzip -c < tmpfile > files/${HASH}.gz rm tmpfile done < $2.hashes # Produce a list of patches to download sort -k 1,1 -t '|' $3.hashes | join -t '|' -o 2.2,1.2 - unmodified.files | fetch_make_patchlist > patchlist # Garbage collect rm unmodified.files $1.hashes $2.hashes $3.hashes # We don't need the list of possible old files any more. rm $1 # We're finished making noise echo "done." } # Fetch files. fetch_files () { # Attempt to fetch patches if [ -s patchlist ]; then echo -n "Fetching `wc -l < patchlist | tr -d ' '` " echo ${NDEBUG} "patches.${DDSTATS}" tr '|' '-' < patchlist | lam -s "${PATCHDIR}/" - | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress echo "done." # Attempt to apply patches echo -n "Applying patches... " tr '|' ' ' < patchlist | while read X Y; do if [ ! -f "${X}-${Y}" ]; then continue; fi gunzip -c < files/${X}.gz > OLD bspatch OLD NEW ${X}-${Y} if [ `${SHA256} -q NEW` = ${Y} ]; then mv NEW files/${Y} gzip -n files/${Y} fi rm -f diff OLD NEW ${X}-${Y} done 2>${QUIETREDIR} echo "done." fi # Download files which couldn't be generate via patching while read Y; do if [ ! -f "files/${Y}.gz" ]; then echo ${Y}; fi done < files.wanted > filelist if [ -s filelist ]; then echo -n "Fetching `wc -l < filelist | tr -d ' '` " echo ${NDEBUG} "files... " lam -s "${FETCHDIR}/f/" - -s ".gz" < filelist | xargs ${XARGST} ${PHTTPGET} ${SERVERNAME} \ 2>${STATSREDIR} | fetch_progress while read Y; do if ! [ -f ${Y}.gz ]; then echo "failed." return 1 fi if [ `gunzip -c < ${Y}.gz | ${SHA256} -q` = ${Y} ]; then mv ${Y}.gz files/${Y}.gz else echo "${Y} has incorrect hash." return 1 fi done < filelist echo "done." fi # Clean up rm files.wanted filelist patchlist } # Create and populate install manifest directory; and report what updates # are available. fetch_create_manifest () { # If we have an existing install manifest, nuke it. if [ -L "${BDHASH}-install" ]; then rm -r ${BDHASH}-install/ rm ${BDHASH}-install fi # Report to the user if any updates were avoided due to local changes if [ -s modifiedfiles ]; then cat - modifiedfiles <<- EOF | ${PAGER} The following files are affected by updates. No changes have been downloaded, however, because the files have been modified locally: EOF fi rm modifiedfiles # If no files will be updated, tell the user and exit if ! [ -s INDEX-PRESENT ] && ! [ -s INDEX-NEW ]; then rm INDEX-PRESENT INDEX-NEW echo echo -n "No updates needed to update system to " echo "${RELNUM}-p${RELPATCHNUM}." return fi # Divide files into (a) removed files, (b) added files, and # (c) updated files. 
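# (For reference: comm -23 keeps paths appearing only in INDEX-PRESENT,
# i.e. files to be removed; comm -13 keeps paths appearing only in
# INDEX-NEW, i.e. files to be added; and comm -12 keeps paths appearing
# in both, i.e. files to be updated.)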
cut -f 1 -d '|' < INDEX-PRESENT | sort > INDEX-PRESENT.flist cut -f 1 -d '|' < INDEX-NEW | sort > INDEX-NEW.flist comm -23 INDEX-PRESENT.flist INDEX-NEW.flist > files.removed comm -13 INDEX-PRESENT.flist INDEX-NEW.flist > files.added comm -12 INDEX-PRESENT.flist INDEX-NEW.flist > files.updated rm INDEX-PRESENT.flist INDEX-NEW.flist # Report removed files, if any if [ -s files.removed ]; then cat - files.removed <<- EOF | ${PAGER} The following files will be removed as part of updating to ${RELNUM}-p${RELPATCHNUM}: EOF fi rm files.removed # Report added files, if any if [ -s files.added ]; then cat - files.added <<- EOF | ${PAGER} The following files will be added as part of updating to ${RELNUM}-p${RELPATCHNUM}: EOF fi rm files.added # Report updated files, if any if [ -s files.updated ]; then cat - files.updated <<- EOF | ${PAGER} The following files will be updated as part of updating to ${RELNUM}-p${RELPATCHNUM}: EOF fi rm files.updated # Create a directory for the install manifest. MDIR=`mktemp -d install.XXXXXX` || return 1 # Populate it mv INDEX-PRESENT ${MDIR}/INDEX-OLD mv INDEX-NEW ${MDIR}/INDEX-NEW # Link it into place ln -s ${MDIR} ${BDHASH}-install } # Warn about any upcoming EoL fetch_warn_eol () { # What's the current time? NOWTIME=`date "+%s"` # When did we last warn about the EoL date? if [ -f lasteolwarn ]; then LASTWARN=`cat lasteolwarn` else LASTWARN=`expr ${NOWTIME} - 63072000` fi # If the EoL time is past, warn. if [ ${EOLTIME} -lt ${NOWTIME} ]; then echo cat <<-EOF WARNING: `uname -sr` HAS PASSED ITS END-OF-LIFE DATE. Any security issues discovered after `date -r ${EOLTIME}` will not have been corrected. EOF return 1 fi # Figure out how long it has been since we last warned about the # upcoming EoL, and how much longer we have left. SINCEWARN=`expr ${NOWTIME} - ${LASTWARN}` TIMELEFT=`expr ${EOLTIME} - ${NOWTIME}` # Don't warn if the EoL is more than 3 months away if [ ${TIMELEFT} -gt 7884000 ]; then return 0 fi # Don't warn if the time remaining is more than 3 times the time # since the last warning. if [ ${TIMELEFT} -gt `expr ${SINCEWARN} \* 3` ]; then return 0 fi # Figure out what time units to use. if [ ${TIMELEFT} -lt 604800 ]; then UNIT="day" SIZE=86400 elif [ ${TIMELEFT} -lt 2678400 ]; then UNIT="week" SIZE=604800 else UNIT="month" SIZE=2678400 fi # Compute the right number of units NUM=`expr ${TIMELEFT} / ${SIZE}` if [ ${NUM} != 1 ]; then UNIT="${UNIT}s" fi # Print the warning echo cat <<-EOF WARNING: `uname -sr` is approaching its End-of-Life date. It is strongly recommended that you upgrade to a newer release within the next ${NUM} ${UNIT}. EOF # Update the stored time of last warning echo ${NOWTIME} > lasteolwarn } # Do the actual work involved in "fetch" / "cron". fetch_run () { workdir_init || return 1 # Prepare the mirror list. fetch_pick_server_init && fetch_pick_server # Try to fetch the public key until we run out of servers. while ! fetch_key; do fetch_pick_server || return 1 done # Try to fetch the metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done fetch_tagsanity || return 1 # Fetch the latest INDEX-NEW and INDEX-OLD files. fetch_metadata INDEX-NEW INDEX-OLD || return 1 # Generate filtered INDEX-NEW and INDEX-OLD files containing only # the lines which (a) belong to components we care about, and (b) # don't correspond to paths we're explicitly ignoring. 
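# (Illustrative freebsd-update.conf lines feeding this step, assuming
# the stock option names; "Components" populates ${COMPONENTS} and
# "IgnorePaths" populates ${IGNOREPATHS}:
#     Components src world kernel
#     IgnorePaths /usr/src
# )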
fetch_filter_metadata INDEX-NEW || return 1 fetch_filter_metadata INDEX-OLD || return 1 # Translate /boot/${KERNCONF} into ${KERNELDIR} fetch_filter_kernel_names INDEX-NEW ${KERNCONF} fetch_filter_kernel_names INDEX-OLD ${KERNCONF} # For all paths appearing in INDEX-OLD or INDEX-NEW, inspect the # system and generate an INDEX-PRESENT file. fetch_inspect_system INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Based on ${UPDATEIFUNMODIFIED}, remove lines from INDEX-* which # correspond to lines in INDEX-PRESENT with hashes not appearing # in INDEX-OLD or INDEX-NEW. Also remove lines where the entry in # INDEX-PRESENT has type - and there isn't a corresponding entry in # INDEX-OLD with type -. fetch_filter_unmodified_notpresent \ INDEX-OLD INDEX-PRESENT INDEX-NEW /dev/null # For each entry in INDEX-PRESENT of type -, remove any corresponding # entry from INDEX-NEW if ${ALLOWADD} != "yes". Remove all entries # of type - from INDEX-PRESENT. fetch_filter_allowadd INDEX-PRESENT INDEX-NEW # If ${ALLOWDELETE} != "yes", then remove any entries from # INDEX-PRESENT which don't correspond to entries in INDEX-NEW. fetch_filter_allowdelete INDEX-PRESENT INDEX-NEW # If ${KEEPMODIFIEDMETADATA} == "yes", then for each entry in # INDEX-PRESENT with metadata not matching any entry in INDEX-OLD, # replace the corresponding line of INDEX-NEW with one having the # same metadata as the entry in INDEX-PRESENT. fetch_filter_modified_metadata INDEX-OLD INDEX-PRESENT INDEX-NEW # Remove lines from INDEX-PRESENT and INDEX-NEW which are identical; # no need to update a file if it isn't changing. fetch_filter_uptodate INDEX-PRESENT INDEX-NEW # Prepare to fetch files: Generate a list of the files we need, # copy the unmodified files we have into /files/, and generate # a list of patches to download. fetch_files_prepare INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Fetch files. fetch_files || return 1 # Create and populate install manifest directory; and report what # updates are available. fetch_create_manifest || return 1 # Warn about any upcoming EoL fetch_warn_eol || return 1 } # If StrictComponents is not "yes", generate a new components list # with only the components which appear to be installed. upgrade_guess_components () { if [ "${STRICTCOMPONENTS}" = "no" ]; then # Generate filtered INDEX-ALL with only the components listed # in COMPONENTS. fetch_filter_metadata_components $1 || return 1 # Tell the user why his disk is suddenly making lots of noise echo -n "Inspecting system... " # Look at the files on disk, and assume that a component is # supposed to be present if it is more than half-present. cut -f 1-3 -d '|' < INDEX-ALL | tr '|' ' ' | while read C S F; do if [ -e ${BASEDIR}/${F} ]; then echo "+ ${C}|${S}" fi echo "= ${C}|${S}" done | sort | uniq -c | sed -E 's,^ +,,' > compfreq grep ' = ' compfreq | cut -f 1,3 -d ' ' | sort -k 2,2 -t ' ' > compfreq.total grep ' + ' compfreq | cut -f 1,3 -d ' ' | sort -k 2,2 -t ' ' > compfreq.present join -t ' ' -1 2 -2 2 compfreq.present compfreq.total | while read S P T; do if [ ${T} -ne 0 -a ${P} -gt `expr ${T} / 2` ]; then echo ${S} fi done > comp.present cut -f 2 -d ' ' < compfreq.total > comp.total rm INDEX-ALL compfreq compfreq.total compfreq.present # We're done making noise. echo "done." # Sometimes the kernel isn't installed where INDEX-ALL # thinks that it should be: In particular, it is often in # /boot/kernel instead of /boot/GENERIC or /boot/SMP. 
To # deal with this, if "kernel|X" is listed in comp.total # (i.e., is a component which would be upgraded if it is # found to be present) we will add it to comp.present. # If "kernel|" is in comp.total but "kernel|X" is # not, we print a warning -- the user is running a kernel # which isn't part of the release. KCOMP=`echo ${KERNCONF} | tr 'A-Z' 'a-z'` grep -E "^kernel\|${KCOMP}\$" comp.total >> comp.present if grep -qE "^kernel\|" comp.total && ! grep -qE "^kernel\|${KCOMP}\$" comp.total; then cat <<-EOF WARNING: This system is running a "${KCOMP}" kernel, which is not a kernel configuration distributed as part of FreeBSD ${RELNUM}. This kernel will not be updated: you MUST update the kernel manually before running "$0 install". EOF fi # Re-sort the list of installed components and generate # the list of non-installed components. sort -u < comp.present > comp.present.tmp mv comp.present.tmp comp.present comm -13 comp.present comp.total > comp.absent # Ask the user to confirm that what we have is correct. To # reduce user confusion, translate "X|Y" back to "X/Y" (as # subcomponents must be listed in the configuration file). echo echo -n "The following components of FreeBSD " echo "seem to be installed:" tr '|' '/' < comp.present | fmt -72 echo echo -n "The following components of FreeBSD " echo "do not seem to be installed:" tr '|' '/' < comp.absent | fmt -72 echo continuep || return 1 echo # Suck the generated list of components into ${COMPONENTS}. # Note that comp.present.tmp is used due to issues with # pipelines and setting variables. COMPONENTS="" tr '|' '/' < comp.present > comp.present.tmp while read C; do COMPONENTS="${COMPONENTS} ${C}" done < comp.present.tmp # Delete temporary files rm comp.present comp.present.tmp comp.absent comp.total fi } # If StrictComponents is not "yes", COMPONENTS contains an entry # corresponding to the currently running kernel, and said kernel # does not exist in the new release, add "kernel/generic" to the # list of components. upgrade_guess_new_kernel () { if [ "${STRICTCOMPONENTS}" = "no" ]; then # Grab the unfiltered metadata file. METAHASH=`look "$1|" tINDEX.present | cut -f 2 -d '|'` gunzip -c < files/${METAHASH}.gz > $1.all # If "kernel/${KCOMP}" is in ${COMPONENTS} and that component # isn't in $1.all, we need to add kernel/generic. for C in ${COMPONENTS}; do if [ ${C} = "kernel/${KCOMP}" ] && ! grep -qE "^kernel\|${KCOMP}\|" $1.all; then COMPONENTS="${COMPONENTS} kernel/generic" NKERNCONF="GENERIC" cat <<-EOF WARNING: This system is running a "${KCOMP}" kernel, which is not a kernel configuration distributed as part of FreeBSD ${RELNUM}. As part of upgrading to FreeBSD ${RELNUM}, this kernel will be replaced with a "generic" kernel. EOF continuep || return 1 fi done # Don't need this any more... rm $1.all fi } # Convert INDEX-OLD (last release) and INDEX-ALL (new release) into # INDEX-OLD and INDEX-NEW files (in the sense of normal upgrades). upgrade_oldall_to_oldnew () { # For each ${F}|... which appears in INDEX-ALL but does not appear # in INDEX-OLD, add ${F}|-|||||| to INDEX-OLD. cut -f 1 -d '|' < $1 | sort -u > $1.paths cut -f 1 -d '|' < $2 | sort -u | comm -13 $1.paths - | lam - -s "|-||||||" | sort - $1 > $1.tmp mv $1.tmp $1 # Remove lines from INDEX-OLD which also appear in INDEX-ALL comm -23 $1 $2 > $1.tmp mv $1.tmp $1 # Remove lines from INDEX-ALL which have a file name not appearing # anywhere in INDEX-OLD (since these must be files which haven't # changed -- if they were new, there would be an entry of type "-"). 
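# (For illustration: a synthetic "not present in the old release" entry
# added in the first step above looks like a hypothetical
#     /usr/bin/newtool|-||||||
# -- type "-" with every other field empty -- which is why genuinely new
# files always keep their INDEX-ALL lines here.)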
cut -f 1 -d '|' < $1 | sort -u > $1.paths sort -k 1,1 -t '|' < $2 | join -t '|' - $1.paths | sort > $2.tmp rm $1.paths mv $2.tmp $2 # Rename INDEX-ALL to INDEX-NEW. mv $2 $3 } # Helper for upgrade_merge: Return zero true iff the two files differ only # in the contents of their RCS tags. samef () { X=`sed -E 's/\\$FreeBSD.*\\$/\$FreeBSD\$/' < $1 | ${SHA256}` Y=`sed -E 's/\\$FreeBSD.*\\$/\$FreeBSD\$/' < $2 | ${SHA256}` if [ $X = $Y ]; then return 0; else return 1; fi } # From the list of "old" files in $1, merge changes in $2 with those in $3, # and update $3 to reflect the hashes of merged files. upgrade_merge () { # We only need to do anything if $1 is non-empty. if [ -s $1 ]; then cut -f 1 -d '|' $1 | sort > $1-paths # Create staging area for merging files rm -rf merge/ while read F; do D=`dirname ${F}` mkdir -p merge/old/${D} mkdir -p merge/${OLDRELNUM}/${D} mkdir -p merge/${RELNUM}/${D} mkdir -p merge/new/${D} done < $1-paths # Copy in files while read F; do # Currently installed file V=`look "${F}|" $2 | cut -f 7 -d '|'` gunzip < files/${V}.gz > merge/old/${F} # Old release if look "${F}|" $1 | fgrep -q "|f|"; then V=`look "${F}|" $1 | cut -f 3 -d '|'` gunzip < files/${V}.gz \ > merge/${OLDRELNUM}/${F} fi # New release if look "${F}|" $3 | cut -f 1,2,7 -d '|' | fgrep -q "|f|"; then V=`look "${F}|" $3 | cut -f 7 -d '|'` gunzip < files/${V}.gz \ > merge/${RELNUM}/${F} fi done < $1-paths # Attempt to automatically merge changes echo -n "Attempting to automatically merge " echo -n "changes in files..." : > failed.merges while read F; do # If the file doesn't exist in the new release, # the result of "merging changes" is having the file # not exist. if ! [ -f merge/${RELNUM}/${F} ]; then continue fi # If the file didn't exist in the old release, we're # going to throw away the existing file and hope that # the version from the new release is what we want. if ! [ -f merge/${OLDRELNUM}/${F} ]; then cp merge/${RELNUM}/${F} merge/new/${F} continue fi # Some files need special treatment. case ${F} in /etc/spwd.db | /etc/pwd.db | /etc/login.conf.db) # Don't merge these -- we're rebuild them # after updates are installed. cp merge/old/${F} merge/new/${F} ;; *) if ! diff3 -E -m -L "current version" \ -L "${OLDRELNUM}" -L "${RELNUM}" \ merge/old/${F} \ merge/${OLDRELNUM}/${F} \ merge/${RELNUM}/${F} \ > merge/new/${F} 2>/dev/null; then echo ${F} >> failed.merges fi ;; esac done < $1-paths echo " done." # Ask the user to handle any files which didn't merge. while read F; do # If the installed file differs from the version in # the old release only due to RCS tag expansion # then just use the version in the new release. if samef merge/old/${F} merge/${OLDRELNUM}/${F}; then cp merge/${RELNUM}/${F} merge/new/${F} continue fi cat <<-EOF The following file could not be merged automatically: ${F} Press Enter to edit this file in ${EDITOR} and resolve the conflicts manually... EOF read dummy files/${V}.gz echo "${F}|${V}" fi done < $1-paths > newhashes # Pull lines out from $3 which need to be updated to # reflect merged files. while read F; do look "${F}|" $3 done < $1-paths > $3-oldlines # Update lines to reflect merged files join -t '|' -o 1.1,1.2,1.3,1.4,1.5,1.6,2.2,1.8 \ $3-oldlines newhashes > $3-newlines # Remove old lines from $3 and add new lines. sort $3-oldlines | comm -13 - $3 | sort - $3-newlines > $3.tmp mv $3.tmp $3 # Clean up rm $1-paths newhashes $3-oldlines $3-newlines rm -rf merge/ fi # We're done with merging files. 
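# (Note on the merge pass above: when diff3 cannot merge a file cleanly,
# the copy opened in ${EDITOR} contains conflict markers of roughly this
# form, built from the -L labels, with "12.1-RELEASE" standing in for
# whatever ${RELNUM} happens to be:
#     <<<<<<< current version
#     ...locally installed text...
#     =======
#     ...text from the new release...
#     >>>>>>> 12.1-RELEASE
# )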
rm $1 } # Do the work involved in fetching upgrades to a new release upgrade_run () { workdir_init || return 1 # Prepare the mirror list. fetch_pick_server_init && fetch_pick_server # Try to fetch the public key until we run out of servers. while ! fetch_key; do fetch_pick_server || return 1 done # Try to fetch the metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done fetch_tagsanity || return 1 # Fetch the INDEX-OLD and INDEX-ALL. fetch_metadata INDEX-OLD INDEX-ALL || return 1 # If StrictComponents is not "yes", generate a new components list # with only the components which appear to be installed. upgrade_guess_components INDEX-ALL || return 1 # Generate filtered INDEX-OLD and INDEX-ALL files containing only # the components we want and without anything marked as "Ignore". fetch_filter_metadata INDEX-OLD || return 1 fetch_filter_metadata INDEX-ALL || return 1 # Merge the INDEX-OLD and INDEX-ALL files into INDEX-OLD. sort INDEX-OLD INDEX-ALL > INDEX-OLD.tmp mv INDEX-OLD.tmp INDEX-OLD rm INDEX-ALL # Adjust variables for fetching files from the new release. OLDRELNUM=${RELNUM} RELNUM=${TARGETRELEASE} OLDFETCHDIR=${FETCHDIR} FETCHDIR=${RELNUM}/${ARCH} # Try to fetch the NEW metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done # Fetch the new INDEX-ALL. fetch_metadata INDEX-ALL || return 1 # If StrictComponents is not "yes", COMPONENTS contains an entry # corresponding to the currently running kernel, and said kernel # does not exist in the new release, add "kernel/generic" to the # list of components. upgrade_guess_new_kernel INDEX-ALL || return 1 # Filter INDEX-ALL to contain only the components we want and without # anything marked as "Ignore". fetch_filter_metadata INDEX-ALL || return 1 # Convert INDEX-OLD (last release) and INDEX-ALL (new release) into # INDEX-OLD and INDEX-NEW files (in the sense of normal upgrades). upgrade_oldall_to_oldnew INDEX-OLD INDEX-ALL INDEX-NEW # Translate /boot/${KERNCONF} or /boot/${NKERNCONF} into ${KERNELDIR} fetch_filter_kernel_names INDEX-NEW ${NKERNCONF} fetch_filter_kernel_names INDEX-OLD ${KERNCONF} # For all paths appearing in INDEX-OLD or INDEX-NEW, inspect the # system and generate an INDEX-PRESENT file. fetch_inspect_system INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Based on ${MERGECHANGES}, generate a file tomerge-old with the # paths and hashes of old versions of files to merge. fetch_filter_mergechanges INDEX-OLD INDEX-PRESENT tomerge-old # Based on ${UPDATEIFUNMODIFIED}, remove lines from INDEX-* which # correspond to lines in INDEX-PRESENT with hashes not appearing # in INDEX-OLD or INDEX-NEW. Also remove lines where the entry in # INDEX-PRESENT has type - and there isn't a corresponding entry in # INDEX-OLD with type -. fetch_filter_unmodified_notpresent \ INDEX-OLD INDEX-PRESENT INDEX-NEW tomerge-old # For each entry in INDEX-PRESENT of type -, remove any corresponding # entry from INDEX-NEW if ${ALLOWADD} != "yes". Remove all entries # of type - from INDEX-PRESENT. fetch_filter_allowadd INDEX-PRESENT INDEX-NEW # If ${ALLOWDELETE} != "yes", then remove any entries from # INDEX-PRESENT which don't correspond to entries in INDEX-NEW. 
fetch_filter_allowdelete INDEX-PRESENT INDEX-NEW # If ${KEEPMODIFIEDMETADATA} == "yes", then for each entry in # INDEX-PRESENT with metadata not matching any entry in INDEX-OLD, # replace the corresponding line of INDEX-NEW with one having the # same metadata as the entry in INDEX-PRESENT. fetch_filter_modified_metadata INDEX-OLD INDEX-PRESENT INDEX-NEW # Remove lines from INDEX-PRESENT and INDEX-NEW which are identical; # no need to update a file if it isn't changing. fetch_filter_uptodate INDEX-PRESENT INDEX-NEW # Fetch "clean" files from the old release for merging changes. fetch_files_premerge tomerge-old # Prepare to fetch files: Generate a list of the files we need, # copy the unmodified files we have into /files/, and generate # a list of patches to download. fetch_files_prepare INDEX-OLD INDEX-PRESENT INDEX-NEW || return 1 # Fetch patches from to-${RELNUM}/${ARCH}/bp/ PATCHDIR=to-${RELNUM}/${ARCH}/bp fetch_files || return 1 # Merge configuration file changes. upgrade_merge tomerge-old INDEX-PRESENT INDEX-NEW || return 1 # Create and populate install manifest directory; and report what # updates are available. fetch_create_manifest || return 1 # Leave a note behind to tell the "install" command that the kernel # needs to be installed before the world. touch ${BDHASH}-install/kernelfirst # Remind the user that they need to run "freebsd-update install" # to install the downloaded bits, in case they didn't RTFM. echo "To install the downloaded upgrades, run \"$0 install\"." } # Make sure that all the file hashes mentioned in $@ have corresponding # gzipped files stored in /files/. install_verify () { # Generate a list of hashes cat $@ | cut -f 2,7 -d '|' | grep -E '^f' | cut -f 2 -d '|' | sort -u > filelist # Make sure all the hashes exist while read HASH; do if ! [ -f files/${HASH}.gz ]; then echo -n "Update files missing -- " echo "this should never happen." echo "Re-run '$0 fetch'." return 1 fi done < filelist # Clean up rm filelist } # Remove the system immutable flag from files install_unschg () { # Generate file list cat $@ | cut -f 1 -d '|' > filelist # Remove flags while read F; do if ! [ -e ${BASEDIR}/${F} ]; then continue else echo ${BASEDIR}/${F} fi done < filelist | xargs chflags noschg || return 1 # Clean up rm filelist } # Decide which directory name to use for kernel backups. backup_kernel_finddir () { CNT=0 while true ; do # Pathname does not exist, so it is OK use that name # for backup directory. if [ ! -e $BASEDIR/$BACKUPKERNELDIR ]; then return 0 fi # If directory do exist, we only use if it has our # marker file. if [ -d $BASEDIR/$BACKUPKERNELDIR -a \ -e $BASEDIR/$BACKUPKERNELDIR/.freebsd-update ]; then return 0 fi # We could not use current directory name, so add counter to # the end and try again. CNT=$((CNT + 1)) if [ $CNT -gt 9 ]; then echo "Could not find valid backup dir ($BASEDIR/$BACKUPKERNELDIR)" exit 1 fi BACKUPKERNELDIR="`echo $BACKUPKERNELDIR | sed -Ee 's/[0-9]\$//'`" BACKUPKERNELDIR="${BACKUPKERNELDIR}${CNT}" done } # Backup the current kernel using hardlinks, if not disabled by user. # Since we delete all files in the directory used for previous backups # we create a marker file called ".freebsd-update" in the directory so # we can determine on the next run that the directory was created by # freebsd-update and we then do not accidentally remove user files in # the unlikely case that the user has created a directory with a # conflicting name. backup_kernel () { # Only make kernel backup is so configured. 
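# (For reference: with the defaults set by default_params above, this
# hard-links the running kernel directory ${BASEDIR}/${KERNELDIR} into
# ${BASEDIR}/boot/kernel.old, skipping symbol files. The corresponding
# freebsd-update.conf options, assuming the stock names, are:
#     BackupKernel yes
#     BackupKernelDir /boot/kernel.old
#     BackupKernelSymbolFiles no
# )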
if [ $BACKUPKERNEL != yes ]; then return 0 fi # Decide which directory name to use for kernel backups. backup_kernel_finddir # Remove old kernel backup files. If $BACKUPKERNELDIR was # "not ours", backup_kernel_finddir would have exited, so # deleting the directory content is as safe as we can make it. if [ -d $BASEDIR/$BACKUPKERNELDIR ]; then rm -fr $BASEDIR/$BACKUPKERNELDIR fi # Create directories for backup. mkdir -p $BASEDIR/$BACKUPKERNELDIR mtree -cdn -p "${BASEDIR}/${KERNELDIR}" | \ mtree -Ue -p "${BASEDIR}/${BACKUPKERNELDIR}" > /dev/null # Mark the directory as having been created by freebsd-update. touch $BASEDIR/$BACKUPKERNELDIR/.freebsd-update if [ $? -ne 0 ]; then echo "Could not create kernel backup directory" exit 1 fi # Disable pathname expansion to be sure *.symbols is not # expanded. set -f # Use find to ignore symbol files, unless disabled by user. if [ $BACKUPKERNELSYMBOLFILES = yes ]; then FINDFILTER="" else FINDFILTER="-a ! -name *.debug -a ! -name *.symbols" fi # Backup all the kernel files using hardlinks. (cd ${BASEDIR}/${KERNELDIR} && find . -type f $FINDFILTER -exec \ cp -pl '{}' ${BASEDIR}/${BACKUPKERNELDIR}/'{}' \;) # Re-enable patchname expansion. set +f } # Install new files install_from_index () { # First pass: Do everything apart from setting file flags. We # can't set flags yet, because schg inhibits hard linking. sort -k 1,1 -t '|' $1 | tr '|' ' ' | while read FPATH TYPE OWNER GROUP PERM FLAGS HASH LINK; do case ${TYPE} in d) # Create a directory install -d -o ${OWNER} -g ${GROUP} \ -m ${PERM} ${BASEDIR}/${FPATH} ;; f) if [ -z "${LINK}" ]; then # Create a file, without setting flags. gunzip < files/${HASH}.gz > ${HASH} install -S -o ${OWNER} -g ${GROUP} \ -m ${PERM} ${HASH} ${BASEDIR}/${FPATH} rm ${HASH} else # Create a hard link. ln -f ${BASEDIR}/${LINK} ${BASEDIR}/${FPATH} fi ;; L) # Create a symlink ln -sfh ${HASH} ${BASEDIR}/${FPATH} ;; esac done # Perform a second pass, adding file flags. tr '|' ' ' < $1 | while read FPATH TYPE OWNER GROUP PERM FLAGS HASH LINK; do if [ ${TYPE} = "f" ] && ! [ ${FLAGS} = "0" ]; then chflags ${FLAGS} ${BASEDIR}/${FPATH} fi done } # Remove files which we want to delete install_delete () { # Generate list of new files cut -f 1 -d '|' < $2 | sort > newfiles # Generate subindex of old files we want to nuke sort -k 1,1 -t '|' $1 | join -t '|' -v 1 - newfiles | sort -r -k 1,1 -t '|' | cut -f 1,2 -d '|' | tr '|' ' ' > killfiles # Remove the offending bits while read FPATH TYPE; do case ${TYPE} in d) rmdir ${BASEDIR}/${FPATH} ;; f) rm ${BASEDIR}/${FPATH} ;; L) rm ${BASEDIR}/${FPATH} ;; esac done < killfiles # Clean up rm newfiles killfiles } # Install new files, delete old files, and update linker.hints install_files () { # If we haven't already dealt with the kernel, deal with it. if ! [ -f $1/kerneldone ]; then grep -E '^/boot/' $1/INDEX-OLD > INDEX-OLD grep -E '^/boot/' $1/INDEX-NEW > INDEX-NEW # Backup current kernel before installing a new one backup_kernel || return 1 # Install new files install_from_index INDEX-NEW || return 1 # Remove files which need to be deleted install_delete INDEX-OLD INDEX-NEW || return 1 # Update linker.hints if necessary if [ -s INDEX-OLD -o -s INDEX-NEW ]; then kldxref -R ${BASEDIR}/boot/ 2>/dev/null fi # We've finished updating the kernel. touch $1/kerneldone # Do we need to ask for a reboot now? if [ -f $1/kernelfirst ] && [ -s INDEX-OLD -o -s INDEX-NEW ]; then cat <<-EOF Kernel updates have been installed. Please reboot and run "$0 install" again to finish installing updates. 
EOF exit 0 fi fi # If we haven't already dealt with the world, deal with it. if ! [ -f $1/worlddone ]; then # Create any necessary directories first grep -vE '^/boot/' $1/INDEX-NEW | grep -E '^[^|]+\|d\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 # Install new runtime linker grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -E '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 # Install new shared libraries next grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -vE '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 # Deal with everything else grep -vE '^/boot/' $1/INDEX-OLD | grep -vE '^[^|]+\|d\|' | grep -vE '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' | grep -vE '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-OLD grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -vE '^/libexec/ld-elf[^|]*\.so\.[0-9]+\|' | grep -vE '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-NEW || return 1 install_delete INDEX-OLD INDEX-NEW || return 1 # Rebuild generated pwd files. if [ ${BASEDIR}/etc/master.passwd -nt ${BASEDIR}/etc/spwd.db ] || [ ${BASEDIR}/etc/master.passwd -nt ${BASEDIR}/etc/pwd.db ] || [ ${BASEDIR}/etc/master.passwd -nt ${BASEDIR}/etc/passwd ]; then pwd_mkdb -d ${BASEDIR}/etc -p ${BASEDIR}/etc/master.passwd fi # Rebuild /etc/login.conf.db if necessary. if [ ${BASEDIR}/etc/login.conf -nt ${BASEDIR}/etc/login.conf.db ]; then cap_mkdb ${BASEDIR}/etc/login.conf fi # Rebuild man page databases, if necessary. for D in /usr/share/man /usr/share/openssl/man; do if [ ! -d ${BASEDIR}/$D ]; then continue fi if [ -z "$(find ${BASEDIR}/$D -type f -newer ${BASEDIR}/$D/mandoc.db)" ]; then continue; fi makewhatis ${BASEDIR}/$D done # We've finished installing the world and deleting old files # which are not shared libraries. touch $1/worlddone # Do we need to ask the user to portupgrade now? grep -vE '^/boot/' $1/INDEX-NEW | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' | cut -f 1 -d '|' | sort > newfiles if grep -vE '^/boot/' $1/INDEX-OLD | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' | cut -f 1 -d '|' | sort | join -v 1 - newfiles | grep -q .; then cat <<-EOF Completing this upgrade requires removing old shared object files. Please rebuild all installed 3rd party software (e.g., programs installed from the ports tree) and then run "$0 install" again to finish installing updates. EOF rm newfiles exit 0 fi rm newfiles fi # Remove old shared libraries grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '^[^|]+\|d\|' | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-NEW grep -vE '^/boot/' $1/INDEX-OLD | grep -vE '^[^|]+\|d\|' | grep -E '^[^|]*/lib/[^|]*\.so\.[0-9]+\|' > INDEX-OLD install_delete INDEX-OLD INDEX-NEW || return 1 # Remove old directories grep -vE '^/boot/' $1/INDEX-NEW | grep -E '^[^|]+\|d\|' > INDEX-NEW grep -vE '^/boot/' $1/INDEX-OLD | grep -E '^[^|]+\|d\|' > INDEX-OLD install_delete INDEX-OLD INDEX-NEW || return 1 # Remove temporary files rm INDEX-OLD INDEX-NEW } # Rearrange bits to allow the installed updates to be rolled back install_setup_rollback () { # Remove the "reboot after installing kernel", "kernel updated", and # "finished installing the world" flags if present -- they are # irrelevant when rolling back updates. 
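# (For reference: successive "install" runs chain their rollback data,
# hypothetically, as
#     ${BDHASH}-rollback/            the most recently installed updates
#     ${BDHASH}-rollback/rollback/   the set installed before those
# and rollback_setup_rollback later unwinds this chain one level per
# "rollback" run.)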
if [ -f ${BDHASH}-install/kernelfirst ]; then rm ${BDHASH}-install/kernelfirst rm ${BDHASH}-install/kerneldone fi if [ -f ${BDHASH}-install/worlddone ]; then rm ${BDHASH}-install/worlddone fi if [ -L ${BDHASH}-rollback ]; then mv ${BDHASH}-rollback ${BDHASH}-install/rollback fi mv ${BDHASH}-install ${BDHASH}-rollback } # Actually install updates install_run () { echo -n "Installing updates..." # Make sure we have all the files we should have install_verify ${BDHASH}-install/INDEX-OLD \ ${BDHASH}-install/INDEX-NEW || return 1 # Remove system immutable flag from files install_unschg ${BDHASH}-install/INDEX-OLD \ ${BDHASH}-install/INDEX-NEW || return 1 # Install new files, delete old files, and update linker.hints install_files ${BDHASH}-install || return 1 # Rearrange bits to allow the installed updates to be rolled back install_setup_rollback echo " done." } # Rearrange bits to allow the previous set of updates to be rolled back next. rollback_setup_rollback () { if [ -L ${BDHASH}-rollback/rollback ]; then mv ${BDHASH}-rollback/rollback rollback-tmp rm -r ${BDHASH}-rollback/ rm ${BDHASH}-rollback mv rollback-tmp ${BDHASH}-rollback else rm -r ${BDHASH}-rollback/ rm ${BDHASH}-rollback fi } # Install old files, delete new files, and update linker.hints rollback_files () { # Install old shared library files which don't have the same path as # a new shared library file. grep -vE '^/boot/' $1/INDEX-NEW | grep -E '/lib/.*\.so\.[0-9]+\|' | cut -f 1 -d '|' | sort > INDEX-NEW.libs.flist grep -vE '^/boot/' $1/INDEX-OLD | grep -E '/lib/.*\.so\.[0-9]+\|' | sort -k 1,1 -t '|' - | join -t '|' -v 1 - INDEX-NEW.libs.flist > INDEX-OLD install_from_index INDEX-OLD || return 1 # Deal with files which are neither kernel nor shared library grep -vE '^/boot/' $1/INDEX-OLD | grep -vE '/lib/.*\.so\.[0-9]+\|' > INDEX-OLD grep -vE '^/boot/' $1/INDEX-NEW | grep -vE '/lib/.*\.so\.[0-9]+\|' > INDEX-NEW install_from_index INDEX-OLD || return 1 install_delete INDEX-NEW INDEX-OLD || return 1 # Install any old shared library files which we didn't install above. grep -vE '^/boot/' $1/INDEX-OLD | grep -E '/lib/.*\.so\.[0-9]+\|' | sort -k 1,1 -t '|' - | join -t '|' - INDEX-NEW.libs.flist > INDEX-OLD install_from_index INDEX-OLD || return 1 # Delete unneeded shared library files grep -vE '^/boot/' $1/INDEX-OLD | grep -E '/lib/.*\.so\.[0-9]+\|' > INDEX-OLD grep -vE '^/boot/' $1/INDEX-NEW | grep -E '/lib/.*\.so\.[0-9]+\|' > INDEX-NEW install_delete INDEX-NEW INDEX-OLD || return 1 # Deal with kernel files grep -E '^/boot/' $1/INDEX-OLD > INDEX-OLD grep -E '^/boot/' $1/INDEX-NEW > INDEX-NEW install_from_index INDEX-OLD || return 1 install_delete INDEX-NEW INDEX-OLD || return 1 if [ -s INDEX-OLD -o -s INDEX-NEW ]; then kldxref -R /boot/ 2>/dev/null fi # Remove temporary files rm INDEX-OLD INDEX-NEW INDEX-NEW.libs.flist } # Actually rollback updates rollback_run () { echo -n "Uninstalling updates..." # If there are updates waiting to be installed, remove them; we # want the user to re-run 'fetch' after rolling back updates. 
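	# (Illustrative note: in the usual sequence
	#      freebsd-update fetch
	#      freebsd-update install
	#      freebsd-update rollback
	#  this function undoes the most recent install; updates that were
	#  fetched but never installed are simply discarded below.)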
if [ -L ${BDHASH}-install ]; then rm -r ${BDHASH}-install/ rm ${BDHASH}-install fi # Make sure we have all the files we should have install_verify ${BDHASH}-rollback/INDEX-NEW \ ${BDHASH}-rollback/INDEX-OLD || return 1 # Remove system immutable flag from files install_unschg ${BDHASH}-rollback/INDEX-NEW \ ${BDHASH}-rollback/INDEX-OLD || return 1 # Install old files, delete new files, and update linker.hints rollback_files ${BDHASH}-rollback || return 1 # Remove the rollback directory and the symlink pointing to it; and # rearrange bits to allow the previous set of updates to be rolled # back next. rollback_setup_rollback echo " done." } # Compare INDEX-ALL and INDEX-PRESENT and print warnings about differences. IDS_compare () { # Get all the lines which mismatch in something other than file # flags. We ignore file flags because sysinstall doesn't seem to # set them when it installs FreeBSD; warning about these adds a # very large amount of noise. cut -f 1-5,7-8 -d '|' $1 > $1.noflags sort -k 1,1 -t '|' $1.noflags > $1.sorted cut -f 1-5,7-8 -d '|' $2 | comm -13 $1.noflags - | fgrep -v '|-|||||' | sort -k 1,1 -t '|' | join -t '|' $1.sorted - > INDEX-NOTMATCHING # Ignore files which match IDSIGNOREPATHS. for X in ${IDSIGNOREPATHS}; do grep -E "^${X}" INDEX-NOTMATCHING done | sort -u | comm -13 - INDEX-NOTMATCHING > INDEX-NOTMATCHING.tmp mv INDEX-NOTMATCHING.tmp INDEX-NOTMATCHING # Go through the lines and print warnings. local IFS='|' while read FPATH TYPE OWNER GROUP PERM HASH LINK P_TYPE P_OWNER P_GROUP P_PERM P_HASH P_LINK; do # Warn about different object types. if ! [ "${TYPE}" = "${P_TYPE}" ]; then echo -n "${FPATH} is a " case "${P_TYPE}" in f) echo -n "regular file, " ;; d) echo -n "directory, " ;; L) echo -n "symlink, " ;; esac echo -n "but should be a " case "${TYPE}" in f) echo -n "regular file." ;; d) echo -n "directory." ;; L) echo -n "symlink." ;; esac echo # Skip other tests, since they don't make sense if # we're comparing different object types. continue fi # Warn about different owners. if ! [ "${OWNER}" = "${P_OWNER}" ]; then echo -n "${FPATH} is owned by user id ${P_OWNER}, " echo "but should be owned by user id ${OWNER}." fi # Warn about different groups. if ! [ "${GROUP}" = "${P_GROUP}" ]; then echo -n "${FPATH} is owned by group id ${P_GROUP}, " echo "but should be owned by group id ${GROUP}." fi # Warn about different permissions. We do not warn about # different permissions on symlinks, since some archivers # don't extract symlink permissions correctly and they are # ignored anyway. if ! [ "${PERM}" = "${P_PERM}" ] && ! [ "${TYPE}" = "L" ]; then echo -n "${FPATH} has ${P_PERM} permissions, " echo "but should have ${PERM} permissions." fi # Warn about different file hashes / symlink destinations. if ! [ "${HASH}" = "${P_HASH}" ]; then if [ "${TYPE}" = "L" ]; then echo -n "${FPATH} is a symlink to ${P_HASH}, " echo "but should be a symlink to ${HASH}." fi if [ "${TYPE}" = "f" ]; then echo -n "${FPATH} has SHA256 hash ${P_HASH}, " echo "but should have SHA256 hash ${HASH}." fi fi # We don't warn about different hard links, since some # some archivers break hard links, and as long as the # underlying data is correct they really don't matter. done < INDEX-NOTMATCHING # Clean up rm $1 $1.noflags $1.sorted $2 INDEX-NOTMATCHING } # Do the work involved in comparing the system to a "known good" index IDS_run () { workdir_init || return 1 # Prepare the mirror list. fetch_pick_server_init && fetch_pick_server # Try to fetch the public key until we run out of servers. while ! 
fetch_key; do fetch_pick_server || return 1 done # Try to fetch the metadata index signature ("tag") until we run # out of available servers; and sanity check the downloaded tag. while ! fetch_tag; do fetch_pick_server || return 1 done fetch_tagsanity || return 1 # Fetch INDEX-OLD and INDEX-ALL. fetch_metadata INDEX-OLD INDEX-ALL || return 1 # Generate filtered INDEX-OLD and INDEX-ALL files containing only # the components we want and without anything marked as "Ignore". fetch_filter_metadata INDEX-OLD || return 1 fetch_filter_metadata INDEX-ALL || return 1 # Merge the INDEX-OLD and INDEX-ALL files into INDEX-ALL. sort INDEX-OLD INDEX-ALL > INDEX-ALL.tmp mv INDEX-ALL.tmp INDEX-ALL rm INDEX-OLD # Translate /boot/${KERNCONF} to ${KERNELDIR} fetch_filter_kernel_names INDEX-ALL ${KERNCONF} # Inspect the system and generate an INDEX-PRESENT file. fetch_inspect_system INDEX-ALL INDEX-PRESENT /dev/null || return 1 # Compare INDEX-ALL and INDEX-PRESENT and print warnings about any # differences. IDS_compare INDEX-ALL INDEX-PRESENT } #### Main functions -- call parameter-handling and core functions # Using the command line, configuration file, and defaults, # set all the parameters which are needed later. get_params () { init_params parse_cmdline $@ parse_conffile default_params + finalize_components_config ${COMPONENTS} } # Fetch command. Make sure that we're being called # interactively, then run fetch_check_params and fetch_run cmd_fetch () { if [ ! -t 0 -a $NOTTYOK -eq 0 ]; then echo -n "`basename $0` fetch should not " echo "be run non-interactively." echo "Run `basename $0` cron instead." exit 1 fi fetch_check_params fetch_run || exit 1 ISFETCHED=1 } # Cron command. Make sure the parameters are sensible; wait # rand(3600) seconds; then fetch updates. While fetching updates, # send output to a temporary file; only print that file if the # fetching failed. cmd_cron () { fetch_check_params sleep `jot -r 1 0 3600` TMPFILE=`mktemp /tmp/freebsd-update.XXXXXX` || exit 1 if ! fetch_run >> ${TMPFILE} || ! grep -q "No updates needed" ${TMPFILE} || [ ${VERBOSELEVEL} = "debug" ]; then mail -s "`hostname` security updates" ${MAILTO} < ${TMPFILE} fi rm ${TMPFILE} } # Fetch files for upgrading to a new release. cmd_upgrade () { upgrade_check_params upgrade_run || exit 1 } # Install downloaded updates. cmd_install () { install_check_params install_run || exit 1 } # Rollback most recently installed updates. cmd_rollback () { rollback_check_params rollback_run || exit 1 } # Compare system against a "known good" index. cmd_IDS () { IDS_check_params IDS_run || exit 1 } #### Entry point # Make sure we find utilities from the base system export PATH=/sbin:/bin:/usr/sbin:/usr/bin:${PATH} # Set a pager if the user doesn't if [ -z "$PAGER" ]; then PAGER=/usr/bin/less fi # Set LC_ALL in order to avoid problems with character ranges like [A-Z]. export LC_ALL=C get_params $@ for COMMAND in ${COMMANDS}; do cmd_${COMMAND} done Index: projects/clang900-import/usr.sbin/rpc.statd/Makefile =================================================================== --- projects/clang900-import/usr.sbin/rpc.statd/Makefile (revision 352586) +++ projects/clang900-import/usr.sbin/rpc.statd/Makefile (revision 352587) @@ -1,26 +1,28 @@ # $FreeBSD$ PROG= rpc.statd MAN= rpc.statd.8 SRCS= file.c sm_inter_svc.c sm_inter.h statd.c procs.c -CFLAGS+= -I. -WARNS?= 2 - +CFLAGS+= -I${.OBJDIR} LIBADD= rpcsvc +# XXX: mismatch between (xdrproc_t) and xdr_void(). 
+WARNS?= 2 + CLEANFILES= sm_inter_svc.c sm_inter.h RPCSRC= ${SYSROOT:U${DESTDIR}}/usr/include/rpcsvc/sm_inter.x RPCGEN= RPCGEN_CPP=${CPP:Q} rpcgen -L -C sm_inter_svc.c: ${RPCSRC} ${RPCGEN} -m -o ${.TARGET} ${.ALLSRC} sm_inter.h: ${RPCSRC} ${RPCGEN} -h -o ${.TARGET} ${.ALLSRC} -test: test.c - cc -o test test.c -lrpcsvc +test: test.o + ${CC} ${LDFLAGS} -o ${.TARGET} ${.ALLSRC} ${LIBADD:S/^/-l/} +CLEANFILES+= test test.o .include Index: projects/clang900-import/usr.sbin/rpc.statd/file.c =================================================================== --- projects/clang900-import/usr.sbin/rpc.statd/file.c (revision 352586) +++ projects/clang900-import/usr.sbin/rpc.statd/file.c (revision 352587) @@ -1,369 +1,369 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 * A.R. Gordon (andrew.gordon@net-tel.co.uk). All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the FreeBSD project * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY ANDREW GORDON AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include /* For mmap() */ #include #include #include #include "statd.h" FileLayout *status_info; /* Pointer to the mmap()ed status file */ static int status_fd; /* File descriptor for the open file */ static off_t status_file_len; /* Current on-disc length of file */ /* sync_file --------------------------------------------------------------- */ /* Purpose: Packaged call of msync() to flush changes to mmap()ed file Returns: Nothing. Errors to syslog. */ void sync_file(void) { if (msync((void *)status_info, 0, 0) < 0) { syslog(LOG_ERR, "msync() failed: %s", strerror(errno)); } } /* find_host -------------------------------------------------------------- */ /* Purpose: Find the entry in the status file for a given host Returns: Pointer to that entry in the mmap() region, or NULL. Notes: Also creates entries if requested. Failure to create also returns NULL. 
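		Example (illustrative only): the monitor handler sm_mon_1_svc()
		in procs.c calls
			hp = find_host(arg->mon_id.mon_name, TRUE);
		and, when a slot comes back, links a new MonList entry onto
		hp->monList.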
*/ HostInfo *find_host(char *hostname, int create) { HostInfo *hp; HostInfo *spare_slot = NULL; HostInfo *result = NULL; struct addrinfo *ai1, *ai2; int i; ai2 = NULL; if (getaddrinfo(hostname, NULL, NULL, &ai1) != 0) ai1 = NULL; for (i = 0, hp = status_info->hosts; i < status_info->noOfHosts; i++, hp++) { if (!strncasecmp(hostname, hp->hostname, SM_MAXSTRLEN)) { result = hp; break; } if (hp->hostname[0] != '\0' && getaddrinfo(hp->hostname, NULL, NULL, &ai2) != 0) ai2 = NULL; if (ai1 && ai2) { struct addrinfo *p1, *p2; for (p1 = ai1; !result && p1; p1 = p1->ai_next) { for (p2 = ai2; !result && p2; p2 = p2->ai_next) { if (p1->ai_family == p2->ai_family && p1->ai_addrlen == p2->ai_addrlen && !memcmp(p1->ai_addr, p2->ai_addr, p1->ai_addrlen)) { result = hp; break; } } } if (result) break; } if (ai2) { freeaddrinfo(ai2); ai2 = NULL; } if (!spare_slot && !hp->monList && !hp->notifyReqd) spare_slot = hp; } if (ai1) freeaddrinfo(ai1); /* Return if entry found, or if not asked to create one. */ if (result || !create) return (result); /* Now create an entry, using the spare slot if one was found or */ /* adding to the end of the list otherwise, extending file if reqd */ if (!spare_slot) { off_t desired_size; spare_slot = &status_info->hosts[status_info->noOfHosts]; desired_size = ((char*)spare_slot - (char*)status_info) + sizeof(HostInfo); if (desired_size > status_file_len) { /* Extend file by writing 1 byte of junk at the desired end pos */ if (lseek(status_fd, desired_size - 1, SEEK_SET) == -1 || write(status_fd, "\0", 1) < 0) { syslog(LOG_ERR, "Unable to extend status file"); return (NULL); } status_file_len = desired_size; } status_info->noOfHosts++; } /* Initialise the spare slot that has been found/created */ /* Note that we do not msync(), since the caller is presumed to be */ /* about to modify the entry further */ memset(spare_slot, 0, sizeof(HostInfo)); strncpy(spare_slot->hostname, hostname, SM_MAXSTRLEN); return (spare_slot); } /* init_file -------------------------------------------------------------- */ /* Purpose: Open file, create if necessary, initialise it. Returns: Nothing - exits on error Notes: Called before process becomes daemon, hence logs to stderr rather than syslog. Opens the file, then mmap()s it for ease of access. Also performs initial clean-up of the file, zeroing monitor list pointers, setting the notifyReqd flag in all hosts that had a monitor list, and incrementing the state number to the next even value. */ void init_file(const char *filename) { int new_file = FALSE; char buf[HEADER_LEN]; int i; /* try to open existing file - if not present, create one */ status_fd = open(filename, O_RDWR); if ((status_fd < 0) && (errno == ENOENT)) { status_fd = open(filename, O_RDWR | O_CREAT, 0644); new_file = TRUE; } if (status_fd < 0) errx(1, "unable to open status file %s", filename); /* File now open. mmap() it, with a generous size to allow for */ /* later growth, where we will extend the file but not re-map it. */ status_info = (FileLayout *) mmap(NULL, 0x10000000, PROT_READ | PROT_WRITE, MAP_SHARED, status_fd, 0); if (status_info == (FileLayout *) MAP_FAILED) err(1, "unable to mmap() status file"); status_file_len = lseek(status_fd, 0L, SEEK_END); /* If the file was not newly created, validate the contents, and if */ /* defective, re-create from scratch. 
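     (Illustrative note: "defective" means the size check just below fails,
     i.e. the file is shorter than HEADER_LEN plus one HostInfo record for
     each host counted in status_info->noOfHosts.)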
*/ if (!new_file) { - if ((status_file_len < HEADER_LEN) || (status_file_len - < (HEADER_LEN + sizeof(HostInfo) * status_info->noOfHosts)) ) + if ((status_file_len < (off_t)HEADER_LEN) || (status_file_len + < (off_t)(HEADER_LEN + sizeof(HostInfo) * status_info->noOfHosts)) ) { warnx("status file is corrupt"); new_file = TRUE; } } /* Initialisation of a new, empty file. */ if (new_file) { memset(buf, 0, sizeof(buf)); lseek(status_fd, 0L, SEEK_SET); write(status_fd, buf, HEADER_LEN); status_file_len = HEADER_LEN; } else { /* Clean-up of existing file - monitored hosts will have a pointer */ /* to a list of clients, which refers to memory in the previous */ /* incarnation of the program and so are meaningless now. These */ /* pointers are zeroed and the fact that the host was previously */ /* monitored is recorded by setting the notifyReqd flag, which will */ /* in due course cause a SM_NOTIFY to be sent. */ /* Note that if we crash twice in quick succession, some hosts may */ /* already have notifyReqd set, where we didn't manage to notify */ /* them before the second crash occurred. */ for (i = 0; i < status_info->noOfHosts; i++) { HostInfo *this_host = &status_info->hosts[i]; if (this_host->monList) { this_host->notifyReqd = TRUE; this_host->monList = NULL; } } /* Select the next higher even number for the state counter */ status_info->ourState = (status_info->ourState + 2) & 0xfffffffe; /*???????******/ status_info->ourState++; } } /* notify_one_host --------------------------------------------------------- */ /* Purpose: Perform SM_NOTIFY procedure at specified host Returns: TRUE if success, FALSE if failed. Notes: Only report failure if verbose is non-zero. Caller will only set verbose to non-zero for the first attempt to contact the host. */ static int notify_one_host(char *hostname, int verbose) { struct timeval timeout = { 20, 0 }; /* 20 secs timeout */ CLIENT *cli; char dummy; stat_chge arg; char our_hostname[SM_MAXSTRLEN+1]; gethostname(our_hostname, sizeof(our_hostname)); our_hostname[SM_MAXSTRLEN] = '\0'; arg.mon_name = our_hostname; arg.state = status_info->ourState; if (debug) syslog (LOG_DEBUG, "Sending SM_NOTIFY to host %s from %s", hostname, our_hostname); cli = clnt_create(hostname, SM_PROG, SM_VERS, "udp"); if (!cli) { syslog(LOG_ERR, "Failed to contact host %s%s", hostname, clnt_spcreateerror("")); return (FALSE); } if (clnt_call(cli, SM_NOTIFY, (xdrproc_t)xdr_stat_chge, &arg, (xdrproc_t)xdr_void, &dummy, timeout) != RPC_SUCCESS) { if (verbose) syslog(LOG_ERR, "Failed to contact rpc.statd at host %s", hostname); clnt_destroy(cli); return (FALSE); } clnt_destroy(cli); return (TRUE); } /* notify_hosts ------------------------------------------------------------ */ /* Purpose: Send SM_NOTIFY to all hosts marked as requiring it Returns: Nothing, immediately - forks a process to do the work. Notes: Does nothing if there are no monitored hosts. Called after all the initialisation has been done - logs to syslog. */ void notify_hosts(void) { int i; int attempts; int work_to_do = FALSE; HostInfo *hp; pid_t pid; /* First check if there is in fact any work to do. */ for (i = status_info->noOfHosts, hp = status_info->hosts; i ; i--, hp++) { if (hp->notifyReqd) { work_to_do = TRUE; break; } } if (!work_to_do) return; /* No work found */ pid = fork(); if (pid == -1) { syslog(LOG_ERR, "Unable to fork notify process - %s", strerror(errno)); return; } if (pid) return; /* Here in the child process. 
We continue until all the hosts marked */ /* as requiring notification have been duly notified. */ /* If one of the initial attempts fails, we sleep for a while and */ /* have another go. This is necessary because when we have crashed, */ /* (eg. a power outage) it is quite possible that we won't be able to */ /* contact all monitored hosts immediately on restart, either because */ /* they crashed too and take longer to come up (in which case the */ /* notification isn't really required), or more importantly if some */ /* router etc. needed to reach the monitored host has not come back */ /* up yet. In this case, we will be a bit late in re-establishing */ /* locks (after the grace period) but that is the best we can do. */ /* We try 10 times at 5 sec intervals, 10 more times at 1 minute */ /* intervals, then 24 more times at hourly intervals, finally */ /* giving up altogether if the host hasn't come back to life after */ /* 24 hours. */ for (attempts = 0; attempts < 44; attempts++) { work_to_do = FALSE; /* Unless anything fails */ for (i = status_info->noOfHosts, hp = status_info->hosts; i ; i--, hp++) { if (hp->notifyReqd) { if (notify_one_host(hp->hostname, attempts == 0)) { hp->notifyReqd = FALSE; sync_file(); } else work_to_do = TRUE; } } if (!work_to_do) break; if (attempts < 10) sleep(5); else if (attempts < 20) sleep(60); else sleep(60*60); } exit(0); } Index: projects/clang900-import/usr.sbin/rpc.statd/procs.c =================================================================== --- projects/clang900-import/usr.sbin/rpc.statd/procs.c (revision 352586) +++ projects/clang900-import/usr.sbin/rpc.statd/procs.c (revision 352587) @@ -1,438 +1,439 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 * A.R. Gordon (andrew.gordon@net-tel.co.uk). All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the FreeBSD project * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY ANDREW GORDON AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* for getaddrinfo() */ #include #include #include #include #include "statd.h" +static const char *from_addr(struct sockaddr *); + static const char * -from_addr(saddr) - struct sockaddr *saddr; +from_addr(struct sockaddr *saddr) { static char inet_buf[INET6_ADDRSTRLEN]; if (getnameinfo(saddr, saddr->sa_len, inet_buf, sizeof(inet_buf), NULL, 0, NI_NUMERICHOST) == 0) return inet_buf; return "???"; } /* sm_check_hostname -------------------------------------------------------- */ /* * Purpose: Check `mon_name' member of sm_name struct to ensure that the array * consists only of printable characters. * * Returns: TRUE if hostname is good. FALSE if hostname contains binary or * otherwise non-printable characters. * * Notes: Will syslog(3) to warn of corrupt hostname. */ int sm_check_hostname(struct svc_req *req, char *arg) { int len, dstlen, ret; struct sockaddr *claddr; char *dst; len = strlen(arg); dstlen = (4 * len) + 1; dst = malloc(dstlen); claddr = (struct sockaddr *) (svc_getrpccaller(req->rq_xprt)->buf) ; ret = 1; if (claddr == NULL || dst == NULL) { ret = 0; } else if (strvis(dst, arg, VIS_WHITE) != len) { syslog(LOG_ERR, "sm_stat: client %s hostname %s contained invalid characters.", from_addr(claddr), dst); ret = 0; } free(dst); return (ret); } /* sm_stat_1 --------------------------------------------------------------- */ /* Purpose: RPC call to enquire if a host can be monitored Returns: TRUE for any hostname that can be looked up to give an address. */ struct sm_stat_res *sm_stat_1_svc(sm_name *arg, struct svc_req *req) { static sm_stat_res res; struct addrinfo *ai; struct sockaddr *claddr; static int err; err = 1; if ((err = sm_check_hostname(req, arg->mon_name)) == 0) { res.res_stat = stat_fail; } if (err != 0) { if (debug) syslog(LOG_DEBUG, "stat called for host %s", arg->mon_name); if (getaddrinfo(arg->mon_name, NULL, NULL, &ai) == 0) { res.res_stat = stat_succ; freeaddrinfo(ai); } else { claddr = (struct sockaddr *) (svc_getrpccaller(req->rq_xprt)->buf) ; syslog(LOG_ERR, "invalid hostname to sm_stat from %s: %s", from_addr(claddr), arg->mon_name); res.res_stat = stat_fail; } } res.state = status_info->ourState; return (&res); } /* sm_mon_1 ---------------------------------------------------------------- */ /* Purpose: RPC procedure to establish a monitor request Returns: Success, unless lack of resources prevents the necessary structures from being set up to record the request, or if the hostname is not valid (as judged by getaddrinfo()) */ struct sm_stat_res *sm_mon_1_svc(mon *arg, struct svc_req *req) { static sm_stat_res res; HostInfo *hp; static int err; MonList *lp; struct addrinfo *ai; if ((err = sm_check_hostname(req, arg->mon_id.mon_name)) == 0) { res.res_stat = stat_fail; } if (err != 0) { if (debug) { syslog(LOG_DEBUG, "monitor request for host %s", arg->mon_id.mon_name); syslog(LOG_DEBUG, "recall host: %s prog: %d ver: %d proc: %d", arg->mon_id.my_id.my_name, arg->mon_id.my_id.my_prog, arg->mon_id.my_id.my_vers, arg->mon_id.my_id.my_proc); } res.res_stat = stat_fail; /* Assume fail until set otherwise */ res.state = status_info->ourState; /* Find existing host entry, or create one if not found */ /* If find_host() fails, it will have logged the error already. 
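		(Illustrative note: the code below first rejects mon_names that
		do not resolve via getaddrinfo(), then records the caller's
		callback -- notify host, program, version, procedure and
		private data -- in a new MonList entry hung off the HostInfo
		returned by find_host().)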
*/ if (getaddrinfo(arg->mon_id.mon_name, NULL, NULL, &ai) != 0) { syslog(LOG_ERR, "Invalid hostname to sm_mon: %s", arg->mon_id.mon_name); return (&res); } freeaddrinfo(ai); if ((hp = find_host(arg->mon_id.mon_name, TRUE))) { lp = (MonList *)malloc(sizeof(MonList)); if (!lp) { syslog(LOG_ERR, "Out of memory"); } else { strncpy(lp->notifyHost, arg->mon_id.my_id.my_name, SM_MAXSTRLEN); lp->notifyProg = arg->mon_id.my_id.my_prog; lp->notifyVers = arg->mon_id.my_id.my_vers; lp->notifyProc = arg->mon_id.my_id.my_proc; memcpy(lp->notifyData, arg->priv, sizeof(lp->notifyData)); lp->next = hp->monList; hp->monList = lp; sync_file(); res.res_stat = stat_succ; /* Report success */ } } } return (&res); } /* do_unmon ---------------------------------------------------------------- */ /* Purpose: Remove a monitor request from a host Returns: TRUE if found, FALSE if not found. Notes: Common code from sm_unmon_1_svc and sm_unmon_all_1_svc In the unlikely event of more than one identical monitor request, all are removed. */ static int do_unmon(HostInfo *hp, my_id *idp) { MonList *lp, *next; MonList *last = NULL; int result = FALSE; lp = hp->monList; while (lp) { if (!strncasecmp(idp->my_name, lp->notifyHost, SM_MAXSTRLEN) && (idp->my_prog == lp->notifyProg) && (idp->my_proc == lp->notifyProc) && (idp->my_vers == lp->notifyVers)) { /* found one. Unhook from chain and free. */ next = lp->next; if (last) last->next = next; else hp->monList = next; free(lp); lp = next; result = TRUE; } else { last = lp; lp = lp->next; } } return (result); } /* sm_unmon_1 -------------------------------------------------------------- */ /* Purpose: RPC procedure to release a monitor request. Returns: Local machine's status number Notes: The supplied mon_id should match the value passed in an earlier call to sm_mon_1 */ struct sm_stat *sm_unmon_1_svc(mon_id *arg, struct svc_req *req __unused) { static sm_stat res; HostInfo *hp; if (debug) { syslog(LOG_DEBUG, "un-monitor request for host %s", arg->mon_name); syslog(LOG_DEBUG, "recall host: %s prog: %d ver: %d proc: %d", arg->mon_name, arg->my_id.my_prog, arg->my_id.my_vers, arg->my_id.my_proc); } if ((hp = find_host(arg->mon_name, FALSE))) { if (do_unmon(hp, &arg->my_id)) sync_file(); else { syslog(LOG_ERR, "unmon request from %s, no matching monitor", arg->my_id.my_name); } } else syslog(LOG_ERR, "unmon request from %s for unknown host %s", arg->my_id.my_name, arg->mon_name); res.state = status_info->ourState; return (&res); } /* sm_unmon_all_1 ---------------------------------------------------------- */ /* Purpose: RPC procedure to release monitor requests. Returns: Local machine's status number Notes: Releases all monitor requests (if any) from the specified host and program number. */ struct sm_stat *sm_unmon_all_1_svc(my_id *arg, struct svc_req *req __unused) { static sm_stat res; HostInfo *hp; int i; if (debug) { syslog(LOG_DEBUG, "unmon_all for host: %s prog: %d ver: %d proc: %d", arg->my_name, arg->my_prog, arg->my_vers, arg->my_proc); } for (i = status_info->noOfHosts, hp = status_info->hosts; i; i--, hp++) { do_unmon(hp, arg); } sync_file(); res.state = status_info->ourState; return (&res); } /* sm_simu_crash_1 --------------------------------------------------------- */ /* Purpose: RPC procedure to simulate a crash Returns: Nothing Notes: Standardised mechanism for debug purposes The specification says that we should drop all of our status information (apart from the list of monitored hosts on disc). 
However, this would confuse the rpc.lockd which would be unaware that all of its monitor requests had been silently junked. Hence we in fact retain all current requests and simply increment the status counter and inform all hosts on the monitor list. */ void *sm_simu_crash_1_svc(void *v __unused, struct svc_req *req __unused) { static char dummy; int work_to_do; HostInfo *hp; int i; work_to_do = FALSE; if (debug) syslog(LOG_DEBUG, "simu_crash called!!"); /* Simulate crash by setting notify-required flag on all monitored */ /* hosts, and incrementing our status number. notify_hosts() is */ /* then called to fork a process to do the notifications. */ for (i = status_info->noOfHosts, hp = status_info->hosts; i ; i--, hp++) { if (hp->monList) { work_to_do = TRUE; hp->notifyReqd = TRUE; } } status_info->ourState += 2; /* always even numbers if not crashed */ if (work_to_do) notify_hosts(); return (&dummy); } /* sm_notify_1 ------------------------------------------------------------- */ /* Purpose: RPC procedure notifying local statd of the crash of another Returns: Nothing Notes: There is danger of deadlock, since it is quite likely that the client procedure that we call will in turn call us to remove or adjust the monitor request. We therefore fork() a process to do the notifications. Note that the main HostInfo structure is in a mmap() region and so will be shared with the child, but the monList pointed to by the HostInfo is in normal memory. Hence if we read the monList before forking, we are protected from the parent servicing other requests that modify the list. */ void *sm_notify_1_svc(stat_chge *arg, struct svc_req *req __unused) { struct timeval timeout = { 20, 0 }; /* 20 secs timeout */ CLIENT *cli; static char dummy; sm_status tx_arg; /* arg sent to callback procedure */ MonList *lp; HostInfo *hp; pid_t pid; if (debug) syslog(LOG_DEBUG, "notify from host %s, new state %d", arg->mon_name, arg->state); hp = find_host(arg->mon_name, FALSE); if (!hp) { /* Never heard of this host - why is it notifying us? */ syslog(LOG_ERR, "Unsolicited notification from host %s", arg->mon_name); return (&dummy); } lp = hp->monList; if (!lp) return (&dummy); /* We know this host, but have no */ /* outstanding requests. */ pid = fork(); if (pid == -1) { syslog(LOG_ERR, "Unable to fork notify process - %s", strerror(errno)); return (NULL); /* no answer, the client will retry */ } if (pid) return (&dummy); /* Parent returns */ while (lp) { tx_arg.mon_name = arg->mon_name; tx_arg.state = arg->state; memcpy(tx_arg.priv, lp->notifyData, sizeof(tx_arg.priv)); cli = clnt_create(lp->notifyHost, lp->notifyProg, lp->notifyVers, "udp"); if (!cli) { syslog(LOG_ERR, "Failed to contact host %s%s", lp->notifyHost, clnt_spcreateerror("")); } else { if (clnt_call(cli, lp->notifyProc, (xdrproc_t)xdr_sm_status, &tx_arg, (xdrproc_t)xdr_void, &dummy, timeout) != RPC_SUCCESS) { syslog(LOG_ERR, "Failed to call rpc.statd client at host %s", lp->notifyHost); } clnt_destroy(cli); } lp = lp->next; } exit (0); /* Child quits */ } Index: projects/clang900-import/usr.sbin/rpc.statd/statd.c =================================================================== --- projects/clang900-import/usr.sbin/rpc.statd/statd.c (revision 352586) +++ projects/clang900-import/usr.sbin/rpc.statd/statd.c (revision 352587) @@ -1,657 +1,658 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 * A.R. Gordon (andrew.gordon@net-tel.co.uk). All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the FreeBSD project * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY ANDREW GORDON AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* main() function for status monitor daemon. Some of the code in this */ /* file was generated by running rpcgen /usr/include/rpcsvc/sm_inter.x */ /* The actual program logic is in the file procs.c */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "statd.h" #define GETPORT_MAXTRY 20 /* Max tries to get a port # */ int debug = 0; /* Controls syslog() calls for debug messages */ -char **hosts, *svcport_str = NULL; -int nhosts = 0; -int xcreated = 0; +static char **hosts, *svcport_str = NULL; +static int nhosts = 0; +static int xcreated = 0; static int mallocd_svcport = 0; static int *sock_fd; static int sock_fdcnt; static int sock_fdpos; static int create_service(struct netconfig *nconf); static void complete_service(struct netconfig *nconf, char *port_str); static void clearout_service(void); static void handle_sigchld(int sig); void out_of_mem(void) __dead2; static void usage(void) __dead2; int main(int argc, char **argv) { struct sigaction sa; struct netconfig *nconf; void *nc_handle; in_port_t svcport; int ch, i, s; - char *endptr, **hosts_bak; + char *endptr; + char **hosts_bak; int have_v6 = 1; int maxrec = RPC_MAXDATASIZE; int attempt_cnt, port_len, port_pos, ret; char **port_list; while ((ch = getopt(argc, argv, "dh:p:")) != -1) switch (ch) { case 'd': debug = 1; break; case 'h': ++nhosts; hosts_bak = hosts; hosts_bak = realloc(hosts, nhosts * sizeof(char *)); if (hosts_bak == NULL) { if (hosts != NULL) { for (i = 0; i < nhosts; i++) free(hosts[i]); free(hosts); out_of_mem(); } } hosts = hosts_bak; hosts[nhosts - 1] = strdup(optarg); if (hosts[nhosts - 1] == NULL) { for (i = 0; i < (nhosts - 1); i++) free(hosts[i]); free(hosts); out_of_mem(); } break; case 'p': endptr = NULL; svcport = (in_port_t)strtoul(optarg, &endptr, 10); if (endptr 
== NULL || *endptr != '\0' || svcport == 0 || svcport >= IPPORT_MAX) usage(); svcport_str = strdup(optarg); break; default: usage(); } argc -= optind; argv += optind; (void)rpcb_unset(SM_PROG, SM_VERS, NULL); /* * Check if IPv6 support is present. */ s = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (s < 0) have_v6 = 0; else close(s); rpc_control(RPC_SVC_CONNMAXREC_SET, &maxrec); /* * If no hosts were specified, add a wildcard entry to bind to * INADDR_ANY. Otherwise make sure 127.0.0.1 and ::1 are added to the * list. */ if (nhosts == 0) { hosts = malloc(sizeof(char *)); if (hosts == NULL) out_of_mem(); - hosts[0] = "*"; + hosts[0] = strdup("*"); nhosts = 1; } else { hosts_bak = hosts; if (have_v6) { hosts_bak = realloc(hosts, (nhosts + 2) * sizeof(char *)); if (hosts_bak == NULL) { for (i = 0; i < nhosts; i++) free(hosts[i]); free(hosts); out_of_mem(); } else hosts = hosts_bak; nhosts += 2; - hosts[nhosts - 2] = "::1"; + hosts[nhosts - 2] = strdup("::1"); } else { hosts_bak = realloc(hosts, (nhosts + 1) * sizeof(char *)); if (hosts_bak == NULL) { for (i = 0; i < nhosts; i++) free(hosts[i]); free(hosts); out_of_mem(); } else { nhosts += 1; hosts = hosts_bak; } } - hosts[nhosts - 1] = "127.0.0.1"; + hosts[nhosts - 1] = strdup("127.0.0.1"); } attempt_cnt = 1; sock_fdcnt = 0; sock_fd = NULL; port_list = NULL; port_len = 0; nc_handle = setnetconfig(); while ((nconf = getnetconfig(nc_handle))) { /* We want to listen only on udp6, tcp6, udp, tcp transports */ if (nconf->nc_flag & NC_VISIBLE) { /* Skip if there's no IPv6 support */ if (have_v6 == 0 && strcmp(nconf->nc_protofmly, "inet6") == 0) { /* DO NOTHING */ } else { ret = create_service(nconf); if (ret == 1) /* Ignore this call */ continue; if (ret < 0) { /* * Failed to bind port, so close off * all sockets created and try again * if the port# was dynamically * assigned via bind(2). */ clearout_service(); if (mallocd_svcport != 0 && attempt_cnt < GETPORT_MAXTRY) { free(svcport_str); svcport_str = NULL; mallocd_svcport = 0; } else { errno = EADDRINUSE; syslog(LOG_ERR, "bindresvport_sa: %m"); exit(1); } /* Start over at the first service. */ free(sock_fd); sock_fdcnt = 0; sock_fd = NULL; nc_handle = setnetconfig(); attempt_cnt++; } else if (mallocd_svcport != 0 && attempt_cnt == GETPORT_MAXTRY) { /* * For the last attempt, allow * different port #s for each nconf * by saving the svcport_str and * setting it back to NULL. */ port_list = realloc(port_list, (port_len + 1) * sizeof(char *)); if (port_list == NULL) out_of_mem(); port_list[port_len++] = svcport_str; svcport_str = NULL; mallocd_svcport = 0; } } } } /* * Successfully bound the ports, so call complete_service() to * do the rest of the setup on the service(s). 
*/ sock_fdpos = 0; port_pos = 0; nc_handle = setnetconfig(); while ((nconf = getnetconfig(nc_handle))) { /* We want to listen only on udp6, tcp6, udp, tcp transports */ if (nconf->nc_flag & NC_VISIBLE) { /* Skip if there's no IPv6 support */ if (have_v6 == 0 && strcmp(nconf->nc_protofmly, "inet6") == 0) { /* DO NOTHING */ } else if (port_list != NULL) { if (port_pos >= port_len) { syslog(LOG_ERR, "too many port#s"); exit(1); } complete_service(nconf, port_list[port_pos++]); } else complete_service(nconf, svcport_str); } } endnetconfig(nc_handle); free(sock_fd); if (port_list != NULL) { for (port_pos = 0; port_pos < port_len; port_pos++) free(port_list[port_pos]); free(port_list); } init_file("/var/db/statd.status"); /* Note that it is NOT sensible to run this program from inetd - the */ /* protocol assumes that it will run immediately at boot time. */ daemon(0, 0); openlog("rpc.statd", 0, LOG_DAEMON); if (debug) syslog(LOG_INFO, "Starting - debug enabled"); else syslog(LOG_INFO, "Starting"); /* Install signal handler to collect exit status of child processes */ sa.sa_handler = handle_sigchld; sigemptyset(&sa.sa_mask); sigaddset(&sa.sa_mask, SIGCHLD); sa.sa_flags = SA_RESTART; sigaction(SIGCHLD, &sa, NULL); /* Initialisation now complete - start operating */ notify_hosts(); /* Forks a process (if necessary) to do the */ /* SM_NOTIFY calls, which may be slow. */ svc_run(); /* Should never return */ exit(1); } /* * This routine creates and binds sockets on the appropriate * addresses. It gets called one time for each transport. * It returns 0 upon success, 1 for ingore the call and -1 to indicate * bind failed with EADDRINUSE. * Any file descriptors that have been created are stored in sock_fd and * the total count of them is maintained in sock_fdcnt. */ static int create_service(struct netconfig *nconf) { struct addrinfo hints, *res = NULL; struct sockaddr_in *sin; struct sockaddr_in6 *sin6; struct __rpc_sockinfo si; int aicode; int fd; int nhostsbak; int r; u_int32_t host_addr[4]; /* IPv4 or IPv6 */ int mallocd_res; if ((nconf->nc_semantics != NC_TPI_CLTS) && (nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) return (1); /* not my type */ /* * XXX - using RPC library internal functions. */ if (!__rpc_nconf2sockinfo(nconf, &si)) { syslog(LOG_ERR, "cannot get information for %s", nconf->nc_netid); return (1); } /* Get rpc.statd's address on this transport */ memset(&hints, 0, sizeof hints); hints.ai_family = si.si_af; hints.ai_socktype = si.si_socktype; hints.ai_protocol = si.si_proto; /* * Bind to specific IPs if asked to */ nhostsbak = nhosts; while (nhostsbak > 0) { --nhostsbak; sock_fd = realloc(sock_fd, (sock_fdcnt + 1) * sizeof(int)); if (sock_fd == NULL) out_of_mem(); sock_fd[sock_fdcnt++] = -1; /* Set invalid for now. */ mallocd_res = 0; hints.ai_flags = AI_PASSIVE; /* * XXX - using RPC library internal functions. */ if ((fd = __rpc_nconf2fd(nconf)) < 0) { syslog(LOG_ERR, "cannot create socket for %s", nconf->nc_netid); continue; } switch (hints.ai_family) { case AF_INET: if (inet_pton(AF_INET, hosts[nhostsbak], host_addr) == 1) { hints.ai_flags |= AI_NUMERICHOST; } else { /* * Skip if we have an AF_INET6 address. */ if (inet_pton(AF_INET6, hosts[nhostsbak], host_addr) == 1) { close(fd); continue; } } break; case AF_INET6: if (inet_pton(AF_INET6, hosts[nhostsbak], host_addr) == 1) { hints.ai_flags |= AI_NUMERICHOST; } else { /* * Skip if we have an AF_INET address. 
*/ if (inet_pton(AF_INET, hosts[nhostsbak], host_addr) == 1) { close(fd); continue; } } break; default: break; } /* * If no hosts were specified, just bind to INADDR_ANY */ if (strcmp("*", hosts[nhostsbak]) == 0) { if (svcport_str == NULL) { res = malloc(sizeof(struct addrinfo)); if (res == NULL) out_of_mem(); mallocd_res = 1; res->ai_flags = hints.ai_flags; res->ai_family = hints.ai_family; res->ai_protocol = hints.ai_protocol; switch (res->ai_family) { case AF_INET: sin = malloc(sizeof(struct sockaddr_in)); if (sin == NULL) out_of_mem(); sin->sin_family = AF_INET; sin->sin_port = htons(0); sin->sin_addr.s_addr = htonl(INADDR_ANY); res->ai_addr = (struct sockaddr*) sin; res->ai_addrlen = (socklen_t) sizeof(struct sockaddr_in); break; case AF_INET6: sin6 = malloc(sizeof(struct sockaddr_in6)); if (sin6 == NULL) out_of_mem(); sin6->sin6_family = AF_INET6; sin6->sin6_port = htons(0); sin6->sin6_addr = in6addr_any; res->ai_addr = (struct sockaddr*) sin6; res->ai_addrlen = (socklen_t) sizeof(struct sockaddr_in6); break; default: syslog(LOG_ERR, "bad addr fam %d", res->ai_family); exit(1); } } else { if ((aicode = getaddrinfo(NULL, svcport_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address for %s: %s", nconf->nc_netid, gai_strerror(aicode)); close(fd); continue; } } } else { if ((aicode = getaddrinfo(hosts[nhostsbak], svcport_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address for %s: %s", nconf->nc_netid, gai_strerror(aicode)); close(fd); continue; } } /* Store the fd. */ sock_fd[sock_fdcnt - 1] = fd; /* Now, attempt the bind. */ r = bindresvport_sa(fd, res->ai_addr); if (r != 0) { if (errno == EADDRINUSE && mallocd_svcport != 0) { if (mallocd_res != 0) { free(res->ai_addr); free(res); } else freeaddrinfo(res); return (-1); } syslog(LOG_ERR, "bindresvport_sa: %m"); exit(1); } if (svcport_str == NULL) { svcport_str = malloc(NI_MAXSERV * sizeof(char)); if (svcport_str == NULL) out_of_mem(); mallocd_svcport = 1; if (getnameinfo(res->ai_addr, res->ai_addr->sa_len, NULL, NI_MAXHOST, svcport_str, NI_MAXSERV * sizeof(char), NI_NUMERICHOST | NI_NUMERICSERV)) errx(1, "Cannot get port number"); } if (mallocd_res != 0) { free(res->ai_addr); free(res); } else freeaddrinfo(res); res = NULL; } return (0); } /* * Called after all the create_service() calls have succeeded, to complete * the setup and registration. */ static void complete_service(struct netconfig *nconf, char *port_str) { struct addrinfo hints, *res = NULL; struct __rpc_sockinfo si; struct netbuf servaddr; SVCXPRT *transp = NULL; int aicode, fd, nhostsbak; int registered = 0; if ((nconf->nc_semantics != NC_TPI_CLTS) && (nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) return; /* not my type */ /* * XXX - using RPC library internal functions. */ if (!__rpc_nconf2sockinfo(nconf, &si)) { syslog(LOG_ERR, "cannot get information for %s", nconf->nc_netid); return; } nhostsbak = nhosts; while (nhostsbak > 0) { --nhostsbak; if (sock_fdpos >= sock_fdcnt) { /* Should never happen. 
*/ syslog(LOG_ERR, "Ran out of socket fd's"); return; } fd = sock_fd[sock_fdpos++]; if (fd < 0) continue; if (nconf->nc_semantics != NC_TPI_CLTS) listen(fd, SOMAXCONN); transp = svc_tli_create(fd, nconf, NULL, RPC_MAXDATASIZE, RPC_MAXDATASIZE); if (transp != (SVCXPRT *) NULL) { if (!svc_register(transp, SM_PROG, SM_VERS, sm_prog_1, 0)) { syslog(LOG_ERR, "can't register on %s", nconf->nc_netid); } else { if (!svc_reg(transp, SM_PROG, SM_VERS, sm_prog_1, NULL)) syslog(LOG_ERR, "can't register %s SM_PROG service", nconf->nc_netid); } } else syslog(LOG_WARNING, "can't create %s services", nconf->nc_netid); if (registered == 0) { registered = 1; memset(&hints, 0, sizeof hints); hints.ai_flags = AI_PASSIVE; hints.ai_family = si.si_af; hints.ai_socktype = si.si_socktype; hints.ai_protocol = si.si_proto; if ((aicode = getaddrinfo(NULL, port_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address: %s", gai_strerror(aicode)); exit(1); } servaddr.buf = malloc(res->ai_addrlen); memcpy(servaddr.buf, res->ai_addr, res->ai_addrlen); servaddr.len = res->ai_addrlen; rpcb_set(SM_PROG, SM_VERS, nconf, &servaddr); xcreated++; freeaddrinfo(res); } } /* end while */ } /* * Clear out sockets after a failure to bind one of them, so that the * cycle of socket creation/binding can start anew. */ static void clearout_service(void) { int i; for (i = 0; i < sock_fdcnt; i++) { if (sock_fd[i] >= 0) { shutdown(sock_fd[i], SHUT_RDWR); close(sock_fd[i]); } } } static void usage(void) { fprintf(stderr, "usage: rpc.statd [-d] [-h ] [-p ]\n"); exit(1); } /* handle_sigchld ---------------------------------------------------------- */ /* Purpose: Catch SIGCHLD and collect process status Retruns: Nothing. Notes: No special action required, other than to collect the process status and hence allow the child to die: we only use child processes for asynchronous transmission of SM_NOTIFY to other systems, so it is normal for the children to exit when they have done their work. 
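		(Illustrative note: the handler below reaps one exited child
		per SIGCHLD with wait4(-1, &status, WNOHANG, ...) and merely
		logs anything other than a clean exit.)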
*/ static void handle_sigchld(int sig __unused) { int pid, status; pid = wait4(-1, &status, WNOHANG, (struct rusage*)0); if (!pid) syslog(LOG_ERR, "Phantom SIGCHLD??"); else if (status == 0) { if (debug) syslog(LOG_DEBUG, "Child %d exited OK", pid); } else syslog(LOG_ERR, "Child %d failed with status %d", pid, WEXITSTATUS(status)); } /* * Out of memory, fatal */ void out_of_mem(void) { syslog(LOG_ERR, "out of memory"); exit(2); } Index: projects/clang900-import/usr.sbin/rpc.statd/test.c =================================================================== --- projects/clang900-import/usr.sbin/rpc.statd/test.c (revision 352586) +++ projects/clang900-import/usr.sbin/rpc.statd/test.c (revision 352587) @@ -1,144 +1,144 @@ - #ifndef lint static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include +#include +#include #include #include - /* Default timeout can be changed using clnt_control() */ static struct timeval TIMEOUT = { 25, 0 }; struct sm_stat_res * sm_stat_1(argp, clnt) struct sm_name *argp; CLIENT *clnt; { static struct sm_stat_res res; bzero((char *)&res, sizeof(res)); - if (clnt_call(clnt, SM_STAT, xdr_sm_name, argp, xdr_sm_stat_res, &res, TIMEOUT) != RPC_SUCCESS) { + if (clnt_call(clnt, SM_STAT, (xdrproc_t)xdr_sm_name, argp, + (xdrproc_t)xdr_sm_stat_res, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return (&res); } struct sm_stat_res * sm_mon_1(argp, clnt) struct mon *argp; CLIENT *clnt; { static struct sm_stat_res res; bzero((char *)&res, sizeof(res)); - if (clnt_call(clnt, SM_MON, xdr_mon, argp, xdr_sm_stat_res, &res, TIMEOUT) != RPC_SUCCESS) { + if (clnt_call(clnt, SM_MON, (xdrproc_t)xdr_mon, argp, + (xdrproc_t)xdr_sm_stat_res, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return (&res); } struct sm_stat * sm_unmon_1(argp, clnt) struct mon_id *argp; CLIENT *clnt; { static struct sm_stat res; bzero((char *)&res, sizeof(res)); - if (clnt_call(clnt, SM_UNMON, xdr_mon_id, argp, xdr_sm_stat, &res, TIMEOUT) != RPC_SUCCESS) { + if (clnt_call(clnt, SM_UNMON, (xdrproc_t)xdr_mon_id, argp, + (xdrproc_t)xdr_sm_stat, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return (&res); } struct sm_stat * sm_unmon_all_1(argp, clnt) struct my_id *argp; CLIENT *clnt; { static struct sm_stat res; bzero((char *)&res, sizeof(res)); - if (clnt_call(clnt, SM_UNMON_ALL, xdr_my_id, argp, xdr_sm_stat, &res, TIMEOUT) != RPC_SUCCESS) { + if (clnt_call(clnt, SM_UNMON_ALL, (xdrproc_t)xdr_my_id, argp, + (xdrproc_t)xdr_sm_stat, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return (&res); } void * sm_simu_crash_1(argp, clnt) void *argp; CLIENT *clnt; { static char res; bzero((char *)&res, sizeof(res)); - if (clnt_call(clnt, SM_SIMU_CRASH, xdr_void, argp, xdr_void, &res, TIMEOUT) != RPC_SUCCESS) { + if (clnt_call(clnt, SM_SIMU_CRASH, (xdrproc_t)xdr_void, argp, + (xdrproc_t)xdr_void, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return ((void *)&res); } int main(int argc, char **argv) { CLIENT *cli; char dummy; void *out; struct mon mon; if (argc < 2) { fprintf(stderr, "usage: test | crash\n"); fprintf(stderr, "always talks to statd at localhost\n"); exit(1); } printf("Creating client for localhost\n" ); cli = clnt_create("localhost", SM_PROG, SM_VERS, "udp"); if (!cli) { printf("Failed to create client\n"); exit(1); } mon.mon_id.mon_name = argv[1]; mon.mon_id.my_id.my_name = argv[1]; mon.mon_id.my_id.my_prog = SM_PROG; mon.mon_id.my_id.my_vers = SM_VERS; mon.mon_id.my_id.my_proc = 1; /* have it call sm_stat() !!! 
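      (Illustrative: with a hostname argument this test program registers
       itself as the notify target -- SM_PROG/SM_VERS, procedure 1, i.e.
       sm_stat() -- for that host via sm_mon_1(); with the literal argument
       "crash" it calls sm_simu_crash_1() instead.)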
*/ if (strcmp(argv[1], "crash")) { /* Hostname given */ struct sm_stat_res *res; - if (res = sm_mon_1(&mon, cli)) - { + + res = sm_mon_1(&mon, cli); + if (res) printf("Success!\n"); - } else - { printf("Fail\n"); - } } else { - if (out = sm_simu_crash_1(&dummy, cli)) - { + out = sm_simu_crash_1(&dummy, cli); + if (out) printf("Success!\n"); - } else - { printf("Fail\n"); - } } return 0; } Index: projects/clang900-import =================================================================== --- projects/clang900-import (revision 352586) +++ projects/clang900-import (revision 352587) Property changes on: projects/clang900-import ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r352537-352586