Index: projects/runtime-coverage/Makefile.inc1 =================================================================== --- projects/runtime-coverage/Makefile.inc1 (revision 324497) +++ projects/runtime-coverage/Makefile.inc1 (revision 324498) @@ -1,2907 +1,2907 @@ # # $FreeBSD$ # # Make command line options: # -DNO_CLEANDIR run ${MAKE} clean, instead of ${MAKE} cleandir # -DNO_CLEAN do not clean at all # -DDB_FROM_SRC use the user/group databases in src/etc instead of # the system database when installing. # -DNO_SHARE do not go into share subdir # -DKERNFAST define NO_KERNEL{CONFIG,CLEAN,OBJ} # -DNO_KERNELCONFIG do not run config in ${MAKE} buildkernel # -DNO_KERNELCLEAN do not run ${MAKE} clean in ${MAKE} buildkernel # -DNO_KERNELOBJ do not run ${MAKE} obj in ${MAKE} buildkernel # -DNO_PORTSUPDATE do not update ports in ${MAKE} update # -DNO_ROOT install without using root privilege # -DNO_DOCUPDATE do not update doc in ${MAKE} update # -DWITHOUT_CTF do not run the DTrace CTF conversion tools on built objects # LOCAL_DIRS="list of dirs" to add additional dirs to the SUBDIR list # LOCAL_ITOOLS="list of tools" to add additional tools to the ITOOLS list # LOCAL_LIB_DIRS="list of dirs" to add additional dirs to libraries target # LOCAL_MTREE="list of mtree files" to process to allow local directories # to be created before files are installed # LOCAL_TOOL_DIRS="list of dirs" to add additional dirs to the build-tools # list # LOCAL_XTOOL_DIRS="list of dirs" to add additional dirs to the # cross-tools target # METALOG="path to metadata log" to write permission and ownership # when NO_ROOT is set. 
(default: ${DESTDIR}/METALOG) # TARGET="machine" to crossbuild world for a different machine type # TARGET_ARCH= may be required when a TARGET supports multiple endians # BUILDENV_SHELL= shell to launch for the buildenv target (def:${SHELL}) # WORLD_FLAGS= additional flags to pass to make(1) during buildworld # KERNEL_FLAGS= additional flags to pass to make(1) during buildkernel # SUBDIR_OVERRIDE="list of dirs" to build rather than everything. # All libraries and includes, and some build tools will still build. # # The intended user-driven targets are: # buildworld - rebuild *everything*, including glue to help do upgrades # installworld- install everything built by "buildworld" # checkworld - run test suite on installed world # doxygen - build API documentation of the kernel # update - convenient way to update your source tree (eg: svn/svnup) # # Standard targets (not defined here) are documented in the makefiles in # /usr/share/mk. These include: # obj depend all install clean cleandepend cleanobj .if !defined(TARGET) || !defined(TARGET_ARCH) .error "Both TARGET and TARGET_ARCH must be defined." .endif SRCDIR?= ${.CURDIR} LOCALBASE?= /usr/local # Cross toolchain changes must be in effect before bsd.compiler.mk # so that gets the right CC, and pass CROSS_TOOLCHAIN to submakes. .if defined(CROSS_TOOLCHAIN) .include "${LOCALBASE}/share/toolchains/${CROSS_TOOLCHAIN}.mk" CROSSENV+=CROSS_TOOLCHAIN="${CROSS_TOOLCHAIN}" .endif .if defined(CROSS_TOOLCHAIN_PREFIX) CROSS_COMPILER_PREFIX?=${CROSS_TOOLCHAIN_PREFIX} .endif XCOMPILERS= CC CXX CPP .for COMPILER in ${XCOMPILERS} .if defined(CROSS_COMPILER_PREFIX) X${COMPILER}?= ${CROSS_COMPILER_PREFIX}${${COMPILER}} .else X${COMPILER}?= ${${COMPILER}} .endif .endfor # If a full path to an external cross compiler is given, don't build # a cross compiler. 
.if ${XCC:N${CCACHE_BIN}:M/*} MK_CLANG_BOOTSTRAP= no MK_GCC_BOOTSTRAP= no .endif MAKEOBJDIRPREFIX?= /usr/obj .if ${MACHINE} == ${TARGET} && ${MACHINE_ARCH} == ${TARGET_ARCH} && !defined(CROSS_BUILD_TESTING) OBJTREE= ${MAKEOBJDIRPREFIX} .else OBJTREE= ${MAKEOBJDIRPREFIX}/${TARGET}.${TARGET_ARCH} .endif # Pull in compiler metadata from buildworld/toolchain if possible to avoid # running CC from bsd.compiler.mk. .if make(installworld) || make(install) || make(distributeworld) || \ make(stageworld) .-include "${OBJTREE}${.CURDIR}/compiler-metadata.mk" .endif # Pull in COMPILER_TYPE and COMPILER_FREEBSD_VERSION early. .include <bsd.compiler.mk> .include "share/mk/src.opts.mk" # Check if there is a local compiler that can satisfy as an external compiler. # Which compiler is expected to be used? .if ${MK_CLANG_BOOTSTRAP} == "yes" WANT_COMPILER_TYPE= clang .elif ${MK_GCC_BOOTSTRAP} == "yes" WANT_COMPILER_TYPE= gcc .else WANT_COMPILER_TYPE= .endif .if !defined(WANT_COMPILER_FREEBSD_VERSION) .if ${WANT_COMPILER_TYPE} == "clang" WANT_COMPILER_FREEBSD_VERSION_FILE= lib/clang/freebsd_cc_version.h WANT_COMPILER_FREEBSD_VERSION!= \ awk '$$2 == "FREEBSD_CC_VERSION" {printf("%d\n", $$3)}' \ ${SRCDIR}/${WANT_COMPILER_FREEBSD_VERSION_FILE} || echo unknown WANT_COMPILER_VERSION_FILE= lib/clang/include/clang/Basic/Version.inc WANT_COMPILER_VERSION!= \ awk '$$2 == "CLANG_VERSION" {split($$3, a, "."); print a[1] * 10000 + a[2] * 100 + a[3]}' \ ${SRCDIR}/${WANT_COMPILER_VERSION_FILE} || echo unknown .elif ${WANT_COMPILER_TYPE} == "gcc" WANT_COMPILER_FREEBSD_VERSION_FILE= gnu/usr.bin/cc/cc_tools/freebsd-native.h WANT_COMPILER_FREEBSD_VERSION!= \ awk '$$2 == "FBSD_CC_VER" {printf("%d\n", $$3)}' \ ${SRCDIR}/${WANT_COMPILER_FREEBSD_VERSION_FILE} || echo unknown WANT_COMPILER_VERSION_FILE= contrib/gcc/BASE-VER WANT_COMPILER_VERSION!= \ awk -F.
'{print $$1 * 10000 + $$2 * 100 + $$3}' \ ${SRCDIR}/${WANT_COMPILER_VERSION_FILE} || echo unknown .endif .export WANT_COMPILER_FREEBSD_VERSION WANT_COMPILER_VERSION .endif # !defined(WANT_COMPILER_FREEBSD_VERSION) # It needs to be the same revision as we would build for the bootstrap. # If the expected vs CC is different then we can't skip. # GCC cannot be used for cross-arch yet. For clang we pass -target later if # TARGET_ARCH!=MACHINE_ARCH. .if ${MK_SYSTEM_COMPILER} == "yes" && \ (${MK_CLANG_BOOTSTRAP} == "yes" || ${MK_GCC_BOOTSTRAP} == "yes") && \ !make(showconfig) && !make(native-xtools) && !make(xdev*) && \ ${WANT_COMPILER_TYPE} == ${COMPILER_TYPE} && \ (${COMPILER_TYPE} == "clang" || ${TARGET_ARCH} == ${MACHINE_ARCH}) && \ ${COMPILER_VERSION} == ${WANT_COMPILER_VERSION} && \ ${COMPILER_FREEBSD_VERSION} == ${WANT_COMPILER_FREEBSD_VERSION} # Everything matches, disable the bootstrap compiler. MK_CLANG_BOOTSTRAP= no MK_GCC_BOOTSTRAP= no USING_SYSTEM_COMPILER= yes .endif # ${WANT_COMPILER_TYPE} == ${COMPILER_TYPE} USING_SYSTEM_COMPILER?= no TEST_SYSTEM_COMPILER_VARS= \ USING_SYSTEM_COMPILER MK_SYSTEM_COMPILER \ MK_CROSS_COMPILER MK_CLANG_BOOTSTRAP MK_GCC_BOOTSTRAP \ WANT_COMPILER_TYPE WANT_COMPILER_VERSION WANT_COMPILER_VERSION_FILE \ WANT_COMPILER_FREEBSD_VERSION WANT_COMPILER_FREEBSD_VERSION_FILE \ CC COMPILER_TYPE COMPILER_FEATURES COMPILER_VERSION \ COMPILER_FREEBSD_VERSION \ LINKER_TYPE LINKER_VERSION test-system-compiler: .PHONY .for v in ${TEST_SYSTEM_COMPILER_VARS} ${_+_}@printf "%-35s= %s\n" "${v}" "${${v}}" .endfor .if ${USING_SYSTEM_COMPILER} == "yes" && \ (make(buildworld) || make(buildkernel) || make(kernel-toolchain) || \ make(toolchain) || make(_cross-tools)) .info SYSTEM_COMPILER: Determined that CC=${CC} matches the source tree. Not bootstrapping a cross-compiler. .endif # For installworld need to ensure that the looked-up compiler metadata is # passed along rather than trying to run cc from the restricted # STRICTTMPPATH. 
.if ${MK_CLANG_BOOTSTRAP} == "no" && ${MK_GCC_BOOTSTRAP} == "no" .if !defined(X_COMPILER_TYPE) CROSSENV+= COMPILER_VERSION=${COMPILER_VERSION} \ COMPILER_TYPE=${COMPILER_TYPE} \ COMPILER_FEATURES=${COMPILER_FEATURES} \ COMPILER_FREEBSD_VERSION=${COMPILER_FREEBSD_VERSION} .else CROSSENV+= COMPILER_VERSION=${X_COMPILER_VERSION} \ COMPILER_FEATURES=${X_COMPILER_FEATURES} \ COMPILER_TYPE=${X_COMPILER_TYPE} \ COMPILER_FREEBSD_VERSION=${X_COMPILER_FREEBSD_VERSION} .endif .endif # Store some compiler metadata for use in installworld where we don't # want to invoke CC at all. _COMPILER_METADATA_VARS= COMPILER_VERSION \ COMPILER_TYPE \ COMPILER_FEATURES \ COMPILER_FREEBSD_VERSION \ LINKER_VERSION \ LINKER_TYPE compiler-metadata.mk: .PHONY .META @: > ${.TARGET} @echo ".info Using cached compiler metadata from build at $$(hostname) on $$(date)" \ > ${.TARGET} .for v in ${_COMPILER_METADATA_VARS} @echo "${v}=${${v}}" >> ${.TARGET} .endfor @echo ".export ${_COMPILER_METADATA_VARS}" >> ${.TARGET} # Handle external binutils. .if defined(CROSS_TOOLCHAIN_PREFIX) CROSS_BINUTILS_PREFIX?=${CROSS_TOOLCHAIN_PREFIX} .endif # If we do not have a bootstrap binutils (because the in-tree one does not # support the target architecture), provide a default cross-binutils prefix. # This allows riscv64 builds, for example, to automatically use the # riscv64-binutils port or package. .if !make(showconfig) .if !empty(BROKEN_OPTIONS:MBINUTILS_BOOTSTRAP) && \ ${MK_LLD_BOOTSTRAP} == "no" && \ !defined(CROSS_BINUTILS_PREFIX) CROSS_BINUTILS_PREFIX=/usr/local/${TARGET_ARCH}-freebsd/bin/ .if !exists(${CROSS_BINUTILS_PREFIX}) .error In-tree binutils does not support the ${TARGET_ARCH} architecture. Install the ${TARGET_ARCH}-binutils port or package or set CROSS_BINUTILS_PREFIX. 
.endif .endif .endif XBINUTILS= AS AR LD NM OBJCOPY RANLIB SIZE STRINGS .for BINUTIL in ${XBINUTILS} .if defined(CROSS_BINUTILS_PREFIX) && \ exists(${CROSS_BINUTILS_PREFIX}${${BINUTIL}}) X${BINUTIL}?= ${CROSS_BINUTILS_PREFIX}${${BINUTIL}} .else X${BINUTIL}?= ${${BINUTIL}} .endif .endfor # We must do lib/ and libexec/ before bin/ in case of a mid-install error to # keep the user's system reasonably usable. For static->dynamic root upgrades, # we don't want to install a dynamic binary without rtld and the needed # libraries. More commonly, for dynamic root, we don't want to install a # binary that requires a newer library version that hasn't been installed yet. # This ordering is not a guarantee though. The only guarantee of a working # system here would require fine-grained ordering of all components based # on their dependencies. .if !empty(SUBDIR_OVERRIDE) SUBDIR= ${SUBDIR_OVERRIDE} .else SUBDIR= lib libexec .if !defined(NO_ROOT) && (make(installworld) || make(install)) # Ensure libraries are installed before progressing. SUBDIR+=.WAIT .endif SUBDIR+=bin .if ${MK_CDDL} != "no" SUBDIR+=cddl .endif SUBDIR+=gnu include .if ${MK_KERBEROS} != "no" SUBDIR+=kerberos5 .endif .if ${MK_RESCUE} != "no" SUBDIR+=rescue .endif SUBDIR+=sbin .if ${MK_CRYPT} != "no" SUBDIR+=secure .endif .if !defined(NO_SHARE) SUBDIR+=share .endif SUBDIR+=sys usr.bin usr.sbin .if ${MK_TESTS} != "no" SUBDIR+= tests .endif .if ${MK_OFED} != "no" SUBDIR+=contrib/ofed .endif # Local directories are last, since it is nice to at least get the base # system rebuilt before you do them. .for _DIR in ${LOCAL_DIRS} .if exists(${.CURDIR}/${_DIR}/Makefile) SUBDIR+= ${_DIR} .endif .endfor # Add LOCAL_LIB_DIRS, but only if they will not be picked up as a SUBDIR # of a LOCAL_DIRS directory. This allows LOCAL_DIRS=foo and # LOCAL_LIB_DIRS=foo/lib to behave as expected.
.for _DIR in ${LOCAL_DIRS:M*/} ${LOCAL_DIRS:N*/:S|$|/|} _REDUNDANT_LIB_DIRS+= ${LOCAL_LIB_DIRS:M${_DIR}*} .endfor .for _DIR in ${LOCAL_LIB_DIRS} .if empty(_REDUNDANT_LIB_DIRS:M${_DIR}) && exists(${.CURDIR}/${_DIR}/Makefile) SUBDIR+= ${_DIR} .endif .endfor # We must do etc/ last as it hooks into building the man whatis file # by calling 'makedb' in share/man. This is only relevant for # install/distribute so they build the whatis file after every manpage is # installed. .if make(installworld) || make(install) SUBDIR+=.WAIT .endif SUBDIR+=etc .endif # !empty(SUBDIR_OVERRIDE) .if defined(NOCLEAN) .warning NOCLEAN option is deprecated. Use NO_CLEAN instead. NO_CLEAN= ${NOCLEAN} .endif .if defined(NO_CLEANDIR) CLEANDIR= clean cleandepend .else CLEANDIR= cleandir .endif .if defined(WORLDFAST) NO_CLEAN= t NO_OBJ= t .endif .if ${MK_META_MODE} == "yes" # If filemon is used then we can rely on the build being incremental-safe. # The .meta files will also track the build command and rebuild should # it change. .if empty(.MAKE.MODE:Mnofilemon) NO_CLEAN= t .endif .endif .if defined(NO_OBJ) || ${MK_AUTO_OBJ} == "yes" NO_OBJ= t NO_KERNELOBJ= t .endif .if !defined(NO_OBJ) _obj= obj .endif LOCAL_TOOL_DIRS?= PACKAGEDIR?= ${DESTDIR}/${DISTDIR} .if empty(SHELL:M*csh*) BUILDENV_SHELL?=${SHELL} .else BUILDENV_SHELL?=/bin/sh .endif .if !defined(SVN) || empty(SVN) . for _P in /usr/bin /usr/local/bin . for _S in svn svnlite . if exists(${_P}/${_S}) SVN= ${_P}/${_S} . endif . endfor . endfor .endif SVNFLAGS?= -r HEAD .if !defined(VCS_REVISION) || empty(VCS_REVISION) _VCS_REVISION?= $$(eval ${SVNVERSION_CMD} ${SRCDIR}) . if !empty(_VCS_REVISION) VCS_REVISION= $$(echo r${_VCS_REVISION}) . endif .endif .if !defined(OSRELDATE) .if exists(/usr/include/osreldate.h) OSRELDATE!= awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ /usr/include/osreldate.h .else OSRELDATE= 0 .endif .export OSRELDATE .endif # Set VERSION for CTFMERGE to use via the default CTFFLAGS=-L VERSION.
.if !defined(_REVISION) _REVISION!= MK_AUTO_OBJ=no ${MAKE} -C ${SRCDIR}/release -V REVISION .export _REVISION .endif .if !defined(_BRANCH) _BRANCH!= MK_AUTO_OBJ=no ${MAKE} -C ${SRCDIR}/release -V BRANCH .export _BRANCH .endif .if !defined(SRCRELDATE) SRCRELDATE!= awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ ${SRCDIR}/sys/sys/param.h .export SRCRELDATE .endif .if !defined(VERSION) VERSION= FreeBSD ${_REVISION}-${_BRANCH:C/-p[0-9]+$//} ${TARGET_ARCH} ${SRCRELDATE} .export VERSION .endif .if !defined(PKG_VERSION) .if ${_BRANCH:MSTABLE*} || ${_BRANCH:MCURRENT*} || ${_BRANCH:MALPHA*} TIMENOW= %Y%m%d%H%M%S EXTRA_REVISION= .s${TIMENOW:gmtime} .endif .if ${_BRANCH:M*-p*} EXTRA_REVISION= _${_BRANCH:C/.*-p([0-9]+$)/\1/} .endif PKG_VERSION= ${_REVISION}${EXTRA_REVISION} .endif KNOWN_ARCHES?= aarch64/arm64 \ amd64 \ arm \ armeb/arm \ armv6/arm \ armv7/arm \ i386 \ mips \ mipsel/mips \ mips64el/mips \ mipsn32el/mips \ mips64/mips \ mipsn32/mips \ mipshf/mips \ mipselhf/mips \ mips64elhf/mips \ mips64hf/mips \ powerpc \ powerpc64/powerpc \ powerpcspe/powerpc \ riscv64/riscv \ riscv64sf/riscv \ sparc64 .if ${TARGET} == ${TARGET_ARCH} _t= ${TARGET} .else _t= ${TARGET_ARCH}/${TARGET} .endif .for _t in ${_t} .if empty(KNOWN_ARCHES:M${_t}) .error Unknown target ${TARGET_ARCH}:${TARGET}. .endif .endfor .if ${TARGET} == ${MACHINE} TARGET_CPUTYPE?=${CPUTYPE} .else TARGET_CPUTYPE?= .endif .if !empty(TARGET_CPUTYPE) _TARGET_CPUTYPE=${TARGET_CPUTYPE} .else _TARGET_CPUTYPE=dummy .endif _CPUTYPE!= MK_AUTO_OBJ=no MAKEFLAGS= CPUTYPE=${_TARGET_CPUTYPE} ${MAKE} \ -f /dev/null -m ${.CURDIR}/share/mk -V CPUTYPE .if ${_CPUTYPE} != ${_TARGET_CPUTYPE} .error CPUTYPE global should be set with ?=. .endif .if make(buildworld) BUILD_ARCH!= uname -p .if ${MACHINE_ARCH} != ${BUILD_ARCH} .error To cross-build, set TARGET_ARCH. 
.endif .endif WORLDTMP= ${OBJTREE}${.CURDIR}/tmp BPATH= ${CCACHE_WRAPPER_PATH_PFX}${WORLDTMP}/legacy/usr/sbin:${WORLDTMP}/legacy/usr/bin:${WORLDTMP}/legacy/bin XPATH= ${WORLDTMP}/usr/sbin:${WORLDTMP}/usr/bin STRICTTMPPATH= ${BPATH}:${XPATH} TMPPATH= ${STRICTTMPPATH}:${PATH} # # Avoid running mktemp(1) unless actually needed. # It may not be functional, e.g., due to new ABI # when in the middle of installing over this system. # .if make(distributeworld) || make(installworld) || make(stageworld) INSTALLTMP!= /usr/bin/mktemp -d -u -t install .endif .if make(stagekernel) || make(distributekernel) TAGS+= kernel PACKAGE= kernel .endif # # Building a world goes through the following stages # # 1. legacy stage [BMAKE] # This stage is responsible for creating compatibility # shims that are needed by the bootstrap-tools, # build-tools and cross-tools stages. These are generally # APIs that tools from one of those three stages need to # build that aren't present on the host. # 1. bootstrap-tools stage [BMAKE] # This stage is responsible for creating programs that # are needed for backward compatibility reasons. They # are not built as cross-tools. # 2. build-tools stage [TMAKE] # This stage is responsible for creating the object # tree and building any tools that are needed during # the build process. Some programs are listed during # this phase because they build binaries to generate # files needed to build these programs. This stage also # builds the 'build-tools' target rather than 'all'. # 3. cross-tools stage [XMAKE] # This stage is responsible for creating any tools that # are needed for building the system. A cross-compiler is one # of them. This differs from build tools in two ways: # 1. the 'all' target is built rather than 'build-tools' # 2. these tools are installed into TMPPATH for stage 4. # 4. world stage [WMAKE] # This stage actually builds the world. # 5. install stage (optional) [IMAKE] # This stage installs a previously built world. 
# BOOTSTRAPPING?= 0 # Keep these in sync -- see below for special case exception MINIMUM_SUPPORTED_OSREL?= 900044 MINIMUM_SUPPORTED_REL?= 9.1 # Common environment for world related stages CROSSENV+= MAKEOBJDIRPREFIX=${OBJTREE} \ MACHINE_ARCH=${TARGET_ARCH} \ MACHINE=${TARGET} \ CPUTYPE=${TARGET_CPUTYPE} .if ${MK_META_MODE} != "no" # Don't rebuild build-tools targets during normal build. CROSSENV+= BUILD_TOOLS_META=.NOMETA .endif .if defined(TARGET_CFLAGS) CROSSENV+= ${TARGET_CFLAGS} .endif # bootstrap-tools stage BMAKEENV= INSTALL="sh ${.CURDIR}/tools/install.sh" \ TOOLS_PREFIX=${WORLDTMP} \ PATH=${BPATH}:${PATH} \ WORLDTMP=${WORLDTMP} \ MAKEFLAGS="-m ${.CURDIR}/tools/build/mk ${.MAKEFLAGS}" # need to keep this in sync with targets/pseudo/bootstrap-tools/Makefile BSARGS= DESTDIR= \ BOOTSTRAPPING=${OSRELDATE} \ BWPHASE=${.TARGET:C,^_,,} \ SSP_CFLAGS= \ MK_COVERAGE=no MK_HTML=no NO_LINT=yes MK_MAN=no \ -DNO_PIC MK_PROFILE=no -DNO_SHARED \ -DNO_CPU_CFLAGS MK_WARNS=no MK_CTF=no \ MK_CLANG_EXTRAS=no MK_CLANG_FULL=no \ MK_LLDB=no MK_TESTS=no \ MK_INCLUDES=yes BMAKE= MAKEOBJDIRPREFIX=${WORLDTMP} \ ${BMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ ${BSARGS} # build-tools stage TMAKE= MAKEOBJDIRPREFIX=${OBJTREE} \ ${BMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ DESTDIR= \ BOOTSTRAPPING=${OSRELDATE} \ BWPHASE=${.TARGET:C,^_,,} \ SSP_CFLAGS= \ -DNO_LINT \ -DNO_CPU_CFLAGS MK_WARNS=no MK_CTF=no \ MK_CLANG_EXTRAS=no MK_CLANG_FULL=no \ MK_COVERAGE=no \ MK_LLDB=no MK_TESTS=no # cross-tools stage XMAKE= TOOLS_PREFIX=${WORLDTMP} ${BMAKE} \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} MK_GDB=no \ MK_LLD_IS_LD=${MK_LLD_BOOTSTRAP} MK_TESTS=no # kernel-tools stage KTMAKEENV= INSTALL="sh ${.CURDIR}/tools/install.sh" \ PATH=${BPATH}:${PATH} \ WORLDTMP=${WORLDTMP} KTMAKE= TOOLS_PREFIX=${WORLDTMP} MAKEOBJDIRPREFIX=${WORLDTMP} \ ${KTMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ DESTDIR= \ BOOTSTRAPPING=${OSRELDATE} \ SSP_CFLAGS= \ 
MK_COVERAGE=no MK_HTML=no -DNO_LINT MK_MAN=no \ -DNO_PIC MK_PROFILE=no -DNO_SHARED \ -DNO_CPU_CFLAGS MK_WARNS=no MK_CTF=no # world stage WMAKEENV= ${CROSSENV} \ INSTALL="sh ${.CURDIR}/tools/install.sh" \ PATH=${TMPPATH} \ SYSROOT=${WORLDTMP} # make hierarchy HMAKE= PATH=${TMPPATH} ${MAKE} LOCAL_MTREE=${LOCAL_MTREE:Q} .if defined(NO_ROOT) HMAKE+= PATH=${TMPPATH} METALOG=${METALOG} -DNO_ROOT .endif CROSSENV+= CC="${XCC} ${XCFLAGS}" CXX="${XCXX} ${XCXXFLAGS} ${XCFLAGS}" \ CPP="${XCPP} ${XCFLAGS}" \ AS="${XAS}" AR="${XAR}" LD="${XLD}" LLVM_LINK="${XLLVM_LINK}" \ NM=${XNM} OBJCOPY="${XOBJCOPY}" \ RANLIB=${XRANLIB} STRINGS=${XSTRINGS} \ SIZE="${XSIZE}" .if defined(CROSS_BINUTILS_PREFIX) && exists(${CROSS_BINUTILS_PREFIX}) # In the case of xdev-build tools, CROSS_BINUTILS_PREFIX won't be a # directory, but the compiler will look in the right place for its # tools so we don't need to tell it where to look. BFLAGS+= -B${CROSS_BINUTILS_PREFIX} .endif # The internal bootstrap compiler has a default sysroot set by TOOLS_PREFIX # and target set by TARGET/TARGET_ARCH. However, there are several needs to # always pass an explicit --sysroot and -target. # - External compiler needs sysroot and target flags. # - External ld needs sysroot. # - To be clear about the use of a sysroot when using the internal compiler. # - Easier debugging. # - Allowing WITH_SYSTEM_COMPILER+WITH_META_MODE to work together due to # the flip-flopping build command when sometimes using external and # sometimes using internal. # - Allow using lld which has no support for default paths. .if !defined(CROSS_BINUTILS_PREFIX) || !exists(${CROSS_BINUTILS_PREFIX}) BFLAGS+= -B${WORLDTMP}/usr/bin .endif .if ${TARGET} == "arm" .if ${TARGET_ARCH:Marmv[67]*} != "" && ${TARGET_CPUTYPE:M*soft*} == "" TARGET_ABI= gnueabihf .else TARGET_ABI= gnueabi .endif .endif .if ${WANT_COMPILER_TYPE} == gcc || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == gcc) # GCC requires -isystem and -L when using a cross-compiler. 
--sysroot # won't set header path and -L is used to ensure the base library path # is added before the port PREFIX library path. XCFLAGS+= -isystem ${WORLDTMP}/usr/include -L${WORLDTMP}/usr/lib # GCC requires -B to find /usr/lib/crti.o when using a cross-compiler # combined with --sysroot. XCFLAGS+= -B${WORLDTMP}/usr/lib # Force using libc++ for external GCC. .if defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == gcc && ${X_COMPILER_VERSION} >= 40800 XCXXFLAGS+= -isystem ${WORLDTMP}/usr/include/c++/v1 -std=c++11 \ -nostdinc++ .endif .elif ${WANT_COMPILER_TYPE} == clang || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == clang) TARGET_ABI?= unknown TARGET_TRIPLE?= ${TARGET_ARCH:C/amd64/x86_64/}-${TARGET_ABI}-freebsd12.0 XCFLAGS+= -target ${TARGET_TRIPLE} .endif XCFLAGS+= --sysroot=${WORLDTMP} .if !empty(BFLAGS) XCFLAGS+= ${BFLAGS} .endif .if ${MK_LIB32} != "no" && (${TARGET_ARCH} == "amd64" || \ ${TARGET_ARCH} == "powerpc64" || ${TARGET_ARCH:Mmips64*} != "") LIBCOMPAT= 32 .include "Makefile.libcompat" .elif ${MK_LIBSOFT} != "no" && ${TARGET_ARCH:Marmv[67]*} != "" LIBCOMPAT= SOFT .include "Makefile.libcompat" .endif # META_MODE normally ignores host file changes since every build updates # timestamps (see NO_META_IGNORE_HOST in sys.mk). There are known times # when the ABI breaks though that we want to force rebuilding WORLDTMP # to get updated host tools. .if ${MK_META_MODE} == "yes" && defined(NO_CLEAN) && \ !defined(NO_META_IGNORE_HOST) && !defined(NO_META_IGNORE_HOST_HEADERS) && \ !make(showconfig) # r318736 - ino64 major ABI breakage META_MODE_BAD_ABI_VERS+= 1200031 .if !defined(OBJDIR_HOST_OSRELDATE) .if exists(${OBJTREE}${.CURDIR}/host-osreldate.h) OBJDIR_HOST_OSRELDATE!= \ awk '/^\#define[[:space:]]*__FreeBSD_version/ { print $$3 }' \ ${OBJTREE}${.CURDIR}/host-osreldate.h .else OBJDIR_HOST_OSRELDATE= 0 .endif .export OBJDIR_HOST_OSRELDATE .endif # Note that this logic is the opposite of normal BOOTSTRAP handling. We want # to compare the WORLDTMP's OSRELDATE to the host's OSRELDATE.
If the WORLDTMP # is older than the ABI-breakage OSRELDATE of the HOST then we rebuild. .for _ver in ${META_MODE_BAD_ABI_VERS} .if ${OSRELDATE} >= ${_ver} && ${OBJDIR_HOST_OSRELDATE} < ${_ver} _meta_mode_need_rebuild= ${_ver} .endif .endfor .if defined(_meta_mode_need_rebuild) .info META_MODE: Rebuilding host tools due to ABI breakage in __FreeBSD_version ${_meta_mode_need_rebuild}. NO_META_IGNORE_HOST_HEADERS= 1 .export NO_META_IGNORE_HOST_HEADERS .endif .endif # This is only used for META_MODE+filemon to track what the oldest # __FreeBSD_version is in WORLDTMP. This purposely does NOT have # a make dependency on /usr/include/osreldate.h as the file should # only be copied when it is missing or meta mode determines it has changed. # Since host files are normally ignored without NO_META_IGNORE_HOST # the file will never be updated unless that flag is specified. This # allows tracking the oldest osreldate to force rebuilds via # META_MODE_BAD_ABI_VERS above. host-osreldate.h: # DO NOT ADD /usr/include/osreldate.h here @cp -f /usr/include/osreldate.h ${.TARGET} WMAKE= ${WMAKEENV} ${MAKE} ${WORLD_FLAGS} -f Makefile.inc1 \ BWPHASE=${.TARGET:C,^_,,} \ DESTDIR=${WORLDTMP} IMAKEENV= ${CROSSENV} IMAKE= ${IMAKEENV} ${MAKE} -f Makefile.inc1 \ ${IMAKE_INSTALL} ${IMAKE_MTREE} .if empty(.MAKEFLAGS:M-n) IMAKEENV+= PATH=${STRICTTMPPATH}:${INSTALLTMP} \ LD_LIBRARY_PATH=${INSTALLTMP} \ PATH_LOCALE=${INSTALLTMP}/locale IMAKE+= __MAKE_SHELL=${INSTALLTMP}/sh .else IMAKEENV+= PATH=${TMPPATH}:${INSTALLTMP} .endif .if defined(DB_FROM_SRC) INSTALLFLAGS+= -N ${.CURDIR}/etc MTREEFLAGS+= -N ${.CURDIR}/etc .endif _INSTALL_DDIR= ${DESTDIR}/${DISTDIR} INSTALL_DDIR= ${_INSTALL_DDIR:S://:/:g:C:/$::} .if defined(NO_ROOT) METALOG?= ${DESTDIR}/${DISTDIR}/METALOG METALOG:= ${METALOG:C,//+,/,g} IMAKE+= -DNO_ROOT METALOG=${METALOG} INSTALLFLAGS+= -U -M ${METALOG} -D ${INSTALL_DDIR} MTREEFLAGS+= -W .endif .if defined(BUILD_PKGS) INSTALLFLAGS+= -h sha256 .endif .if defined(DB_FROM_SRC) || defined(NO_ROOT)
IMAKE_INSTALL= INSTALL="install ${INSTALLFLAGS}" IMAKE_MTREE= MTREE_CMD="mtree ${MTREEFLAGS}" .endif # kernel stage KMAKEENV= ${WMAKEENV:NSYSROOT=*} KMAKE= ${KMAKEENV} ${MAKE} ${.MAKEFLAGS} ${KERNEL_FLAGS} KERNEL=${INSTKERNNAME} # # buildworld # # Attempt to rebuild the entire system, with reasonable chance of # success, regardless of how old your existing system is. # _worldtmp: .PHONY .if ${.CURDIR:C/[^,]//g} != "" # The m4 build of sendmail files doesn't like it if ',' is used # anywhere in the path of its files. @echo @echo "*** Error: path to source tree contains a comma ','" @echo false .endif @echo @echo "--------------------------------------------------------------" @echo ">>> Rebuilding the temporary build tree" @echo "--------------------------------------------------------------" .if !defined(NO_CLEAN) rm -rf ${WORLDTMP} .if defined(LIBCOMPAT) rm -rf ${LIBCOMPATTMP} .endif .else .if exists(${WORLDTMP}) @echo ">>> Deleting stale files in build tree..." ${_+_}cd ${.CURDIR}; ${WMAKE} -DBATCH_DELETE_OLD_FILES \ delete-old delete-old-libs >/dev/null .endif .if defined(LIBCOMPAT) && exists(${LIBCOMPATTMP}) ${_+_}cd ${.CURDIR}; ${WMAKE} -DBATCH_DELETE_OLD_FILES \ DESTDIR=${LIBCOMPATTMP} \ delete-old delete-old-libs >/dev/null .endif rm -rf ${WORLDTMP}/legacy/usr/include .if ${USING_SYSTEM_COMPILER} == "yes" .for cc in cc c++ if [ -x ${WORLDTMP}/usr/bin/${cc} ]; then \ inum=$$(stat -f %i ${WORLDTMP}/usr/bin/${cc}); \ find ${WORLDTMP}/usr/bin -inum $${inum} -delete; \ fi .endfor .endif # ${USING_SYSTEM_COMPILER} == "yes" # Our current approach to dependency tracking cannot cope with certain source # tree changes, particularly with respect to removing source files and # replacing generated files. Handle these cases here in an ad-hoc fashion.
# # Syscall stubs rewritten in C # Date SVN Rev Syscalls # 20160829 r305012 ptrace # 20170624 r320278 fstat fstatat fstatfs getdirentries getfsstat statfs .for f in fstat fstatat fstatfs getdirentries getfsstat ptrace statfs .if exists(${OBJTREE}${.CURDIR}/lib/libc/.depend.${f}.o) @if egrep -qw '${f}\.[sS]' \ ${OBJTREE}${.CURDIR}/lib/libc/.depend.${f}.o; then \ echo Removing stale dependencies for ${f} syscall wrappers; \ rm -f ${OBJTREE}${.CURDIR}/lib/libc/.depend.${f}.* \ ${OBJTREE}${.CURDIR}/world32/${.CURDIR}/lib/libc/.depend.${f}.*; \ fi .endif .endfor # 20170607 remove stale dependencies for utimens* wrappers removed in r319663 .for f in futimens utimensat .if exists(${OBJTREE}${.CURDIR}/lib/libc/.depend.${f}.o) @if egrep -q '/${f}.c' \ ${OBJTREE}${.CURDIR}/lib/libc/.depend.${f}.o; then \ echo Removing stale dependencies for ${f} syscall wrappers; \ rm -f ${OBJTREE}${.CURDIR}/lib/libc/.depend.${f}.* \ ${OBJTREE}${.CURDIR}/world32/${.CURDIR}/lib/libc/.depend.${f}.*; \ fi .endif .endfor # 20170523 remove stale generated asm files for functions which are no longer # syscalls after r302092 (pipe) and r318736 (others) .for f in getdents lstat mknod pipe stat .if exists(${OBJTREE}${.CURDIR}/lib/libc/${f}.s) || \ exists(${OBJTREE}${.CURDIR}/lib/libc/${f}.S) @echo Removing stale generated ${f} syscall files @rm -f ${OBJTREE}${.CURDIR}/lib/libc/${f}.* \ ${OBJTREE}${.CURDIR}/lib/libc/.depend.${f}.* \ ${OBJTREE}${.CURDIR}/world32/${.CURDIR}/lib/libc/${f}.* \ ${OBJTREE}${.CURDIR}/world32/${.CURDIR}/lib/libc/.depend.${f}.* .endif .endfor .endif # !defined(NO_CLEAN) .for _dir in \ lib lib/casper usr legacy/bin legacy/usr mkdir -p ${WORLDTMP}/${_dir} .endfor mtree -deU -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${WORLDTMP}/legacy/usr >/dev/null mtree -deU -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${WORLDTMP}/legacy/usr/include >/dev/null mtree -deU -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${WORLDTMP}/usr >/dev/null mtree -deU -f ${.CURDIR}/etc/mtree/BSD.include.dist \ 
-p ${WORLDTMP}/usr/include >/dev/null ln -sf ${.CURDIR}/sys ${WORLDTMP} .if ${MK_DEBUG_FILES} != "no" # We could instead disable debug files for these build stages mtree -deU -f ${.CURDIR}/etc/mtree/BSD.debug.dist \ -p ${WORLDTMP}/legacy/usr/lib >/dev/null mtree -deU -f ${.CURDIR}/etc/mtree/BSD.debug.dist \ -p ${WORLDTMP}/usr/lib >/dev/null .endif .if defined(LIBCOMPAT) mtree -deU -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${WORLDTMP}/usr >/dev/null .if ${MK_DEBUG_FILES} != "no" mtree -deU -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${WORLDTMP}/legacy/usr/lib/debug/usr >/dev/null mtree -deU -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${WORLDTMP}/usr/lib/debug/usr >/dev/null .endif .endif .if ${MK_TESTS} != "no" mkdir -p ${WORLDTMP}${TESTSBASE} mtree -deU -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${WORLDTMP}${TESTSBASE} >/dev/null .if ${MK_DEBUG_FILES} != "no" mkdir -p ${WORLDTMP}/usr/lib/debug/${TESTSBASE} mtree -deU -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${WORLDTMP}/usr/lib/debug/${TESTSBASE} >/dev/null .endif .endif .for _mtree in ${LOCAL_MTREE} mtree -deU -f ${.CURDIR}/${_mtree} -p ${WORLDTMP} > /dev/null .endfor _legacy: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1.1: legacy release compatibility shims" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${BMAKE} legacy _bootstrap-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1.2: bootstrap tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${BMAKE} bootstrap-tools _cleanobj: .if !defined(NO_CLEAN) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.1: cleaning up the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${WMAKE} ${CLEANDIR} .if defined(LIBCOMPAT) ${_+_}cd ${.CURDIR}; 
${LIBCOMPATWMAKE} -f Makefile.inc1 ${CLEANDIR} .endif .endif _obj: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.2: rebuilding the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${WMAKE} obj _build-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.3: build tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${TMAKE} build-tools _cross-tools: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3: cross tools" @echo "--------------------------------------------------------------" @rm -f ${OBJTREE}${.CURDIR}/compiler-metadata.mk ${_+_}cd ${.CURDIR}; ${XMAKE} cross-tools ${_+_}cd ${.CURDIR}; ${XMAKE} kernel-tools _build-metadata: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3.1: recording build metadata" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${WMAKE} compiler-metadata.mk ${_+_}cd ${.CURDIR}; ${WMAKE} host-osreldate.h _includes: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 4.1: building includes" @echo "--------------------------------------------------------------" # Special handling for SUBDIR_OVERRIDE in buildworld as they most likely need # headers from default SUBDIR. Do SUBDIR_OVERRIDE includes last. 
${_+_}cd ${.CURDIR}; ${WMAKE} SUBDIR_OVERRIDE= SHARED=symlinks \ MK_INCLUDES=yes includes .if !empty(SUBDIR_OVERRIDE) && make(buildworld) ${_+_}cd ${.CURDIR}; ${WMAKE} MK_INCLUDES=yes SHARED=symlinks includes .endif _libraries: @echo @echo "--------------------------------------------------------------" @echo ">>> stage 4.2: building libraries" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; \ ${WMAKE} -DNO_FSCHG MK_HTML=no -DNO_LINT MK_MAN=no \ MK_PROFILE=no MK_TESTS=no MK_TESTS_SUPPORT=${MK_TESTS} libraries everything: .PHONY @echo @echo "--------------------------------------------------------------" @echo ">>> stage 4.3: building everything" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; _PARALLEL_SUBDIR_OK=1 ${WMAKE} all WMAKE_TGTS= .if !defined(WORLDFAST) WMAKE_TGTS+= _worldtmp _legacy .if empty(SUBDIR_OVERRIDE) WMAKE_TGTS+= _bootstrap-tools .endif WMAKE_TGTS+= _cleanobj .if !defined(NO_OBJ) WMAKE_TGTS+= _obj .endif WMAKE_TGTS+= _build-tools _cross-tools WMAKE_TGTS+= _build-metadata WMAKE_TGTS+= _includes .endif .if !defined(NO_LIBS) WMAKE_TGTS+= _libraries .endif WMAKE_TGTS+= everything .if defined(LIBCOMPAT) && empty(SUBDIR_OVERRIDE) WMAKE_TGTS+= build${libcompat} .endif buildworld: buildworld_prologue ${WMAKE_TGTS} buildworld_epilogue .PHONY .ORDER: buildworld_prologue ${WMAKE_TGTS} buildworld_epilogue buildworld_prologue: .PHONY @echo "--------------------------------------------------------------" @echo ">>> World build started on `LC_ALL=C date`" @echo "--------------------------------------------------------------" buildworld_epilogue: .PHONY @echo @echo "--------------------------------------------------------------" @echo ">>> World build completed on `LC_ALL=C date`" @echo "--------------------------------------------------------------" # # We need to have this as a target because the indirection between Makefile # and Makefile.inc1 causes the correct PATH to be 
# used, rather than a
# modification of the current environment's PATH.  In addition, we need
# to quote multiword values.
#
# buildenvvars: print ${WMAKEENV} (shell-quoted) followed by every variable
# named in ${.MAKE.EXPORTED}, each rendered as name="value".
buildenvvars: .PHONY
	@echo ${WMAKEENV:Q} ${.MAKE.EXPORTED:@v@$v=\"${$v}\"@}

# buildenv starts an interactive shell, so refuse to combine it with -j.
.if ${.TARGETS:Mbuildenv}
.if ${.MAKEFLAGS:M-j}
.error The buildenv target is incompatible with -j
.endif
.endif
BUILDENV_DIR?=	${.CURDIR}
# buildenv: run ${BUILDENV_SHELL} from ${BUILDENV_DIR} with ${WMAKEENV} and
# BUILDENV=1 in its environment.  The trailing "|| true" keeps a non-zero
# exit status of that shell from being reported as a make failure.
buildenv: .PHONY
	@echo Entering world for ${TARGET_ARCH}:${TARGET}
.if ${BUILDENV_SHELL:M*zsh*}
	@echo For ZSH you must run: export CPUTYPE=${TARGET_CPUTYPE}
.endif
	@cd ${BUILDENV_DIR} && env ${WMAKEENV} BUILDENV=1 ${BUILDENV_SHELL} \
	|| true

# toolchain: every buildworld stage except "everything" and the libcompat
# build.  kernel-toolchain additionally drops _includes and _libraries.
TOOLCHAIN_TGTS=	${WMAKE_TGTS:Neverything:Nbuild${libcompat}}
toolchain: ${TOOLCHAIN_TGTS} .PHONY
kernel-toolchain: ${TOOLCHAIN_TGTS:N_includes:N_libraries} .PHONY

#
# installcheck
#
# Checks to be sure system is ready for installworld/installkernel.
#
installcheck: _installcheck_world _installcheck_kernel .PHONY
_installcheck_world: .PHONY
_installcheck_kernel: .PHONY

#
# Require DESTDIR to be set if installing for a different architecture or
# using the user/group database in the source tree.
#
.if ${TARGET_ARCH} != ${MACHINE_ARCH} || ${TARGET} != ${MACHINE} || \
    defined(DB_FROM_SRC)
.if !make(distributeworld)
_installcheck_world: __installcheck_DESTDIR
_installcheck_kernel: __installcheck_DESTDIR
__installcheck_DESTDIR: .PHONY
.if !defined(DESTDIR) || empty(DESTDIR)
	@echo "ERROR: Please set DESTDIR!"; \
	false
.endif
.endif
.endif

.if !defined(DB_FROM_SRC)
#
# Check for missing UIDs/GIDs.
#
CHECK_UIDS=	auditdistd
CHECK_GIDS=	audit
.if ${MK_SENDMAIL} != "no"
CHECK_UIDS+=	smmsp
CHECK_GIDS+=	smmsp
.endif
.if ${MK_PF} != "no"
CHECK_UIDS+=	proxy
CHECK_GIDS+=	proxy authpf
.endif
.if ${MK_UNBOUND} != "no"
CHECK_UIDS+=	unbound
CHECK_GIDS+=	unbound
.endif
_installcheck_world: __installcheck_UGID
# One shell test per required user and group.  The probe command's exit
# status is tested directly; wrapping it in a command substitution would
# make the shell execute the probe's output as a command, which is only
# harmless while that output happens to be empty.
__installcheck_UGID: .PHONY
.for uid in ${CHECK_UIDS}
	@if ! id -u ${uid} >/dev/null 2>&1; then \
		echo "ERROR: Required ${uid} user is missing, see /usr/src/UPDATING."; \
		false; \
	fi
.endfor
# NOTE(review): this relies on find(1) exiting non-zero when the group name
# given to -group cannot be resolved -- confirm against find(1).
.for gid in ${CHECK_GIDS}
	@if ! find / -prune -group ${gid} >/dev/null 2>&1; then \
		echo "ERROR: Required ${gid} group is missing, see /usr/src/UPDATING."; \
		false; \
	fi
.endfor
.endif
#
# If installing over the running system (DESTDIR is / or unset) and the install
# includes rescue, try running rescue from the objdir as a sanity check.  If
# rescue is not functional (e.g., because it depends on a system call not
# supported by the currently running kernel), abort the installation.
#
.if !make(distributeworld) && ${MK_RESCUE} != "no" && \
    (empty(DESTDIR) || ${DESTDIR} == "/") && empty(BYPASS_INSTALLCHECK_SH)
_installcheck_world: __installcheck_sh_check
__installcheck_sh_check: .PHONY
	@if [ "`${OBJTREE}${.CURDIR}/rescue/rescue/rescue sh -c 'echo OK'`" != \
	    OK ]; then \
		echo "rescue/sh check failed, installation aborted" >&2; \
		false; \
	fi
.endif

#
# Required install tools to be saved in a scratch dir for safety.
#
.if ${MK_ZONEINFO} != "no"
_zoneinfo=	zic tzsetup
.endif

ITOOLS=	[ awk cap_mkdb cat chflags chmod chown cmp cp \
	date echo egrep find grep id install ${_install-info} \
	ln make mkdir mtree mv pwd_mkdb \
	rm sed services_mkdb sh strip sysctl test true uname wc ${_zoneinfo} \
	${LOCAL_ITOOLS}

# Needed for share/man
.if ${MK_MAN_UTILS} != "no"
ITOOLS+=makewhatis
.endif

#
# distributeworld
#
# Distributes everything compiled by a `buildworld'.
#
# installworld
#
# Installs everything compiled by a 'buildworld'.
# # Non-base distributions produced by the base system EXTRA_DISTRIBUTIONS= doc .if defined(LIBCOMPAT) EXTRA_DISTRIBUTIONS+= lib${libcompat} .endif .if ${MK_TESTS} != "no" EXTRA_DISTRIBUTIONS+= tests .endif DEBUG_DISTRIBUTIONS= .if ${MK_DEBUG_FILES} != "no" DEBUG_DISTRIBUTIONS+= base ${EXTRA_DISTRIBUTIONS:S,doc,,:S,tests,,} .endif MTREE_MAGIC?= mtree 2.0 distributeworld installworld stageworld: _installcheck_world .PHONY mkdir -p ${INSTALLTMP} progs=$$(for prog in ${ITOOLS}; do \ if progpath=`which $$prog`; then \ echo $$progpath; \ else \ echo "Required tool $$prog not found in PATH." >&2; \ exit 1; \ fi; \ done); \ libs=$$(ldd -f "%o %p\n" -f "%o %p\n" $$progs 2>/dev/null | sort -u | \ while read line; do \ set -- $$line; \ if [ "$$2 $$3" != "not found" ]; then \ echo $$2; \ else \ echo "Required library $$1 not found." >&2; \ exit 1; \ fi; \ done); \ cp $$libs $$progs ${INSTALLTMP} cp -R $${PATH_LOCALE:-"/usr/share/locale"} ${INSTALLTMP}/locale .if defined(NO_ROOT) -mkdir -p ${METALOG:H} echo "#${MTREE_MAGIC}" > ${METALOG} .endif .if make(distributeworld) .for dist in ${EXTRA_DISTRIBUTIONS} -mkdir ${DESTDIR}/${DISTDIR}/${dist} mtree -deU -f ${.CURDIR}/etc/mtree/BSD.root.dist \ -p ${DESTDIR}/${DISTDIR}/${dist} >/dev/null mtree -deU -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr >/dev/null mtree -deU -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/include >/dev/null .if ${MK_DEBUG_FILES} != "no" mtree -deU -f ${.CURDIR}/etc/mtree/BSD.debug.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib >/dev/null .endif .if defined(LIBCOMPAT) mtree -deU -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr >/dev/null .if ${MK_DEBUG_FILES} != "no" mtree -deU -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib/debug/usr >/dev/null .endif .endif .if ${MK_TESTS} != "no" && ${dist} == "tests" -mkdir -p ${DESTDIR}/${DISTDIR}/${dist}${TESTSBASE} mtree 
-deU -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}${TESTSBASE} >/dev/null .if ${MK_DEBUG_FILES} != "no" mtree -deU -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${DESTDIR}/${DISTDIR}/${dist}/usr/lib/debug/${TESTSBASE} >/dev/null .endif .endif .if defined(NO_ROOT) ${IMAKEENV} mtree -C -f ${.CURDIR}/etc/mtree/BSD.root.dist | \ sed -e 's#^\./#./${dist}/#' >> ${METALOG} ${IMAKEENV} mtree -C -f ${.CURDIR}/etc/mtree/BSD.usr.dist | \ sed -e 's#^\./#./${dist}/usr/#' >> ${METALOG} ${IMAKEENV} mtree -C -f ${.CURDIR}/etc/mtree/BSD.include.dist | \ sed -e 's#^\./#./${dist}/usr/include/#' >> ${METALOG} .if defined(LIBCOMPAT) ${IMAKEENV} mtree -C -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist | \ sed -e 's#^\./#./${dist}/usr/#' >> ${METALOG} .endif .endif .endfor -mkdir ${DESTDIR}/${DISTDIR}/base ${_+_}cd ${.CURDIR}/etc; ${CROSSENV} PATH=${TMPPATH} ${MAKE} \ METALOG=${METALOG} ${IMAKE_INSTALL} ${IMAKE_MTREE} \ DISTBASE=/base DESTDIR=${DESTDIR}/${DISTDIR}/base \ LOCAL_MTREE=${LOCAL_MTREE:Q} distrib-dirs .endif ${_+_}cd ${.CURDIR}; ${IMAKE} re${.TARGET:S/world$//}; \ ${IMAKEENV} rm -rf ${INSTALLTMP} .if make(distributeworld) .for dist in ${EXTRA_DISTRIBUTIONS} find ${DESTDIR}/${DISTDIR}/${dist} -mindepth 1 -type d -empty -delete .endfor .if defined(NO_ROOT) .for dist in base ${EXTRA_DISTRIBUTIONS} @# For each file that exists in this dist, print the corresponding @# line from the METALOG. This relies on the fact that @# a line containing only the filename will sort immediately before @# the relevant mtree line. cd ${DESTDIR}/${DISTDIR}; \ find ./${dist} | sort -u ${METALOG} - | \ awk 'BEGIN { print "#${MTREE_MAGIC}" } !/ type=/ { file = $$1 } / type=/ { if ($$1 == file) { sub(/^\.\/${dist}\//, "./"); print } }' > \ ${DESTDIR}/${DISTDIR}/${dist}.meta .endfor .for dist in ${DEBUG_DISTRIBUTIONS} @# For each file that exists in this dist, print the corresponding @# line from the METALOG. 
This relies on the fact that @# a line containing only the filename will sort immediately before @# the relevant mtree line. cd ${DESTDIR}/${DISTDIR}; \ find ./${dist}/usr/lib/debug | sort -u ${METALOG} - | \ awk 'BEGIN { print "#${MTREE_MAGIC}" } !/ type=/ { file = $$1 } / type=/ { if ($$1 == file) { sub(/^\.\/${dist}\//, "./"); print } }' > \ ${DESTDIR}/${DISTDIR}/${dist}.debug.meta .endfor .endif .endif packageworld: .PHONY .for dist in base ${EXTRA_DISTRIBUTIONS} .if defined(NO_ROOT) ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - --exclude usr/lib/debug \ @${DESTDIR}/${DISTDIR}/${dist}.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}.txz .else ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - --exclude usr/lib/debug . | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}.txz .endif .endfor .for dist in ${DEBUG_DISTRIBUTIONS} . if defined(NO_ROOT) ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvf - @${DESTDIR}/${DISTDIR}/${dist}.debug.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}-dbg.txz . else ${_+_}cd ${DESTDIR}/${DISTDIR}/${dist}; \ tar cvLf - usr/lib/debug | \ ${XZ_CMD} > ${PACKAGEDIR}/${dist}-dbg.txz . endif .endfor # # reinstall # # If you have a build server, you can NFS mount the source and obj directories # and do a 'make reinstall' on the *client* to install new binaries from the # most recent server build. 
#
# restage/reinstall: run the "hierarchy" target, then (only when the
# requested target is restage) "distribution", then "install" and, when
# LIBCOMPAT is defined, install${libcompat}.  Every step is a sub-make of
# Makefile.inc1 run from ${.CURDIR}.
restage reinstall: .MAKE .PHONY
	@echo "--------------------------------------------------------------"
	@echo ">>> Making hierarchy"
	@echo "--------------------------------------------------------------"
	${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 \
	    LOCAL_MTREE=${LOCAL_MTREE:Q} hierarchy
.if make(restage)
	@echo "--------------------------------------------------------------"
	@echo ">>> Making distribution"
	@echo "--------------------------------------------------------------"
	${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 \
	    LOCAL_MTREE=${LOCAL_MTREE:Q} distribution
.endif
	@echo
	@echo "--------------------------------------------------------------"
	@echo ">>> Installing everything"
	@echo "--------------------------------------------------------------"
	${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 install
.if defined(LIBCOMPAT)
	${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 install${libcompat}
.endif

# redistribute: run "distribute" and, when LIBCOMPAT is defined,
# distribute${libcompat} with DISTRIBUTION set to lib${libcompat}.
redistribute: .MAKE .PHONY
	@echo "--------------------------------------------------------------"
	@echo ">>> Distributing everything"
	@echo "--------------------------------------------------------------"
	${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 distribute
.if defined(LIBCOMPAT)
	${_+_}cd ${.CURDIR}; ${MAKE} -f Makefile.inc1 distribute${libcompat} \
	    DISTRIBUTION=lib${libcompat}
.endif

# distrib-dirs/distribution: forward the target of the same name (${.TARGET})
# to the makefile in ${.CURDIR}/etc.  For "distribution" only, also run
# installconfig from Makefile.inc1 with MK_TESTS=no.
distrib-dirs distribution: .MAKE .PHONY
	${_+_}cd ${.CURDIR}/etc; ${CROSSENV} PATH=${TMPPATH} ${MAKE} \
	    ${IMAKE_INSTALL} ${IMAKE_MTREE} METALOG=${METALOG} ${.TARGET}
.if make(distribution)
	${_+_}cd ${.CURDIR}; ${CROSSENV} PATH=${TMPPATH} \
		${MAKE} -f Makefile.inc1 ${IMAKE_INSTALL} \
		METALOG=${METALOG} MK_TESTS=no installconfig
.endif

#
# buildkernel and installkernel
#
# Which kernels to build and/or install is specified by setting
# KERNCONF. If not defined a GENERIC kernel is built/installed.
# Only the existing (depending TARGET) config files are used
# for building kernels and only the first of these is designated
# as the one being installed.
# # Note that we have to use TARGET instead of TARGET_ARCH when # we're in kernel-land. Since only TARGET_ARCH is (expected) to # be set to cross-build, we have to make sure TARGET is set # properly. .if defined(KERNFAST) NO_KERNELCLEAN= t NO_KERNELCONFIG= t NO_KERNELOBJ= t # Shortcut for KERNCONF=Blah -DKERNFAST is now KERNFAST=Blah .if !defined(KERNCONF) && ${KERNFAST} != "1" KERNCONF=${KERNFAST} .endif .endif .if ${TARGET_ARCH} == "powerpc64" KERNCONF?= GENERIC64 .else KERNCONF?= GENERIC .endif INSTKERNNAME?= kernel KERNSRCDIR?= ${.CURDIR}/sys KRNLCONFDIR= ${KERNSRCDIR}/${TARGET}/conf KRNLOBJDIR= ${OBJTREE}${KERNSRCDIR} KERNCONFDIR?= ${KRNLCONFDIR} BUILDKERNELS= INSTALLKERNEL= .if defined(NO_INSTALLKERNEL) # All of the BUILDKERNELS loops start at index 1. BUILDKERNELS+= dummy .endif .for _kernel in ${KERNCONF} .if exists(${KERNCONFDIR}/${_kernel}) BUILDKERNELS+= ${_kernel} .if empty(INSTALLKERNEL) && !defined(NO_INSTALLKERNEL) INSTALLKERNEL= ${_kernel} .endif .else .if make(buildkernel) .error Missing KERNCONF ${KERNCONFDIR}/${_kernel} .endif .endif .endfor ${WMAKE_TGTS:N_worldtmp:Nbuild${libcompat}} ${.ALLTARGETS:M_*:N_worldtmp}: .MAKE .PHONY # # buildkernel # # Builds all kernels defined by BUILDKERNELS. 
# buildkernel: .MAKE .PHONY .if empty(BUILDKERNELS:Ndummy) @echo "ERROR: Missing kernel configuration file(s) (${KERNCONF})."; \ false .endif @echo .for _kernel in ${BUILDKERNELS:Ndummy} @echo "--------------------------------------------------------------" @echo ">>> Kernel build for ${_kernel} started on `LC_ALL=C date`" @echo "--------------------------------------------------------------" @echo "===> ${_kernel}" mkdir -p ${KRNLOBJDIR} .if !defined(NO_KERNELCONFIG) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 1: configuring the kernel" @echo "--------------------------------------------------------------" cd ${KRNLCONFDIR}; \ PATH=${TMPPATH} \ config ${CONFIGARGS} -d ${KRNLOBJDIR}/${_kernel} \ -I '${KERNCONFDIR}' '${KERNCONFDIR}/${_kernel}' .endif .if !defined(NO_CLEAN) && !defined(NO_KERNELCLEAN) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.1: cleaning up the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} ${CLEANDIR} .endif .if !defined(NO_KERNELOBJ) @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.2: rebuilding the object tree" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} obj .endif @echo @echo "--------------------------------------------------------------" @echo ">>> stage 2.3: build tools" @echo "--------------------------------------------------------------" ${_+_}cd ${.CURDIR}; ${KTMAKE} kernel-tools @echo @echo "--------------------------------------------------------------" @echo ">>> stage 3.1: building everything" @echo "--------------------------------------------------------------" ${_+_}cd ${KRNLOBJDIR}/${_kernel}; ${KMAKE} all -DNO_MODULES_OBJ @echo "--------------------------------------------------------------" @echo ">>> Kernel build for ${_kernel} 
completed on `LC_ALL=C date`" @echo "--------------------------------------------------------------" .endfor NO_INSTALLEXTRAKERNELS?= yes # # installkernel, etc. # # Install the kernel defined by INSTALLKERNEL # installkernel installkernel.debug \ reinstallkernel reinstallkernel.debug: _installcheck_kernel .PHONY .if !defined(NO_INSTALLKERNEL) .if empty(INSTALLKERNEL) @echo "ERROR: No kernel \"${KERNCONF}\" to install."; \ false .endif @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${INSTALLKERNEL}" @echo "--------------------------------------------------------------" cd ${KRNLOBJDIR}/${INSTALLKERNEL}; \ ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} ${IMAKE_INSTALL} KERNEL=${INSTKERNNAME} ${.TARGET:S/kernel//} .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} @echo "--------------------------------------------------------------" @echo ">>> Installing kernel ${_kernel}" @echo "--------------------------------------------------------------" cd ${KRNLOBJDIR}/${_kernel}; \ ${CROSSENV} PATH=${TMPPATH} \ ${MAKE} ${IMAKE_INSTALL} KERNEL=${INSTKERNNAME}.${_kernel} ${.TARGET:S/kernel//} .endfor .endif distributekernel distributekernel.debug: .PHONY .if !defined(NO_INSTALLKERNEL) .if empty(INSTALLKERNEL) @echo "ERROR: No kernel \"${KERNCONF}\" to install."; \ false .endif mkdir -p ${DESTDIR}/${DISTDIR} .if defined(NO_ROOT) @echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.premeta .endif cd ${KRNLOBJDIR}/${INSTALLKERNEL}; \ ${IMAKEENV} ${IMAKE_INSTALL:S/METALOG/kernel.premeta/} \ ${IMAKE_MTREE} PATH=${TMPPATH} ${MAKE} KERNEL=${INSTKERNNAME} \ DESTDIR=${INSTALL_DDIR}/kernel \ ${.TARGET:S/distributekernel/install/} .if defined(NO_ROOT) @sed -e 's|^./kernel|.|' ${DESTDIR}/${DISTDIR}/kernel.premeta > \ ${DESTDIR}/${DISTDIR}/kernel.meta .endif .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} .if 
defined(NO_ROOT) @echo "#${MTREE_MAGIC}" > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta .endif cd ${KRNLOBJDIR}/${_kernel}; \ ${IMAKEENV} ${IMAKE_INSTALL:S/METALOG/kernel.${_kernel}.premeta/} \ ${IMAKE_MTREE} PATH=${TMPPATH} ${MAKE} \ KERNEL=${INSTKERNNAME}.${_kernel} \ DESTDIR=${INSTALL_DDIR}/kernel.${_kernel} \ ${.TARGET:S/distributekernel/install/} .if defined(NO_ROOT) @sed -e "s|^./kernel.${_kernel}|.|" \ ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.premeta > \ ${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta .endif .endfor .endif packagekernel: .PHONY .if defined(NO_ROOT) .if !defined(NO_INSTALLKERNEL) cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --exclude '*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.txz .endif .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --include '*/*/*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.meta | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel-dbg.txz .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --exclude '*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.${_kernel}.txz .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --include '*/*/*.debug' \ @${DESTDIR}/${DISTDIR}/kernel.${_kernel}.meta | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}-dbg.txz .endif .endfor .endif .else .if !defined(NO_INSTALLKERNEL) cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --exclude '*.debug' . | \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.txz .endif .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel; \ tar cvf - --include '*/*/*.debug' $$(eval find .) | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel-dbg.txz .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" .for _kernel in ${BUILDKERNELS:[2..-1]} cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --exclude '*.debug' . 
| \ ${XZ_CMD} > ${PACKAGEDIR}/kernel.${_kernel}.txz .if ${MK_DEBUG_FILES} != "no" cd ${DESTDIR}/${DISTDIR}/kernel.${_kernel}; \ tar cvf - --include '*/*/*.debug' $$(eval find .) | \ ${XZ_CMD} > ${DESTDIR}/${DISTDIR}/kernel.${_kernel}-dbg.txz .endif .endfor .endif .endif stagekernel: .PHONY ${_+_}${MAKE} -C ${.CURDIR} ${.MAKEFLAGS} distributekernel PORTSDIR?= /usr/ports WSTAGEDIR?= ${MAKEOBJDIRPREFIX}${.CURDIR}/${TARGET}.${TARGET_ARCH}/worldstage KSTAGEDIR?= ${MAKEOBJDIRPREFIX}${.CURDIR}/${TARGET}.${TARGET_ARCH}/kernelstage REPODIR?= ${MAKEOBJDIRPREFIX}${.CURDIR}/repo PKGSIGNKEY?= # empty .ORDER: stage-packages create-packages .ORDER: create-packages create-world-packages .ORDER: create-packages create-kernel-packages .ORDER: create-packages sign-packages _pkgbootstrap: .PHONY .if !exists(${LOCALBASE}/sbin/pkg) @env ASSUME_ALWAYS_YES=YES pkg bootstrap .endif packages: .PHONY ${_+_}${MAKE} -C ${.CURDIR} PKG_VERSION=${PKG_VERSION} real-packages package-pkg: .PHONY rm -rf /tmp/ports.${TARGET} || : env ${WMAKEENV:Q} SRCDIR=${.CURDIR} PORTSDIR=${PORTSDIR} REVISION=${_REVISION} \ PKG_CMD=${PKG_CMD} PKG_VERSION=${PKG_VERSION} REPODIR=${REPODIR} \ WSTAGEDIR=${WSTAGEDIR} \ sh ${.CURDIR}/release/scripts/make-pkg-package.sh real-packages: stage-packages create-packages sign-packages .PHONY stage-packages-world: .PHONY @mkdir -p ${WSTAGEDIR} ${_+_}@cd ${.CURDIR}; \ ${MAKE} DESTDIR=${WSTAGEDIR} -DNO_ROOT stageworld stage-packages-kernel: .PHONY @mkdir -p ${KSTAGEDIR} ${_+_}@cd ${.CURDIR}; \ ${MAKE} DESTDIR=${KSTAGEDIR} -DNO_ROOT stagekernel stage-packages: .PHONY stage-packages-world stage-packages-kernel _repodir: .PHONY @mkdir -p ${REPODIR} create-packages-world: _pkgbootstrap _repodir .PHONY ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 \ DESTDIR=${WSTAGEDIR} \ PKG_VERSION=${PKG_VERSION} create-world-packages create-packages-kernel: _pkgbootstrap _repodir .PHONY ${_+_}@cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 \ DESTDIR=${KSTAGEDIR} \ PKG_VERSION=${PKG_VERSION} 
DISTDIR=kernel \
		create-kernel-packages

# create-packages: build both the world and the kernel package sets.
create-packages: .PHONY create-packages-world create-packages-kernel

# create-world-packages: regenerate the per-package plists in ${WSTAGEDIR}
# from its METALOG, record one "_PKGS+= <name>" line per plist in
# ${WSTAGEDIR}/packages.mk, then re-invoke make on
# create-world-packages-jobs, which includes that file.
create-world-packages:	_pkgbootstrap .PHONY
	@rm -f ${WSTAGEDIR}/*.plist 2>/dev/null || :
	@cd ${WSTAGEDIR} ; \
		awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk \
		${WSTAGEDIR}/METALOG
	@for plist in ${WSTAGEDIR}/*.plist; do \
	  plist=$${plist##*/} ; \
	  pkgname=$${plist%.plist} ; \
	  echo "_PKGS+= $${pkgname}" ; \
	done > ${WSTAGEDIR}/packages.mk
	${_+_}@cd ${.CURDIR}; \
		${MAKE} -f Makefile.inc1 create-world-packages-jobs \
		.MAKE.JOB.PREFIX=

# packages.mk is only read by the re-invocation started above.
.if make(create-world-packages-jobs)
.include "${WSTAGEDIR}/packages.mk"
.endif

# One create-world-package-<name> target per entry of ${_PKGS}: generate the
# package's UCL manifest, print "===> Creating <name>-<version>" from it, and
# run ${PKG_CMD} create into the ${REPODIR}/<ABI>/${PKG_VERSION} directory.
create-world-packages-jobs: .PHONY
.for pkgname in ${_PKGS}
create-world-packages-jobs: create-world-package-${pkgname}
create-world-package-${pkgname}: .PHONY
	@sh ${SRCDIR}/release/packages/generate-ucl.sh -o ${pkgname} \
		-s ${SRCDIR} -u ${WSTAGEDIR}/${pkgname}.ucl
	@awk -F\" ' \
		/^name/ { printf("===> Creating %s-", $$2); next } \
		/^version/ { print $$2; next } \
		' ${WSTAGEDIR}/${pkgname}.ucl
# Only the "runtime" manifest has %VCS_REVISION% substituted.  The string
# comparison uses "=", the operator POSIX specifies for test(1); "==" is an
# extension that not every /bin/sh accepts.
	@if [ "${pkgname}" = "runtime" ]; then \
		sed -i '' -e "s/%VCS_REVISION%/${VCS_REVISION}/" ${WSTAGEDIR}/${pkgname}.ucl ; \
	fi
	${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh -o ALLOW_BASE_SHLIBS=yes \
		create -M ${WSTAGEDIR}/${pkgname}.ucl \
		-p ${WSTAGEDIR}/${pkgname}.plist \
		-r ${WSTAGEDIR} \
		-o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh config ABI)/${PKG_VERSION}
.endfor

create-kernel-packages:	.PHONY
_default_flavor=	-default
.if exists(${KSTAGEDIR}/kernel.meta)
.  if ${MK_DEBUG_FILES} != "no"
_debug=-debug
.  endif
.
for flavor in "" ${_debug} create-kernel-packages: create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},} create-kernel-packages-flavor${flavor:C,^""$,${_default_flavor},}: _pkgbootstrap .PHONY @cd ${KSTAGEDIR}/${DISTDIR} ; \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk \ -v kernel=yes -v _kernconf=${INSTALLKERNEL} \ ${KSTAGEDIR}/kernel.meta ; \ cap_arg=`cd ${SRCDIR}/etc ; ${MAKE} -VCAP_MKDB_ENDIAN` ; \ pwd_arg=`cd ${SRCDIR}/etc ; ${MAKE} -VPWD_MKDB_ENDIAN` ; \ sed -e "s/%VERSION%/${PKG_VERSION}/" \ -e "s/%PKGNAME%/kernel-${INSTALLKERNEL:tl}${flavor}/" \ -e "s/%COMMENT%/FreeBSD ${INSTALLKERNEL} kernel ${flavor}/" \ -e "s/%DESC%/FreeBSD ${INSTALLKERNEL} kernel ${flavor}/" \ -e "s/%CAP_MKDB_ENDIAN%/$${cap_arg}/g" \ -e "s/%PWD_MKDB_ENDIAN%/$${pwd_arg}/g" \ -e "s/ %VCS_REVISION%/${VCS_REVISION}/" \ ${SRCDIR}/release/packages/kernel.ucl \ > ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \ awk -F\" ' \ /name/ { printf("===> Creating %s-", $$2); next } \ /version/ {print $$2; next } ' \ ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh -o ALLOW_BASE_SHLIBS=yes \ create -M ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.ucl \ -p ${KSTAGEDIR}/${DISTDIR}/kernel.${INSTALLKERNEL}${flavor}.plist \ -r ${KSTAGEDIR}/${DISTDIR} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh config ABI)/${PKG_VERSION} . endfor .endif .if ${BUILDKERNELS:[#]} > 1 && ${NO_INSTALLEXTRAKERNELS} != "yes" . for _kernel in ${BUILDKERNELS:[2..-1]} . if exists(${KSTAGEDIR}/kernel.${_kernel}.meta) . if ${MK_DEBUG_FILES} != "no" _debug=-debug . endif . 
for flavor in "" ${_debug} create-kernel-packages: create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kernel} create-kernel-packages-extra-flavor${flavor:C,^""$,${_default_flavor},}-${_kernel}: _pkgbootstrap .PHONY @cd ${KSTAGEDIR}/kernel.${_kernel} ; \ awk -f ${SRCDIR}/release/scripts/mtree-to-plist.awk \ -v kernel=yes -v _kernconf=${_kernel} \ ${KSTAGEDIR}/kernel.${_kernel}.meta ; \ cap_arg=`cd ${SRCDIR}/etc ; ${MAKE} -VCAP_MKDB_ENDIAN` ; \ pwd_arg=`cd ${SRCDIR}/etc ; ${MAKE} -VPWD_MKDB_ENDIAN` ; \ sed -e "s/%VERSION%/${PKG_VERSION}/" \ -e "s/%PKGNAME%/kernel-${_kernel:tl}${flavor}/" \ -e "s/%COMMENT%/FreeBSD ${_kernel} kernel ${flavor}/" \ -e "s/%DESC%/FreeBSD ${_kernel} kernel ${flavor}/" \ -e "s/%CAP_MKDB_ENDIAN%/$${cap_arg}/g" \ -e "s/%PWD_MKDB_ENDIAN%/$${pwd_arg}/g" \ -e "s/ %VCS_REVISION%/${VCS_REVISION}/" \ ${SRCDIR}/release/packages/kernel.ucl \ > ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \ awk -F\" ' \ /name/ { printf("===> Creating %s-", $$2); next } \ /version/ {print $$2; next } ' \ ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh -o ALLOW_BASE_SHLIBS=yes \ create -M ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.ucl \ -p ${KSTAGEDIR}/kernel.${_kernel}/kernel.${_kernel}${flavor}.plist \ -r ${KSTAGEDIR}/kernel.${_kernel} \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh config ABI)/${PKG_VERSION} . endfor . endif . 
endfor .endif sign-packages: _pkgbootstrap .PHONY @[ -L "${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh config ABI)/latest" ] && \ unlink ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh config ABI)/latest ; \ ${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh repo \ -o ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh config ABI)/${PKG_VERSION} \ ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh config ABI)/${PKG_VERSION} \ ${PKGSIGNKEY} ; \ cd ${REPODIR}/$$(${PKG_CMD} -o ABI_FILE=${WSTAGEDIR}/bin/sh config ABI); \ ln -s ${PKG_VERSION} latest # # # checkworld # # Run test suite on installed world. # checkworld: .PHONY @if [ ! -x "${LOCALBASE}/bin/kyua" ]; then \ echo "You need kyua (devel/kyua) to run the test suite." | /usr/bin/fmt; \ exit 1; \ fi ${_+_}PATH="$$PATH:${LOCALBASE}/bin" kyua test -k ${TESTSBASE}/Kyuafile # # # doxygen # # Build the API documentation with doxygen # doxygen: .PHONY @if [ ! -x "${LOCALBASE}/bin/doxygen" ]; then \ echo "You need doxygen (devel/doxygen) to generate the API documentation of the kernel." | /usr/bin/fmt; \ exit 1; \ fi ${_+_}cd ${.CURDIR}/tools/kerneldoc/subsys; ${MAKE} obj all # # update # # Update the source tree(s), by running svn/svnup to update to the # latest copy. # update: .PHONY .if defined(SVN_UPDATE) @echo "--------------------------------------------------------------" @echo ">>> Updating ${.CURDIR} using Subversion" @echo "--------------------------------------------------------------" @(cd ${.CURDIR}; ${SVN} update ${SVNFLAGS}) .endif # # ------------------------------------------------------------------------ # # From here onwards are utility targets used by the 'make world' and # related targets. If your 'world' breaks, you may like to try to fix # the problem and manually run the following targets to attempt to # complete the build. Beware, this is *not* guaranteed to work, you # need to have a pretty good grip on the current state of the system # to attempt to manually finish it. 
# If in doubt, 'make world' again.
#

#
# legacy: Build compatibility shims for the next three targets. This is a
# minimal set of tools and shims necessary to compensate for older systems
# which don't have the APIs required by the targets built in bootstrap-tools,
# build-tools or cross-tools.
#

# ELF Tool Chain libraries are needed for ELF tools and dtrace tools.
# r296685 fix cross-endian objcopy
# r310724 fixed PR 215350, a crash in libdwarf with objects built by GCC 6.2.
.if ${BOOTSTRAPPING} < 1200020
_elftoolchain_libs= lib/libelf lib/libdwarf
.endif

legacy: .PHONY
# Temporary special case for automatically detecting the clang compiler issue
# Note: 9.x didn't have FreeBSD_version bumps often enough, so you may need to
# set BOOTSTRAPPING to 0 if your stable/9 tree post-dates r286035 but is before
# the version bump in r296219 (from July 29, 2015 -> Feb 29, 2016).
#
# The lower bound of each range must be smaller than its upper bound
# (1002501 for stable/10, 903509 for stable/9); with one digit too many in
# the lower bound the range is empty and the error can never be reported.
.if ${BOOTSTRAPPING} != 0 && \
	${WANT_COMPILER_TYPE} == "clang" && ${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} < 30601
.if ${BOOTSTRAPPING} >= 1000000 && ${BOOTSTRAPPING} < 1002501
	@echo "ERROR: Source upgrades from stable/10 prior to r286033 are not supported."; false
.elif ${BOOTSTRAPPING} >= 900000 && ${BOOTSTRAPPING} < 903509
	@echo "ERROR: Source upgrades from stable/9 prior to r286035 are not supported."; false
.endif
.endif
# BOOTSTRAPPING == 0 is exempt from the minimum-version check.
.if ${BOOTSTRAPPING} < ${MINIMUM_SUPPORTED_OSREL} && ${BOOTSTRAPPING} != 0
	@echo "ERROR: Source upgrades from versions prior to ${MINIMUM_SUPPORTED_REL} are not supported."; \
	false
.endif
# For tools/build and any ELF Tool Chain libraries selected above: make obj
# (unless NO_OBJ is set), install the headers into
# ${MAKEOBJDIRPREFIX}/legacy, then build and install there with
# MK_INCLUDES=no.
.for _tool in tools/build ${_elftoolchain_libs}
	${_+_}@${ECHODIR} "===> ${_tool} (obj,includes,all,install)"; \
	    cd ${.CURDIR}/${_tool}; \
	    if [ -z "${NO_OBJ}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \
	    ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${MAKEOBJDIRPREFIX}/legacy includes; \
	    ${MAKE} DIRPRFX=${_tool}/ MK_INCLUDES=no all; \
	    ${MAKE} DIRPRFX=${_tool}/ MK_INCLUDES=no \
	        DESTDIR=${MAKEOBJDIRPREFIX}/legacy install
.endfor

#
# bootstrap-tools: Build tools needed for compatibility.
# These are binaries that
# are built to build other binaries in the system.  However, the focus of these
# binaries is usually quite narrow.  Bootstrap tools use the host's compiler and
# libraries, augmented by -legacy.
#
# Each _<tool> variable below is assigned only when its condition holds.
# ${_bt} is the prefix of the per-tool target names; the
# "${_bt}-a: ${_bt}-b" lines declare that a depends on b.
_bt=		_bootstrap-tools

.if ${MK_GAMES} != "no"
_strfile=	usr.bin/fortune/strfile
.endif

.if ${MK_GCC} != "no" && ${MK_CXX} != "no"
_gperf=		gnu/usr.bin/gperf
.endif

.if ${MK_VT} != "no"
_vtfontcvt=	usr.bin/vtfontcvt
.endif

.if ${BOOTSTRAPPING} < 1000033
_libopenbsd=	lib/libopenbsd
_m4=		usr.bin/m4
_lex=		usr.bin/lex

${_bt}-usr.bin/m4: ${_bt}-lib/libopenbsd
${_bt}-usr.bin/lex: ${_bt}-usr.bin/m4
.endif

# r245440 mtree -N support added
# r313404 requires sha384.h for libnetbsd, added to libmd in r292782
.if ${BOOTSTRAPPING} < 1100093
_nmtree=	lib/libmd \
		lib/libnetbsd \
		usr.sbin/nmtree

${_bt}-lib/libnetbsd: ${_bt}-lib/libmd
${_bt}-usr.sbin/nmtree: ${_bt}-lib/libnetbsd
.endif

# r246097: log addition login.conf.db, passwd, pwd.db, and spwd.db with cat -l
.if ${BOOTSTRAPPING} < 1000027
_cat=		bin/cat
.endif

# r277259 crunchide: Correct 64-bit section header offset
# r281674 crunchide: always include both 32- and 64-bit ELF support
.if ${BOOTSTRAPPING} < 1100078
_crunchide=	usr.sbin/crunch/crunchide
.endif

# r285986 crunchgen: use STRIPBIN rather than STRIP
# 1100113: Support MK_AUTO_OBJ
# 1200006: META_MODE fixes
.if ${BOOTSTRAPPING} < 1100078 || \
    (${MK_AUTO_OBJ} == "yes" && ${BOOTSTRAPPING} < 1100114) || \
    (${MK_META_MODE} == "yes" && ${BOOTSTRAPPING} < 1200006)
_crunchgen=	usr.sbin/crunch/crunchgen
.endif

# r296926 -P keymap search path, MFC to stable/10 in r298297
.if ${BOOTSTRAPPING} < 1003501 || \
	(${BOOTSTRAPPING} >= 1100000 && ${BOOTSTRAPPING} < 1100103)
_kbdcontrol=	usr.sbin/kbdcontrol
.endif

# yacc is selected unconditionally.
_yacc=		lib/liby \
		usr.bin/yacc

${_bt}-usr.bin/yacc: ${_bt}-lib/liby

.if ${MK_BSNMP} != "no"
_gensnmptree=	usr.sbin/bsnmpd/gensnmptree
.endif

# We need to build tblgen when we're building clang or lld, either as
# bootstrap tools, or as the part of the normal
build. .if ${MK_CLANG_BOOTSTRAP} != "no" || ${MK_CLANG} != "no" || \ ${MK_LLD_BOOTSTRAP} != "no" || ${MK_LLD} != "no" _clang_tblgen= \ lib/clang/libllvmminimal \ usr.bin/clang/llvm-tblgen \ usr.bin/clang/clang-tblgen ${_bt}-usr.bin/clang/clang-tblgen: ${_bt}-lib/clang/libllvmminimal ${_bt}-usr.bin/clang/llvm-tblgen: ${_bt}-lib/clang/libllvmminimal .endif # Default to building the GPL DTC, but build the BSDL one if users explicitly # request it. _dtc= usr.bin/dtc .if ${MK_GPL_DTC} != "no" _dtc= gnu/usr.bin/dtc .endif .if ${MK_KERBEROS} != "no" _kerberos5_bootstrap_tools= \ kerberos5/tools/make-roken \ kerberos5/lib/libroken \ kerberos5/lib/libvers \ kerberos5/tools/asn1_compile \ kerberos5/tools/slc \ usr.bin/compile_et .ORDER: ${_kerberos5_bootstrap_tools:C/^/${_bt}-/g} .endif # r283777 makewhatis(1) replaced with mandoc version which builds a database. _libopenbsd?= lib/libopenbsd _makewhatis= usr.bin/mandoc ${_bt}-usr.bin/mandoc: ${_bt}-lib/libopenbsd bootstrap-tools: .PHONY # Please document (add comment) why something is in 'bootstrap-tools'. # Try to bound the building of the bootstrap-tool to just the # FreeBSD versions that need the tool built at this stage of the build. 
# For every selected tool directory, define a ${_bt}-<dir> target that runs
# obj (unless NO_OBJ is set), all, and install with
# DESTDIR=${MAKEOBJDIRPREFIX}/legacy, and make bootstrap-tools depend on it.
.for _tool in \
    ${_clang_tblgen} \
    ${_kerberos5_bootstrap_tools} \
    ${_strfile} \
    ${_gperf} \
    ${_dtc} \
    ${_cat} \
    ${_kbdcontrol} \
    usr.bin/lorder \
    ${_libopenbsd} \
    ${_makewhatis} \
    usr.bin/rpcgen \
    ${_yacc} \
    ${_m4} \
    ${_lex} \
    usr.bin/xinstall \
    ${_gensnmptree} \
    usr.sbin/config \
    ${_crunchide} \
    ${_crunchgen} \
    ${_nmtree} \
    ${_vtfontcvt} \
    usr.bin/localedef
${_bt}-${_tool}: .PHONY .MAKE
	${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \
		cd ${.CURDIR}/${_tool}; \
		if [ -z "${NO_OBJ}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \
		${MAKE} DIRPRFX=${_tool}/ all; \
		${MAKE} DIRPRFX=${_tool}/ DESTDIR=${MAKEOBJDIRPREFIX}/legacy install

bootstrap-tools: ${_bt}-${_tool}
.endfor

#
# build-tools: Build special purpose build tools
#
.if !defined(NO_SHARE)
_share=	share/syscons/scrnmaps
.endif

.if ${MK_GCC} != "no"
_gcc_tools= gnu/usr.bin/cc/cc_tools
.endif

.if ${MK_RESCUE} != "no"
# rescue includes programs that have build-tools targets
_rescue=rescue/rescue
.endif

.if ${MK_TCSH} != "no"
_tcsh=bin/csh
.endif

# These directories are built through their own "build-tools" target
# (after obj, unless NO_OBJ is set).
.for _tool in \
    ${_tcsh} \
    bin/sh \
    ${LOCAL_TOOL_DIRS} \
    lib/ncurses/ncurses \
    lib/ncurses/ncursesw \
    ${_rescue} \
    ${_share} \
    usr.bin/awk \
    lib/libmagic \
    usr.bin/mkesdb_static \
    usr.bin/mkcsmapper_static \
    usr.bin/vi/catalog
build-tools_${_tool}: .PHONY
	${_+_}@${ECHODIR} "===> ${_tool} (obj,build-tools)"; \
		cd ${.CURDIR}/${_tool}; \
		if [ -z "${NO_OBJ}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \
		${MAKE} DIRPRFX=${_tool}/ build-tools
build-tools: build-tools_${_tool}
.endfor
# ${_gcc_tools} is built with the regular "all" target instead of
# "build-tools".
.for _tool in \
    ${_gcc_tools}
build-tools_${_tool}: .PHONY
	${_+_}@${ECHODIR} "===> ${_tool} (obj,all)"; \
		cd ${.CURDIR}/${_tool}; \
		if [ -z "${NO_OBJ}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \
		${MAKE} DIRPRFX=${_tool}/ all
build-tools: build-tools_${_tool}
.endfor

#
# kernel-tools: Build kernel-building tools
#
# The recipe only creates ${MAKEOBJDIRPREFIX}/usr and populates it with the
# directory layout described by etc/mtree/BSD.usr.dist.
kernel-tools: .PHONY
	mkdir -p ${MAKEOBJDIRPREFIX}/usr
	mtree -deU -f ${.CURDIR}/etc/mtree/BSD.usr.dist \
	    -p ${MAKEOBJDIRPREFIX}/usr >/dev/null

#
# cross-tools: All the tools needed to
build the rest of the system after # we get done with the earlier stages. It is the last set of tools needed # to begin building the target binaries. # .if ${TARGET_ARCH} != ${MACHINE_ARCH} .if ${TARGET_ARCH} == "amd64" || ${TARGET_ARCH} == "i386" _btxld= usr.sbin/btxld .endif .endif # Rebuild ctfconvert and ctfmerge to avoid difficult-to-diagnose failures # resulting from missing bug fixes or ELF Toolchain updates. .if ${MK_CDDL} != "no" _dtrace_tools= cddl/lib/libctf cddl/usr.bin/ctfconvert \ cddl/usr.bin/ctfmerge .endif # If we're given an XAS, don't build binutils. .if ${XAS:M/*} == "" .if ${MK_BINUTILS_BOOTSTRAP} != "no" _binutils= gnu/usr.bin/binutils .endif .if ${MK_ELFTOOLCHAIN_BOOTSTRAP} != "no" _elftctools= lib/libelftc \ lib/libpe \ usr.bin/elfcopy \ usr.bin/nm \ usr.bin/size \ usr.bin/strings # These are not required by the build, but can be useful for developers who # cross-build on a FreeBSD 10 host: _elftctools+= usr.bin/addr2line .endif .elif ${TARGET_ARCH} != ${MACHINE_ARCH} && ${MK_ELFTOOLCHAIN_BOOTSTRAP} != "no" # If cross-building with an external binutils we still need to build strip for # the target (for at least crunchide). 
_elftctools= lib/libelftc \ lib/libpe \ usr.bin/elfcopy .endif .if ${MK_CLANG_BOOTSTRAP} != "no" _clang= usr.bin/clang .endif .if ${MK_LLD_BOOTSTRAP} != "no" _lld= usr.bin/clang/lld .endif .if ${MK_CLANG_BOOTSTRAP} != "no" || ${MK_LLD_BOOTSTRAP} != "no" _clang_libs= lib/clang .endif .if ${MK_GCC_BOOTSTRAP} != "no" _gcc= gnu/usr.bin/cc .endif .if ${MK_USB} != "no" _usb_tools= sys/boot/usb/tools .endif cross-tools: .MAKE .PHONY .for _tool in \ ${LOCAL_XTOOL_DIRS} \ ${_clang_libs} \ ${_clang} \ ${_lld} \ ${_binutils} \ ${_elftctools} \ ${_dtrace_tools} \ ${_gcc} \ ${_btxld} \ ${_usb_tools} ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJ}" ]; then ${MAKE} DIRPRFX=${_tool}/ obj; fi; \ ${MAKE} DIRPRFX=${_tool}/ all; \ ${MAKE} DIRPRFX=${_tool}/ DESTDIR=${MAKEOBJDIRPREFIX} install .endfor NXBDESTDIR= ${OBJTREE}/nxb-bin NXBENV= MAKEOBJDIRPREFIX=${OBJTREE}/nxb \ TOOLS_PREFIX= \ INSTALL="sh ${.CURDIR}/tools/install.sh" \ PATH=${PATH}:${OBJTREE}/gperf_for_gcc/usr/bin NXBMAKE= ${NXBENV} ${MAKE} \ LLVM_TBLGEN=${NXBDESTDIR}/usr/bin/llvm-tblgen \ CLANG_TBLGEN=${NXBDESTDIR}/usr/bin/clang-tblgen \ MACHINE=${TARGET} MACHINE_ARCH=${TARGET_ARCH} \ MK_GDB=no MK_TESTS=no \ SSP_CFLAGS= \ MK_HTML=no NO_LINT=yes MK_MAN=no MK_MAN_UTILS=yes \ -DNO_PIC MK_PROFILE=no -DNO_SHARED \ -DNO_CPU_CFLAGS MK_WARNS=no MK_CTF=no \ MK_CLANG_EXTRAS=no MK_CLANG_FULL=no \ MK_LLDB=no MK_DEBUG_FILES=no # native-xtools is the current target for qemu-user cross builds of ports # via poudriere and the imgact_binmisc kernel module. # For non-clang enabled targets that are still using the in tree gcc # we must build a gperf binary for one instance of its Makefiles. On # clang-enabled systems, the gperf binary is obsolete. 
native-xtools: .PHONY .if ${MK_GCC_BOOTSTRAP} != "no" mkdir -p ${OBJTREE}/gperf_for_gcc/usr/bin ${_+_}@${ECHODIR} "===> ${_gperf} (obj,all,install)"; \ cd ${.CURDIR}/${_gperf}; \ if [ -z "${NO_OBJ}" ]; then ${NXBMAKE} DIRPRFX=${_gperf}/ obj; fi; \ ${NXBMAKE} DIRPRFX=${_gperf}/ all; \ ${NXBMAKE} DIRPRFX=${_gperf}/ DESTDIR=${OBJTREE}/gperf_for_gcc install .endif mkdir -p ${NXBDESTDIR}/bin ${NXBDESTDIR}/sbin ${NXBDESTDIR}/usr mtree -deU -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${NXBDESTDIR}/usr >/dev/null mtree -deU -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${NXBDESTDIR}/usr/include >/dev/null .if ${MK_DEBUG_FILES} != "no" mtree -deU -f ${.CURDIR}/etc/mtree/BSD.debug.dist \ -p ${NXBDESTDIR}/usr/lib >/dev/null .endif .for _tool in \ bin/cat \ bin/chmod \ bin/cp \ ${_tcsh} \ bin/echo \ bin/expr \ bin/hostname \ bin/ln \ bin/ls \ bin/mkdir \ bin/mv \ bin/ps \ bin/realpath \ bin/rm \ bin/rmdir \ bin/sh \ bin/sleep \ ${_clang_tblgen} \ usr.bin/ar \ ${_binutils} \ ${_elftctools} \ ${_gcc} \ ${_gcc_tools} \ ${_clang_libs} \ ${_clang} \ ${_lld} \ sbin/md5 \ sbin/sysctl \ usr.bin/diff \ usr.bin/awk \ usr.bin/basename \ usr.bin/bmake \ usr.bin/bzip2 \ usr.bin/cmp \ usr.bin/dirname \ usr.bin/env \ usr.bin/fetch \ usr.bin/find \ usr.bin/grep \ usr.bin/gzip \ usr.bin/id \ usr.bin/lex \ usr.bin/limits \ usr.bin/lorder \ ${_libopenbsd} \ ${_makewhatis} \ usr.bin/mktemp \ usr.bin/mt \ usr.bin/patch \ usr.bin/readelf \ usr.bin/sed \ usr.bin/sort \ usr.bin/tar \ usr.bin/touch \ usr.bin/tr \ usr.bin/true \ usr.bin/uniq \ usr.bin/unzip \ usr.bin/xargs \ usr.bin/xinstall \ usr.bin/xz \ usr.bin/yacc \ usr.sbin/chown ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJ}" ]; then ${NXBMAKE} DIRPRFX=${_tool}/ obj; fi; \ ${NXBMAKE} DIRPRFX=${_tool}/ all; \ ${NXBMAKE} DIRPRFX=${_tool}/ DESTDIR=${NXBDESTDIR} install .endfor # # hierarchy - ensure that all the needed directories are present # hierarchy hier: .MAKE .PHONY ${_+_}cd ${.CURDIR}/etc; 
${HMAKE} distrib-dirs # # libraries - build all libraries, and install them under ${DESTDIR}. # # The list of libraries with dependents (${_prebuild_libs}) and their # interdependencies (__L) are built automatically by the # ${.CURDIR}/tools/make_libdeps.sh script. # libraries: .MAKE .PHONY ${_+_}cd ${.CURDIR}; \ ${MAKE} -f Makefile.inc1 _prereq_libs; \ ${MAKE} -f Makefile.inc1 _startup_libs; \ ${MAKE} -f Makefile.inc1 _prebuild_libs; \ ${MAKE} -f Makefile.inc1 _generic_libs # # static libgcc.a prerequisite for shared libc # _prereq_libs= lib/libcompiler_rt .if ${MK_SSP} != "no" _prereq_libs+= gnu/lib/libssp/libssp_nonshared .endif # # The coverage libraries must be built for the target prior to ${_startup_libs} # for world to have runtime coverage instrumentation. # .if ${MK_COVERAGE} != "no" _coverage_libs.${MK_CLANG}+= lib/libclang_rt/profile _coverage_libs.${MK_GCC}+= gnu/lib/libgcov .endif _prereq_libs+= ${_coverage_libs.yes} # These dependencies are not automatically generated: # # gnu/lib/csu, gnu/lib/libgcc, lib/csu and lib/libc must be built before # all shared libraries for ELF. 
# _startup_libs= gnu/lib/csu _startup_libs+= lib/csu _startup_libs+= lib/libcompiler_rt _startup_libs+= lib/libc _startup_libs+= lib/libc_nonshared .if ${MK_LIBCPLUSPLUS} != "no" _startup_libs+= lib/libcxxrt .endif .if ${MK_LLVM_LIBUNWIND} != "no" _prereq_libs+= lib/libgcc_eh lib/libgcc_s _startup_libs+= lib/libgcc_eh lib/libgcc_s lib/libgcc_s__L: lib/libc__L lib/libgcc_s__L: lib/libc_nonshared__L .if ${MK_LIBCPLUSPLUS} != "no" lib/libcxxrt__L: lib/libgcc_s__L .endif .else # MK_LLVM_LIBUNWIND == no _prereq_libs+= gnu/lib/libgcc _startup_libs+= gnu/lib/libgcc gnu/lib/libgcc__L: lib/libc__L gnu/lib/libgcc__L: lib/libc_nonshared__L .if ${MK_LIBCPLUSPLUS} != "no" lib/libcxxrt__L: gnu/lib/libgcc__L .endif .endif _prebuild_libs= ${_kerberos5_lib_libasn1} \ ${_kerberos5_lib_libhdb} \ ${_kerberos5_lib_libheimbase} \ ${_kerberos5_lib_libheimntlm} \ ${_libsqlite3} \ ${_kerberos5_lib_libheimipcc} \ ${_kerberos5_lib_libhx509} ${_kerberos5_lib_libkrb5} \ ${_kerberos5_lib_libroken} \ ${_kerberos5_lib_libwind} \ lib/libbz2 ${_libcom_err} lib/libcrypt \ lib/libelf lib/libexpat \ lib/libfigpar \ ${_lib_libgssapi} \ lib/libkiconv lib/libkvm lib/liblzma lib/libmd lib/libnv \ ${_lib_casper} \ lib/ncurses/ncurses lib/ncurses/ncursesw \ lib/libopie lib/libpam/libpam ${_lib_libthr} \ ${_lib_libradius} lib/libsbuf lib/libtacplus \ lib/libgeom \ ${_cddl_lib_libumem} ${_cddl_lib_libnvpair} \ ${_cddl_lib_libuutil} \ ${_cddl_lib_libavl} \ ${_cddl_lib_libzfs_core} \ ${_cddl_lib_libctf} \ lib/libutil lib/libpjdlog ${_lib_libypclnt} lib/libz lib/msun \ ${_secure_lib_libcrypto} ${_lib_libldns} \ ${_secure_lib_libssh} ${_secure_lib_libssl} .if ${MK_GNUCXX} != "no" _prebuild_libs+= gnu/lib/libstdc++ gnu/lib/libsupc++ gnu/lib/libstdc++__L: lib/msun__L gnu/lib/libsupc++__L: gnu/lib/libstdc++__L .endif .if ${MK_DIALOG} != "no" _prebuild_libs+= gnu/lib/libdialog gnu/lib/libdialog__L: lib/msun__L lib/ncurses/ncursesw__L .endif .if ${MK_LIBCPLUSPLUS} != "no" _prebuild_libs+= lib/libc++ .endif 
lib/libgeom__L: lib/libexpat__L lib/libkvm__L: lib/libelf__L .if ${MK_LIBTHR} != "no" _lib_libthr= lib/libthr .endif .if ${MK_RADIUS_SUPPORT} != "no" _lib_libradius= lib/libradius .endif .if ${MK_OFED} != "no" _ofed_lib= contrib/ofed/usr.lib _prebuild_libs+= contrib/ofed/usr.lib/libosmcomp _prebuild_libs+= contrib/ofed/usr.lib/libopensm _prebuild_libs+= contrib/ofed/usr.lib/libibcommon _prebuild_libs+= contrib/ofed/usr.lib/libibverbs _prebuild_libs+= contrib/ofed/usr.lib/libibumad contrib/ofed/usr.lib/libopensm__L: lib/libthr__L contrib/ofed/usr.lib/libosmcomp__L: lib/libthr__L contrib/ofed/usr.lib/libibumad__L: contrib/ofed/usr.lib/libibcommon__L .endif .if ${MK_CASPER} != "no" _lib_casper= lib/libcasper .endif lib/libpjdlog__L: lib/libutil__L lib/libcasper__L: lib/libnv__L lib/liblzma__L: lib/libthr__L _generic_libs= ${_cddl_lib} gnu/lib ${_kerberos5_lib} lib ${_secure_lib} usr.bin/lex/lib ${_ofed_lib} .for _DIR in ${LOCAL_LIB_DIRS} .if exists(${.CURDIR}/${_DIR}/Makefile) && empty(_generic_libs:M${_DIR}) _generic_libs+= ${_DIR} .endif .endfor lib/libopie__L lib/libtacplus__L: lib/libmd__L .if ${MK_CDDL} != "no" _cddl_lib_libumem= cddl/lib/libumem _cddl_lib_libnvpair= cddl/lib/libnvpair _cddl_lib_libavl= cddl/lib/libavl _cddl_lib_libuutil= cddl/lib/libuutil .if ${MK_ZFS} != "no" _cddl_lib_libzfs_core= cddl/lib/libzfs_core cddl/lib/libzfs_core__L: cddl/lib/libnvpair__L .endif _cddl_lib_libctf= cddl/lib/libctf _cddl_lib= cddl/lib cddl/lib/libctf__L: lib/libz__L .endif # cddl/lib/libdtrace requires lib/libproc and lib/librtld_db; it's only built # on select architectures though (see cddl/lib/Makefile) .if ${MACHINE_CPUARCH} != "sparc64" _prebuild_libs+= lib/libprocstat lib/libproc lib/librtld_db lib/libprocstat__L: lib/libelf__L lib/libkvm__L lib/libutil__L lib/libproc__L: lib/libprocstat__L lib/librtld_db__L: lib/libprocstat__L .endif .if ${MK_CRYPT} != "no" .if ${MK_OPENSSL} != "no" _secure_lib_libcrypto= secure/lib/libcrypto _secure_lib_libssl= secure/lib/libssl 
lib/libradius__L secure/lib/libssl__L: secure/lib/libcrypto__L .if ${MK_LDNS} != "no" _lib_libldns= lib/libldns lib/libldns__L: secure/lib/libcrypto__L .endif .if ${MK_OPENSSH} != "no" _secure_lib_libssh= secure/lib/libssh secure/lib/libssh__L: lib/libz__L secure/lib/libcrypto__L lib/libcrypt__L .if ${MK_LDNS} != "no" secure/lib/libssh__L: lib/libldns__L .endif .if ${MK_GSSAPI} != "no" && ${MK_KERBEROS_SUPPORT} != "no" secure/lib/libssh__L: lib/libgssapi__L kerberos5/lib/libkrb5__L \ kerberos5/lib/libhx509__L kerberos5/lib/libasn1__L lib/libcom_err__L \ lib/libmd__L kerberos5/lib/libroken__L .endif .endif .endif _secure_lib= secure/lib .endif .if ${MK_KERBEROS} != "no" kerberos5/lib/libasn1__L: lib/libcom_err__L kerberos5/lib/libroken__L kerberos5/lib/libhdb__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ kerberos5/lib/libkrb5__L kerberos5/lib/libroken__L \ kerberos5/lib/libwind__L lib/libsqlite3__L kerberos5/lib/libheimntlm__L: secure/lib/libcrypto__L kerberos5/lib/libkrb5__L \ kerberos5/lib/libroken__L lib/libcom_err__L kerberos5/lib/libhx509__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ secure/lib/libcrypto__L kerberos5/lib/libroken__L kerberos5/lib/libwind__L kerberos5/lib/libkrb5__L: kerberos5/lib/libasn1__L lib/libcom_err__L \ lib/libcrypt__L secure/lib/libcrypto__L kerberos5/lib/libhx509__L \ kerberos5/lib/libroken__L kerberos5/lib/libwind__L \ kerberos5/lib/libheimbase__L kerberos5/lib/libheimipcc__L kerberos5/lib/libroken__L: lib/libcrypt__L kerberos5/lib/libwind__L: kerberos5/lib/libroken__L lib/libcom_err__L kerberos5/lib/libheimbase__L: lib/libthr__L kerberos5/lib/libheimipcc__L: kerberos5/lib/libroken__L kerberos5/lib/libheimbase__L lib/libthr__L .endif lib/libsqlite3__L: lib/libthr__L .if ${MK_GSSAPI} != "no" _lib_libgssapi= lib/libgssapi .endif .if ${MK_KERBEROS} != "no" _kerberos5_lib= kerberos5/lib _kerberos5_lib_libasn1= kerberos5/lib/libasn1 _kerberos5_lib_libhdb= kerberos5/lib/libhdb _kerberos5_lib_libheimbase= kerberos5/lib/libheimbase 
_kerberos5_lib_libkrb5= kerberos5/lib/libkrb5 _kerberos5_lib_libhx509= kerberos5/lib/libhx509 _kerberos5_lib_libroken= kerberos5/lib/libroken _kerberos5_lib_libheimntlm= kerberos5/lib/libheimntlm _libsqlite3= lib/libsqlite3 _kerberos5_lib_libheimipcc= kerberos5/lib/libheimipcc _kerberos5_lib_libwind= kerberos5/lib/libwind _libcom_err= lib/libcom_err .endif .if ${MK_NIS} != "no" _lib_libypclnt= lib/libypclnt .endif .if ${MK_OPENSSL} == "no" lib/libradius__L: lib/libmd__L .endif lib/libproc__L: \ ${_cddl_lib_libctf:D${_cddl_lib_libctf}__L} lib/libelf__L lib/librtld_db__L lib/libutil__L .if ${MK_CXX} != "no" .if ${MK_LIBCPLUSPLUS} != "no" lib/libproc__L: lib/libcxxrt__L .else # This implies MK_GNUCXX != "no"; see lib/libproc lib/libproc__L: gnu/lib/libsupc++__L .endif .endif .for _lib in ${_prereq_libs} ${_lib}__PL: .PHONY .MAKE .if exists(${.CURDIR}/${_lib}) ${_+_}@${ECHODIR} "===> ${_lib} (obj,all,install)"; \ cd ${.CURDIR}/${_lib}; \ if [ -z "${NO_OBJ}" ]; then ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ obj; fi; \ ${MAKE} MK_TESTS=no MK_PROFILE=no -DNO_PIC \ DIRPRFX=${_lib}/ all; \ ${MAKE} MK_TESTS=no MK_PROFILE=no -DNO_PIC \ DIRPRFX=${_lib}/ install .endif .endfor .for _lib in ${_startup_libs} ${_prebuild_libs} ${_generic_libs} ${_lib}__L: .PHONY .MAKE .if exists(${.CURDIR}/${_lib}) ${_+_}@${ECHODIR} "===> ${_lib} (obj,all,install)"; \ cd ${.CURDIR}/${_lib}; \ if [ -z "${NO_OBJ}" ]; then ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ obj; fi; \ ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ all; \ ${MAKE} MK_TESTS=no DIRPRFX=${_lib}/ install .endif .endfor _prereq_libs: ${_prereq_libs:S/$/__PL/} _startup_libs: ${_startup_libs:S/$/__L/} _prebuild_libs: ${_prebuild_libs:S/$/__L/} _generic_libs: ${_generic_libs:S/$/__L/} # Enable SUBDIR_PARALLEL when not calling 'make all', unless called from # 'everything' with _PARALLEL_SUBDIR_OK set. 
This is because it is unlikely # that running 'make all' from the top-level, especially with a SUBDIR_OVERRIDE # or LOCAL_DIRS set, will have a reliable build if SUBDIRs are built in # parallel. This is safe for the world stage of buildworld though since it has # already built libraries in a proper order and installed includes into # WORLDTMP. Special handling is done for SUBDIR ordering for 'install*' to # avoid trashing a system if it crashes mid-install. .if !make(all) || defined(_PARALLEL_SUBDIR_OK) SUBDIR_PARALLEL= .endif .include .if make(check-old) || make(check-old-dirs) || \ make(check-old-files) || make(check-old-libs) || \ make(delete-old) || make(delete-old-dirs) || \ make(delete-old-files) || make(delete-old-libs) # # check for / delete old files section # .include "ObsoleteFiles.inc" OLD_LIBS_MESSAGE="Please be sure no application still uses those libraries, \ else you can not start such an application. Consult UPDATING for more \ information regarding how to cope with the removal/revision bump of a \ specific library." .if !defined(BATCH_DELETE_OLD_FILES) RM_I=-i .else RM_I=-v .endif delete-old-files: .PHONY @echo ">>> Removing old files (only deletes safe to delete libs)" # Ask for every old file if the user really wants to remove it. # It's annoying, but better safe than sorry. # NB: We cannot pass the list of OLD_FILES as a parameter because the # argument list will get too long. Using .for/.endfor make "loops" will make # the Makefile parser segfault. @exec 3<&0; \ cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_FILES -V "OLD_FILES:Musr/share/*.gz:R" | xargs -n1 | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ chflags noschg "${DESTDIR}/$${file}" 2>/dev/null || true; \ rm ${RM_I} "${DESTDIR}/$${file}" <&3; \ fi; \ for ext in debug symbols; do \ if ! 
[ -e "${DESTDIR}/$${file}" ] && [ -f \ "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ rm ${RM_I} "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" \ <&3; \ fi; \ done; \ done # Remove catpages without corresponding manpages. @exec 3<&0; \ find ${DESTDIR}/usr/share/man/cat* ! -type d 2>/dev/null | \ sed -ep -e's:${DESTDIR}/usr/share/man/cat:${DESTDIR}/usr/share/man/man:' | \ while read catpage; do \ read manpage; \ if [ ! -e "$${manpage}" ]; then \ rm ${RM_I} $${catpage} <&3; \ fi; \ done @echo ">>> Old files removed" check-old-files: .PHONY @echo ">>> Checking for old files" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_FILES -V "OLD_FILES:Musr/share/*.gz:R" | xargs -n1 | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ echo "${DESTDIR}/$${file}"; \ fi; \ for ext in debug symbols; do \ if [ -f "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}"; \ fi; \ done; \ done # Check for catpages without corresponding manpages. - @find ${DESTDIR}/usr/share/man/cat* ! -type d | \ + @find ${DESTDIR}/usr/share/man/cat* ! -type d 2>/dev/null | \ sed -ep -e's:${DESTDIR}/usr/share/man/cat:${DESTDIR}/usr/share/man/man:' | \ while read catpage; do \ read manpage; \ if [ ! -e "$${manpage}" ]; then \ echo $${catpage}; \ fi; \ done delete-old-libs: .PHONY @echo ">>> Removing old libraries" @echo "${OLD_LIBS_MESSAGE}" | fmt @exec 3<&0; \ cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_LIBS | xargs -n1 | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ chflags noschg "${DESTDIR}/$${file}" 2>/dev/null || true; \ rm ${RM_I} "${DESTDIR}/$${file}" <&3; \ fi; \ for ext in debug symbols; do \ if ! 
[ -e "${DESTDIR}/$${file}" ] && [ -f \ "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ rm ${RM_I} "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" \ <&3; \ fi; \ done; \ done @echo ">>> Old libraries removed" check-old-libs: .PHONY @echo ">>> Checking for old libraries" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_LIBS | xargs -n1 | \ while read file; do \ if [ -f "${DESTDIR}/$${file}" -o -L "${DESTDIR}/$${file}" ]; then \ echo "${DESTDIR}/$${file}"; \ fi; \ for ext in debug symbols; do \ if [ -f "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${file}.$${ext}"; \ fi; \ done; \ done delete-old-dirs: .PHONY @echo ">>> Removing old directories" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_DIRS | xargs -n1 | sort -r | \ while read dir; do \ if [ -d "${DESTDIR}/$${dir}" ]; then \ rmdir -v "${DESTDIR}/$${dir}" || true; \ elif [ -L "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ if [ -d "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ rmdir -v "${DESTDIR}${DEBUGDIR}/$${dir}" || true; \ elif [ -L "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ done @echo ">>> Old directories removed" check-old-dirs: .PHONY @echo ">>> Checking for old directories" @cd ${.CURDIR}; \ ${MAKE} -f ${.CURDIR}/Makefile.inc1 ${.MAKEFLAGS} ${.TARGET} \ -V OLD_DIRS | xargs -n1 | \ while read dir; do \ if [ -d "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir}"; \ elif [ -L "${DESTDIR}/$${dir}" ]; then \ echo "${DESTDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ if [ -d "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir}"; \ elif [ -L "${DESTDIR}${DEBUGDIR}/$${dir}" ]; then \ echo "${DESTDIR}${DEBUGDIR}/$${dir} is a link, please remove everything manually."; \ fi; \ done delete-old: 
delete-old-files delete-old-dirs .PHONY @echo "To remove old libraries run '${MAKE_CMD} delete-old-libs'." check-old: check-old-files check-old-libs check-old-dirs .PHONY @echo "To remove old files and directories run '${MAKE_CMD} delete-old'." @echo "To remove old libraries run '${MAKE_CMD} delete-old-libs'." .endif # # showconfig - show build configuration. # showconfig: .PHONY @(${MAKE} -n -f ${.CURDIR}/sys/conf/kern.opts.mk -V dummy -dg1 UPDATE_DEPENDFILE=no NO_OBJ=yes; \ ${MAKE} -n -f ${.CURDIR}/share/mk/src.opts.mk -V dummy -dg1 UPDATE_DEPENDFILE=no NO_OBJ=yes) 2>&1 | grep ^MK_ | sort -u .if !empty(KRNLOBJDIR) && !empty(KERNCONF) DTBOUTPUTPATH= ${KRNLOBJDIR}/${KERNCONF}/ .if !defined(FDT_DTS_FILE) || empty(FDT_DTS_FILE) .if exists(${KERNCONFDIR}/${KERNCONF}) FDT_DTS_FILE!= awk 'BEGIN {FS="="} /^makeoptions[[:space:]]+FDT_DTS_FILE/ {print $$2}' \ '${KERNCONFDIR}/${KERNCONF}' ; echo .endif .endif .endif .if !defined(DTBOUTPUTPATH) || !exists(${DTBOUTPUTPATH}) DTBOUTPUTPATH= ${.CURDIR} .endif # # Build 'standalone' Device Tree Blob # builddtb: .PHONY @PATH=${TMPPATH} MACHINE=${TARGET} \ ${.CURDIR}/sys/tools/fdt/make_dtb.sh ${.CURDIR}/sys \ "${FDT_DTS_FILE}" ${DTBOUTPUTPATH} ############### # cleanworld # In the following, the first 'rm' in a series will usually remove all # files and directories. If it does not, then there are probably some # files with file flags set, so this unsets them and tries the 'rm' a # second time. There are situations where this target will be cleaning # some directories via more than one method, but that duplication is # needed to correctly handle all the possible situations. Removing all # files without file flags set in the first 'rm' instance saves time, # because 'chflags' will need to operate on fewer files afterwards. # # It is expected that BW_CANONICALOBJDIR == the CANONICALOBJDIR as would be # created by bsd.obj.mk, except that we don't want to .include that file # in this makefile. 
# BW_CANONICALOBJDIR:=${OBJTREE}${.CURDIR} cleanworld: .PHONY .if exists(${BW_CANONICALOBJDIR}/) -rm -rf ${BW_CANONICALOBJDIR}/* -chflags -R 0 ${BW_CANONICALOBJDIR} rm -rf ${BW_CANONICALOBJDIR}/* .endif .if ${.CURDIR} == ${.OBJDIR} || ${.CURDIR}/obj == ${.OBJDIR} # To be safe in this case, fall back to a 'make cleandir' ${_+_}@cd ${.CURDIR}; ${MAKE} cleandir .endif .if defined(TARGET) && defined(TARGET_ARCH) .if ${TARGET} == ${MACHINE} && ${TARGET_ARCH} == ${MACHINE_ARCH} XDEV_CPUTYPE?=${CPUTYPE} .else XDEV_CPUTYPE?=${TARGET_CPUTYPE} .endif NOFUN= MK_COVERAGE=no -DNO_FSCHG MK_HTML=no -DNO_LINT \ MK_MAN=no MK_NLS=no MK_PROFILE=no \ MK_KERBEROS=no MK_RESCUE=no MK_TESTS=no MK_WARNS=no \ TARGET=${TARGET} TARGET_ARCH=${TARGET_ARCH} \ CPUTYPE=${XDEV_CPUTYPE} XDDIR=${TARGET_ARCH}-freebsd XDTP?=/usr/${XDDIR} .if ${XDTP:N/*} .error XDTP variable should be an absolute path .endif CDBENV=MAKEOBJDIRPREFIX=${MAKEOBJDIRPREFIX}/${XDDIR} \ INSTALL="sh ${.CURDIR}/tools/install.sh" CDENV= ${CDBENV} \ TOOLS_PREFIX=${XDTP} .if ${WANT_COMPILER_TYPE} == gcc || \ (defined(X_COMPILER_TYPE) && ${X_COMPILER_TYPE} == gcc) # GCC requires -isystem and -L when using a cross-compiler. --sysroot # won't set header path and -L is used to ensure the base library path # is added before the port PREFIX library path. CD2CFLAGS+= -isystem ${XDDESTDIR}/usr/include -L${XDDESTDIR}/usr/lib # GCC requires -B to find /usr/lib/crti.o when using a cross-compiler # combined with --sysroot. CD2CFLAGS+= -B${XDDESTDIR}/usr/lib # Force using libc++ for external GCC. 
.if ${X_COMPILER_TYPE} == gcc && ${X_COMPILER_VERSION} >= 40800 CD2CXXFLAGS+= -isystem ${XDDESTDIR}/usr/include/c++/v1 -std=c++11 \ -nostdinc++ .endif .endif CD2CFLAGS+= --sysroot=${XDDESTDIR}/ CD2ENV=${CDENV} CC="${CC} ${CD2CFLAGS}" CXX="${CXX} ${CD2CXXFLAGS} ${CD2CFLAGS}" \ CPP="${CPP} ${CD2CFLAGS}" \ MACHINE=${TARGET} MACHINE_ARCH=${TARGET_ARCH} CDTMP= ${MAKEOBJDIRPREFIX}/${XDDIR}/${.CURDIR}/tmp CDMAKE=${CDENV} PATH=${CDTMP}/usr/bin:${PATH} ${MAKE} ${NOFUN} CD2MAKE=${CD2ENV} PATH=${CDTMP}/usr/bin:${XDDESTDIR}/usr/bin:${PATH} ${MAKE} ${NOFUN} .if ${MK_META_MODE} != "no" # Don't rebuild build-tools targets during normal build. CD2MAKE+= BUILD_TOOLS_META=.NOMETA .endif XDDESTDIR=${DESTDIR}/${XDTP} .ORDER: xdev-build xdev-install xdev-links xdev: xdev-build xdev-install .PHONY .ORDER: _xb-worldtmp _xb-bootstrap-tools _xb-build-tools _xb-cross-tools xdev-build: _xb-worldtmp _xb-bootstrap-tools _xb-build-tools _xb-cross-tools .PHONY _xb-worldtmp: .PHONY mkdir -p ${CDTMP}/usr mtree -deU -f ${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${CDTMP}/usr >/dev/null _xb-bootstrap-tools: .PHONY .for _tool in \ ${_clang_tblgen} \ ${_gperf} \ ${_yacc} ${_+_}@${ECHODIR} "===> ${_tool} (obj,all,install)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJ}" ]; then ${CDMAKE} DIRPRFX=${_tool}/ obj; fi; \ ${CDMAKE} DIRPRFX=${_tool}/ all; \ ${CDMAKE} DIRPRFX=${_tool}/ DESTDIR=${CDTMP} install .endfor _xb-build-tools: .PHONY ${_+_}@cd ${.CURDIR}; \ ${CDBENV} ${MAKE} -f Makefile.inc1 ${NOFUN} build-tools _xb-cross-tools: .PHONY .for _tool in \ ${_binutils} \ ${_elftctools} \ usr.bin/ar \ ${_clang_libs} \ ${_clang} \ ${_gcc} ${_+_}@${ECHODIR} "===> xdev ${_tool} (obj,all)"; \ cd ${.CURDIR}/${_tool}; \ if [ -z "${NO_OBJ}" ]; then ${CDMAKE} DIRPRFX=${_tool}/ obj; fi; \ ${CDMAKE} DIRPRFX=${_tool}/ all .endfor _xi-mtree: .PHONY ${_+_}@${ECHODIR} "mtree populating ${XDDESTDIR}" mkdir -p ${XDDESTDIR} mtree -deU -f ${.CURDIR}/etc/mtree/BSD.root.dist \ -p ${XDDESTDIR} >/dev/null mtree -deU -f 
${.CURDIR}/etc/mtree/BSD.usr.dist \ -p ${XDDESTDIR}/usr >/dev/null mtree -deU -f ${.CURDIR}/etc/mtree/BSD.include.dist \ -p ${XDDESTDIR}/usr/include >/dev/null .if defined(LIBCOMPAT) mtree -deU -f ${.CURDIR}/etc/mtree/BSD.lib${libcompat}.dist \ -p ${XDDESTDIR}/usr >/dev/null .endif .if ${MK_TESTS} != "no" mkdir -p ${XDDESTDIR}${TESTSBASE} mtree -deU -f ${.CURDIR}/etc/mtree/BSD.tests.dist \ -p ${XDDESTDIR}${TESTSBASE} >/dev/null .endif .ORDER: xdev-build _xi-mtree _xi-cross-tools _xi-includes _xi-libraries xdev-install: xdev-build _xi-mtree _xi-cross-tools _xi-includes _xi-libraries .PHONY _xi-cross-tools: .PHONY @echo "_xi-cross-tools" .for _tool in \ ${_binutils} \ ${_elftctools} \ usr.bin/ar \ ${_clang_libs} \ ${_clang} \ ${_gcc} ${_+_}@${ECHODIR} "===> xdev ${_tool} (install)"; \ cd ${.CURDIR}/${_tool}; \ ${CDMAKE} DIRPRFX=${_tool}/ install DESTDIR=${XDDESTDIR} .endfor _xi-includes: .PHONY ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 includes \ DESTDIR=${XDDESTDIR} _xi-libraries: .PHONY ${_+_}cd ${.CURDIR}; ${CD2MAKE} -f Makefile.inc1 libraries \ DESTDIR=${XDDESTDIR} xdev-links: .PHONY ${_+_}cd ${XDDESTDIR}/usr/bin; \ mkdir -p ../../../../usr/bin; \ for i in *; do \ ln -sf ../../${XDTP}/usr/bin/$$i \ ../../../../usr/bin/${XDDIR}-$$i; \ ln -sf ../../${XDTP}/usr/bin/$$i \ ../../../../usr/bin/${XDDIR}${_REVISION}-$$i; \ done .else xdev xdev-build xdev-install xdev-links: .PHONY @echo "*** Error: Both TARGET and TARGET_ARCH must be defined for \"${.TARGET}\" target" .endif Index: projects/runtime-coverage/sys/arm64/arm64/efirt_machdep.c =================================================================== --- projects/runtime-coverage/sys/arm64/arm64/efirt_machdep.c (nonexistent) +++ projects/runtime-coverage/sys/arm64/arm64/efirt_machdep.c (revision 324498) @@ -0,0 +1,252 @@ +/*- + * Copyright (c) 2004 Marcel Moolenaar + * Copyright (c) 2001 Doug Rabson + * Copyright (c) 2016 The FreeBSD Foundation + * Copyright (c) 2017 Andrew Turner + * All rights reserved. 
+ * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * This software was developed by SRI International and the University of + * Cambridge Computer Laboratory under DARPA/AFRL contract FA8750-10-C-0237 + * ("CTSRD"), as part of the DARPA CRASH research programme. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +
+static vm_object_t obj_1t1_pt;
+static vm_page_t efi_l0_page;
+static pd_entry_t *efi_l0;
+
+/*
+ * Tear down the EFI 1:1 page tables.
+ *
+ * When the backing object exists, every resident page has its wire count
+ * cleared, the global wired-page counter is reduced by the object's
+ * resident page count and the object reference is dropped.  The cached
+ * L0 pointers are reset unconditionally, so this is also usable as the
+ * failure path of efi_create_1t1_map() on a partially built map.
+ */
+void
+efi_destroy_1t1_map(void)
+{
+	vm_page_t m;
+
+	if (obj_1t1_pt != NULL) {
+		VM_OBJECT_RLOCK(obj_1t1_pt);
+		TAILQ_FOREACH(m, &obj_1t1_pt->memq, listq)
+			m->wire_count = 0;
+		atomic_subtract_int(&vm_cnt.v_wire_count,
+		    obj_1t1_pt->resident_page_count);
+		VM_OBJECT_RUNLOCK(obj_1t1_pt);
+		vm_object_deallocate(obj_1t1_pt);
+	}
+
+	obj_1t1_pt = NULL;
+	efi_l0 = NULL;
+	efi_l0_page = NULL;
+}
+
+/*
+ * Grab the page at index idx of the page-table object, wired and not
+ * busied, with a zeroed page requested.  Every caller in this file holds
+ * the object write lock around the call.
+ */
+static vm_page_t
+efi_1t1_page(vm_pindex_t idx)
+{
+
+	return (vm_page_grab(obj_1t1_pt, idx, VM_ALLOC_NOBUSY |
+	    VM_ALLOC_WIRED | VM_ALLOC_ZERO));
+}
+
+/*
+ * Return a pointer to the L3 entry that maps va in the EFI 1:1 tables,
+ * allocating the intermediate L1, L2 and L3 table pages on demand.
+ *
+ * Layout of the backing object, by page index:
+ *	0					the L0 table
+ *	1 + l0					L1 table for L0 slot l0
+ *	1 + L0_ENTRIES + ...			L2 tables, row-major (l0, l1)
+ *	1 + L0_ENTRIES +
+ *	    L0_ENTRIES * Ln_ENTRIES + ...	L3 tables, row-major (l0, l1, l2)
+ *
+ * Each table needs an index that no other table shares.  A product such
+ * as (l0 + 1) * (l1 + 1) does not provide that: (0, 1) and (1, 0) collide,
+ * which would make two tables share one physical page.  The row-major
+ * form below is unique provided each pmap_lN_index() result is smaller
+ * than the entry count of its level -- TODO confirm against pmap.
+ *
+ * The entry must not already be in use; this is asserted.
+ */
+static pt_entry_t *
+efi_1t1_l3(vm_offset_t va)
+{
+	pd_entry_t *l0, *l1, *l2;
+	pt_entry_t *l3;
+	vm_pindex_t l0_idx, l1_idx, l2_idx;
+	vm_page_t m;
+	vm_paddr_t mphys;
+
+	l0_idx = pmap_l0_index(va);
+	l0 = &efi_l0[l0_idx];
+	if (*l0 == 0) {
+		m = efi_1t1_page(1 + l0_idx);
+		mphys = VM_PAGE_TO_PHYS(m);
+		*l0 = mphys | L0_TABLE;
+	} else {
+		mphys = *l0 & ~ATTR_MASK;
+	}
+
+	l1 = (pd_entry_t *)PHYS_TO_DMAP(mphys);
+	l1_idx = pmap_l1_index(va);
+	l1 += l1_idx;
+	if (*l1 == 0) {
+		m = efi_1t1_page(1 + L0_ENTRIES + l0_idx * Ln_ENTRIES +
+		    l1_idx);
+		mphys = VM_PAGE_TO_PHYS(m);
+		*l1 = mphys | L1_TABLE;
+	} else {
+		mphys = *l1 & ~ATTR_MASK;
+	}
+
+	l2 = (pd_entry_t *)PHYS_TO_DMAP(mphys);
+	l2_idx = pmap_l2_index(va);
+	l2 += l2_idx;
+	if (*l2 == 0) {
+		m = efi_1t1_page(1 + L0_ENTRIES + L0_ENTRIES * Ln_ENTRIES +
+		    (l0_idx * Ln_ENTRIES + l1_idx) * Ln_ENTRIES + l2_idx);
+		mphys = VM_PAGE_TO_PHYS(m);
+		*l2 = mphys | L2_TABLE;
+	} else {
+		mphys = *l2 & ~ATTR_MASK;
+	}
+
+	l3 = (pt_entry_t *)PHYS_TO_DMAP(mphys);
+	l3 += pmap_l3_index(va);
+	/* %jx takes uintmax_t, so widen both arguments explicitly. */
+	KASSERT(*l3 == 0, ("%s: Already mapped: va %#jx *pt %#jx", __func__,
+	    (uintmax_t)va, (uintmax_t)*l3));
+
+	return (l3);
+}
+
+/*
+ * Create the 1:1 virtual to physical map for EFI
+ *
+ * map points at the first of ndesc memory descriptors, each descsz bytes
+ * apart.  Only descriptors flagged EFI_MD_ATTR_RT are mapped; each of
+ * their pages is entered read/write at a virtual address equal to its
+ * physical address.
+ *
+ * Returns true on success.  Returns false, after destroying whatever was
+ * built, if a runtime descriptor already has a virtual address, is not
+ * page aligned, or does not end below VM_MAXUSER_ADDRESS.
+ */
+bool
+efi_create_1t1_map(struct efi_md *map, int ndesc, int descsz)
+{
+	struct efi_md *p;
+	pt_entry_t *l3;
+	vm_offset_t va;
+	uint64_t idx;
+	int i, mode;
+
+	/*
+	 * NOTE(review): the size passed here is a count of table entries;
+	 * confirm whether vm_pager_allocate() expects bytes instead.
+	 */
+	obj_1t1_pt = vm_pager_allocate(OBJT_PHYS, NULL, L0_ENTRIES +
+	    L0_ENTRIES * Ln_ENTRIES + L0_ENTRIES * Ln_ENTRIES * Ln_ENTRIES +
+	    L0_ENTRIES * Ln_ENTRIES * Ln_ENTRIES * Ln_ENTRIES,
+	    VM_PROT_ALL, 0, NULL);
+	VM_OBJECT_WLOCK(obj_1t1_pt);
+	efi_l0_page = efi_1t1_page(0);
+	VM_OBJECT_WUNLOCK(obj_1t1_pt);
+	efi_l0 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(efi_l0_page));
+	bzero(efi_l0, L0_ENTRIES * sizeof(*efi_l0));
+
+	for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p,
+	    descsz)) {
+		if ((p->md_attr & EFI_MD_ATTR_RT) == 0)
+			continue;
+		if (p->md_virt != NULL) {
+			if (bootverbose)
+				printf("EFI Runtime entry %d is mapped\n", i);
+			goto fail;
+		}
+		if ((p->md_phys & EFI_PAGE_MASK) != 0) {
+			if (bootverbose)
+				printf("EFI Runtime entry %d is not aligned\n",
+				    i);
+			goto fail;
+		}
+		/* Reject ranges that wrap or reach past the user VA space. */
+		if (p->md_phys + p->md_pages * EFI_PAGE_SIZE < p->md_phys ||
+		    p->md_phys + p->md_pages * EFI_PAGE_SIZE >=
+		    VM_MAXUSER_ADDRESS) {
+			printf("EFI Runtime entry %d is not in mappable for RT:"
+			    "base %#016jx %#jx pages\n",
+			    i, (uintmax_t)p->md_phys,
+			    (uintmax_t)p->md_pages);
+			goto fail;
+		}
+		/* Pick the first cacheability attribute the entry offers. */
+		if ((p->md_attr & EFI_MD_ATTR_WB) != 0)
+			mode = VM_MEMATTR_WRITE_BACK;
+		else if ((p->md_attr & EFI_MD_ATTR_WT) != 0)
+			mode = VM_MEMATTR_WRITE_THROUGH;
+		else if ((p->md_attr & EFI_MD_ATTR_WC) != 0)
+			mode = VM_MEMATTR_WRITE_COMBINING;
+		else if ((p->md_attr & EFI_MD_ATTR_UC) != 0)
+			mode = VM_MEMATTR_UNCACHEABLE;
+		else {
+			if (bootverbose)
+				printf("EFI Runtime entry %d mapping "
+				    "attributes unsupported\n", i);
+			mode = VM_MEMATTR_UNCACHEABLE;
+		}
+
+		/* Per-entry trace; keep it off the console on normal boots. */
+		if (bootverbose)
+			printf("MAP %lx mode %x pages %lu\n", p->md_phys,
+			    mode, p->md_pages);
+		VM_OBJECT_WLOCK(obj_1t1_pt);
+		for (va = p->md_phys, idx = 0; idx < p->md_pages; idx++,
+		    va += PAGE_SIZE) {
+			l3 = efi_1t1_l3(va);
+			*l3 = va | ATTR_DEFAULT | ATTR_IDX(mode) |
+			    ATTR_AP(ATTR_AP_RW) | L3_PAGE;
+		}
+		VM_OBJECT_WUNLOCK(obj_1t1_pt);
+	}
+
+	return (true);
+fail:
+	efi_destroy_1t1_map();
+	return (false);
+}
+
+/*
+ * Point TTBR0_EL1 at the EFI 1:1 L0 table, then invalidate the TLB
+ * (tlbi vmalle1is) with the surrounding barriers.  Always returns 0.
+ */
+int
+efi_arch_enter(void)
+{
+
+	__asm __volatile(
+	    "msr ttbr0_el1, %0 \n"
+	    "dsb ishst \n"
+	    "tlbi vmalle1is \n"
+	    "dsb ish \n"
+	    "isb \n"
+	    : : "r"(VM_PAGE_TO_PHYS(efi_l0_page)));
+
+	return (0);
+}
+
+/*
+ * Undo efi_arch_enter(): reload TTBR0_EL1 from the current thread's
+ * process (p_md.md_l0addr) and invalidate the TLB again.
+ */
+void
+efi_arch_leave(void)
+{
+	struct thread *td;
+
+	td = curthread;
+	__asm __volatile(
+	    "msr ttbr0_el1, %0 \n"
+	    "dsb ishst \n"
+	    "tlbi vmalle1is \n"
+	    "dsb ish \n"
+	    "isb \n"
+	    : : "r"(td->td_proc->p_md.md_l0addr));
+}
Property changes on: projects/runtime-coverage/sys/arm64/arm64/efirt_machdep.c ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: projects/runtime-coverage/sys/arm64/arm64/machdep.c =================================================================== --- projects/runtime-coverage/sys/arm64/arm64/machdep.c (revision 324497) +++ projects/runtime-coverage/sys/arm64/arm64/machdep.c (revision 324498) @@ -1,1172 +1,1180 @@ /*- * Copyright (c) 2014 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include "opt_acpi.h" #include "opt_platform.h" #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VFP #include #endif #ifdef DEV_ACPI #include #include #endif #ifdef FDT #include #include #endif enum arm64_bus arm64_bus_method = ARM64_BUS_NONE; struct pcpu __pcpu[MAXCPU]; static struct trapframe proc0_tf; vm_paddr_t phys_avail[PHYS_AVAIL_SIZE + 2]; vm_paddr_t dump_avail[PHYS_AVAIL_SIZE + 2]; int early_boot = 1; int cold = 1; long realmem = 0; long Maxmem = 0; #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t physmap[PHYSMAP_SIZE]; u_int physmap_idx; struct kva_md_info kmi; int64_t dcache_line_size; /* The minimum D cache line size */ int64_t icache_line_size; /* The minimum I cache line size */ int64_t idcache_line_size; /* The minimum cache line size */ int64_t dczva_line_size; /* The size of cache line the dc zva zeroes */ int has_pan; +/* + * Physical address of the EFI System Table. Stashed from the metadata hints + * passed into the kernel and used by the EFI code to call runtime services. 
+ */ +vm_paddr_t efi_systbl_phys; + /* pagezero_* implementations are provided in support.S */ void pagezero_simple(void *); void pagezero_cache(void *); /* pagezero_simple is default pagezero */ void (*pagezero)(void *p) = pagezero_simple; static void pan_setup(void) { uint64_t id_aa64mfr1; id_aa64mfr1 = READ_SPECIALREG(id_aa64mmfr1_el1); if (ID_AA64MMFR1_PAN(id_aa64mfr1) != ID_AA64MMFR1_PAN_NONE) has_pan = 1; } void pan_enable(void) { /* * The LLVM integrated assembler doesn't understand the PAN * PSTATE field. Because of this we need to manually create * the instruction in an asm block. This is equivalent to: * msr pan, #1 * * This sets the PAN bit, stopping the kernel from accessing * memory when userspace can also access it unless the kernel * uses the userspace load/store instructions. */ if (has_pan) { WRITE_SPECIALREG(sctlr_el1, READ_SPECIALREG(sctlr_el1) & ~SCTLR_SPAN); __asm __volatile(".inst 0xd500409f | (0x1 << 8)"); } } static void cpu_startup(void *dummy) { undef_init(); identify_cpu(); vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); } SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); int cpu_idle_wakeup(int cpu) { return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *frame; frame = td->td_frame; regs->sp = frame->tf_sp; regs->lr = frame->tf_lr; regs->elr = frame->tf_elr; regs->spsr = frame->tf_spsr; memcpy(regs->x, frame->tf_x, sizeof(regs->x)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *frame; frame = td->td_frame; frame->tf_sp = regs->sp; frame->tf_lr = regs->lr; frame->tf_elr = regs->elr; frame->tf_spsr = regs->spsr; memcpy(frame->tf_x, regs->x, sizeof(frame->tf_x)); return (0); } int fill_fpregs(struct thread *td, struct fpreg *regs) { #ifdef VFP struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_fpflags & PCB_FP_STARTED) != 0) { /* * If we have just been running VFP instructions we will * need to save the state to memcpy it below. 
*/ if (td == curthread) vfp_save_state(td, pcb); KASSERT(pcb->pcb_fpusaved == &pcb->pcb_fpustate, ("Called fill_fpregs while the kernel is using the VFP")); memcpy(regs->fp_q, pcb->pcb_fpustate.vfp_regs, sizeof(regs->fp_q)); regs->fp_cr = pcb->pcb_fpustate.vfp_fpcr; regs->fp_sr = pcb->pcb_fpustate.vfp_fpsr; } else #endif memset(regs->fp_q, 0, sizeof(regs->fp_q)); return (0); } int set_fpregs(struct thread *td, struct fpreg *regs) { #ifdef VFP struct pcb *pcb; pcb = td->td_pcb; KASSERT(pcb->pcb_fpusaved == &pcb->pcb_fpustate, ("Called set_fpregs while the kernel is using the VFP")); memcpy(pcb->pcb_fpustate.vfp_regs, regs->fp_q, sizeof(regs->fp_q)); pcb->pcb_fpustate.vfp_fpcr = regs->fp_cr; pcb->pcb_fpustate.vfp_fpsr = regs->fp_sr; #endif return (0); } int fill_dbregs(struct thread *td, struct dbreg *regs) { printf("ARM64TODO: fill_dbregs"); return (EDOOFUS); } int set_dbregs(struct thread *td, struct dbreg *regs) { printf("ARM64TODO: set_dbregs"); return (EDOOFUS); } int ptrace_set_pc(struct thread *td, u_long addr) { printf("ARM64TODO: ptrace_set_pc"); return (EDOOFUS); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_spsr |= PSR_SS; td->td_pcb->pcb_flags |= PCB_SINGLE_STEP; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_spsr &= ~PSR_SS; td->td_pcb->pcb_flags &= ~PCB_SINGLE_STEP; return (0); } void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf = td->td_frame; memset(tf, 0, sizeof(struct trapframe)); /* * We need to set x0 for init as it doesn't call * cpu_set_syscall_retval to copy the value. We also * need to set td_retval for the cases where we do. 
*/ tf->tf_x[0] = td->td_retval[0] = stack; tf->tf_sp = STACKALIGN(stack); tf->tf_lr = imgp->entry_addr; tf->tf_elr = imgp->entry_addr; } /* Sanity check these are the same size, they will be memcpy'd to and fro */ CTASSERT(sizeof(((struct trapframe *)0)->tf_x) == sizeof((struct gpregs *)0)->gp_x); CTASSERT(sizeof(((struct trapframe *)0)->tf_x) == sizeof((struct reg *)0)->x); int get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret) { struct trapframe *tf = td->td_frame; if (clear_ret & GET_MC_CLEAR_RET) { mcp->mc_gpregs.gp_x[0] = 0; mcp->mc_gpregs.gp_spsr = tf->tf_spsr & ~PSR_C; } else { mcp->mc_gpregs.gp_x[0] = tf->tf_x[0]; mcp->mc_gpregs.gp_spsr = tf->tf_spsr; } memcpy(&mcp->mc_gpregs.gp_x[1], &tf->tf_x[1], sizeof(mcp->mc_gpregs.gp_x[1]) * (nitems(mcp->mc_gpregs.gp_x) - 1)); mcp->mc_gpregs.gp_sp = tf->tf_sp; mcp->mc_gpregs.gp_lr = tf->tf_lr; mcp->mc_gpregs.gp_elr = tf->tf_elr; return (0); } int set_mcontext(struct thread *td, mcontext_t *mcp) { struct trapframe *tf = td->td_frame; memcpy(tf->tf_x, mcp->mc_gpregs.gp_x, sizeof(tf->tf_x)); tf->tf_sp = mcp->mc_gpregs.gp_sp; tf->tf_lr = mcp->mc_gpregs.gp_lr; tf->tf_elr = mcp->mc_gpregs.gp_elr; tf->tf_spsr = mcp->mc_gpregs.gp_spsr; return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { #ifdef VFP struct pcb *curpcb; critical_enter(); curpcb = curthread->td_pcb; if ((curpcb->pcb_fpflags & PCB_FP_STARTED) != 0) { /* * If we have just been running VFP instructions we will * need to save the state to memcpy it below. 
*/ vfp_save_state(td, curpcb); KASSERT(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate, ("Called get_fpcontext while the kernel is using the VFP")); KASSERT((curpcb->pcb_fpflags & ~PCB_FP_USERMASK) == 0, ("Non-userspace FPU flags set in get_fpcontext")); memcpy(mcp->mc_fpregs.fp_q, curpcb->pcb_fpustate.vfp_regs, sizeof(mcp->mc_fpregs)); mcp->mc_fpregs.fp_cr = curpcb->pcb_fpustate.vfp_fpcr; mcp->mc_fpregs.fp_sr = curpcb->pcb_fpustate.vfp_fpsr; mcp->mc_fpregs.fp_flags = curpcb->pcb_fpflags; mcp->mc_flags |= _MC_FP_VALID; } critical_exit(); #endif } static void set_fpcontext(struct thread *td, mcontext_t *mcp) { #ifdef VFP struct pcb *curpcb; critical_enter(); if ((mcp->mc_flags & _MC_FP_VALID) != 0) { curpcb = curthread->td_pcb; /* * Discard any vfp state for the current thread, we * are about to override it. */ vfp_discard(td); KASSERT(curpcb->pcb_fpusaved == &curpcb->pcb_fpustate, ("Called set_fpcontext while the kernel is using the VFP")); memcpy(curpcb->pcb_fpustate.vfp_regs, mcp->mc_fpregs.fp_q, sizeof(mcp->mc_fpregs)); curpcb->pcb_fpustate.vfp_fpcr = mcp->mc_fpregs.fp_cr; curpcb->pcb_fpustate.vfp_fpsr = mcp->mc_fpregs.fp_sr; curpcb->pcb_fpflags = mcp->mc_fpregs.fp_flags & PCB_FP_USERMASK; } critical_exit(); #endif } void cpu_idle(int busy) { spinlock_enter(); if (!busy) cpu_idleclock(); if (!sched_runnable()) __asm __volatile( "dsb sy \n" "wfi \n"); if (!busy) cpu_activeclock(); spinlock_exit(); } void cpu_halt(void) { /* We should have shutdown by now, if not enter a low power sleep */ intr_disable(); while (1) { __asm __volatile("wfi"); } } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* ARM64TODO TBD */ } /* Get current clock frequency for the given CPU ID. 
*/
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
	struct pcpu *pc;

	pc = pcpu_find(cpu_id);
	/* Unknown CPU or no place to store the result. */
	if (pc == NULL || rate == NULL)
		return (EINVAL);

	/* A zero pc_clock is treated as "frequency not known". */
	if (pc->pc_clock == 0)
		return (EOPNOTSUPP);

	*rate = pc->pc_clock;
	return (0);
}

/* Machine-dependent pcpu initialisation: mark the ACPI id as not yet set. */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * Enter a spinlock section.  On the outermost entry interrupts are
 * disabled and the previous DAIF state is remembered in the thread; nested
 * entries only bump the count.  A critical section is entered every time.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t daif;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		daif = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_daif = daif;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

/*
 * Leave a spinlock section: exit the critical section, drop the nesting
 * count and, when it reaches zero, restore the DAIF state saved by the
 * matching outermost spinlock_enter().
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t daif;

	td = curthread;
	critical_exit();
	/* Read the saved state before the count is dropped. */
	daif = td->td_md.md_saved_daif;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(daif);
}

#ifndef _SYS_SYSPROTO_H_
struct sigreturn_args {
	ucontext_t *ucp;
};
#endif

/*
 * sigreturn(2): copy the user context in, validate the saved SPSR and
 * restore the general purpose, floating-point and signal-mask state from
 * it.  Returns EFAULT when the context cannot be read, EINVAL when the
 * SPSR is not acceptable, and EJUSTRETURN on success.
 */
int
sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
	ucontext_t uc;
	uint32_t spsr;

	if (uap == NULL)
		return (EFAULT);
	if (copyin(uap->sigcntxp, &uc, sizeof(uc)))
		return (EFAULT);

	/*
	 * Only accept a context that returns to EL0t with none of the
	 * F/I/A/D mask bits set.
	 */
	spsr = uc.uc_mcontext.mc_gpregs.gp_spsr;
	if ((spsr & PSR_M_MASK) != PSR_M_EL0t ||
	    (spsr & (PSR_F | PSR_I | PSR_A | PSR_D)) != 0)
		return (EINVAL);

	set_mcontext(td, &uc.uc_mcontext);
	set_fpcontext(td, &uc.uc_mcontext);

	/* Restore signal mask. */
	kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);

	return (EJUSTRETURN);
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
*/ void makectx(struct trapframe *tf, struct pcb *pcb) { int i; for (i = 0; i < PCB_LR; i++) pcb->pcb_x[i] = tf->tf_x[i]; pcb->pcb_x[PCB_LR] = tf->tf_lr; pcb->pcb_pc = tf->tf_elr; pcb->pcb_sp = tf->tf_sp; } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td; struct proc *p; struct trapframe *tf; struct sigframe *fp, frame; struct sigacts *psp; struct sysentvec *sysent; int code, onstack, sig; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; onstack = sigonstack(tf->tf_sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !onstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else { fp = (struct sigframe *)td->td_frame->tf_sp; } /* Make room, keeping the stack aligned */ fp--; fp = (struct sigframe *)STACKALIGN(fp); /* Fill in the frame to copy out */ get_mcontext(td, &frame.sf_uc.uc_mcontext, 0); get_fpcontext(td, &frame.sf_uc.uc_mcontext); frame.sf_si = ksi->ksi_info; frame.sf_uc.uc_sigmask = *mask; frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE; frame.sf_uc.uc_stack = td->td_sigstk; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(td->td_proc); /* Copy the sigframe out to the user's stack. */ if (copyout(&frame, fp, sizeof(*fp)) != 0) { /* Process has trashed its stack. Kill it. 
*/ CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp); PROC_LOCK(p); sigexit(td, SIGILL); } tf->tf_x[0]= sig; tf->tf_x[1] = (register_t)&fp->sf_si; tf->tf_x[2] = (register_t)&fp->sf_uc; tf->tf_elr = (register_t)catcher; tf->tf_sp = (register_t)fp; sysent = p->p_sysent; if (sysent->sv_sigcode_base != 0) tf->tf_lr = (register_t)sysent->sv_sigcode_base; else tf->tf_lr = (register_t)(sysent->sv_psstrings - *(sysent->sv_szsigcode)); CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_elr, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } static void init_proc0(vm_offset_t kstack) { struct pcpu *pcpup = &__pcpu[0]; proc_linkup0(&proc0, &thread0); thread0.td_kstack = kstack; thread0.td_pcb = (struct pcb *)(thread0.td_kstack) - 1; thread0.td_pcb->pcb_fpflags = 0; thread0.td_pcb->pcb_fpusaved = &thread0.td_pcb->pcb_fpustate; thread0.td_pcb->pcb_vfpcpu = UINT_MAX; thread0.td_frame = &proc0_tf; pcpup->pc_curpcb = thread0.td_pcb; } typedef struct { uint32_t type; uint64_t phys_start; uint64_t virt_start; uint64_t num_pages; uint64_t attr; } EFI_MEMORY_DESCRIPTOR; static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, u_int *physmap_idxp) { u_int i, insert_idx, _physmap_idx; _physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. */ insert_idx = _physmap_idx; for (i = 0; i <= _physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= _physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. 
*/ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } _physmap_idx += 2; *physmap_idxp = _physmap_idx; if (_physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = _physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } #ifdef FDT static void add_fdt_mem_regions(struct mem_region *mr, int mrcnt, vm_paddr_t *physmap, u_int *physmap_idxp) { for (int i = 0; i < mrcnt; i++) { if (!add_physmap_entry(mr[i].mr_start, mr[i].mr_size, physmap, physmap_idxp)) break; } } #endif static void add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, u_int *physmap_idxp) { struct efi_md *map, *p; const char *type; size_t efisz; int ndesc, i; static const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode", "PersistentMemory" }; /* * Memory map data provided by UEFI via the GetMemoryMap * Boot Services API. 
*/ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return; ndesc = efihdr->memory_size / efihdr->descriptor_size; if (boothowto & RB_VERBOSE) printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, efihdr->descriptor_size)) { if (boothowto & RB_VERBOSE) { if (p->md_type < nitems(types)) type = types[p->md_type]; else type = ""; printf("%23s %012lx %12p %08lx ", type, p->md_phys, p->md_virt, p->md_pages); if (p->md_attr & EFI_MD_ATTR_UC) printf("UC "); if (p->md_attr & EFI_MD_ATTR_WC) printf("WC "); if (p->md_attr & EFI_MD_ATTR_WT) printf("WT "); if (p->md_attr & EFI_MD_ATTR_WB) printf("WB "); if (p->md_attr & EFI_MD_ATTR_UCE) printf("UCE "); if (p->md_attr & EFI_MD_ATTR_WP) printf("WP "); if (p->md_attr & EFI_MD_ATTR_RP) printf("RP "); if (p->md_attr & EFI_MD_ATTR_XP) printf("XP "); if (p->md_attr & EFI_MD_ATTR_NV) printf("NV "); if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE) printf("MORE_RELIABLE "); if (p->md_attr & EFI_MD_ATTR_RO) printf("RO "); if (p->md_attr & EFI_MD_ATTR_RT) printf("RUNTIME"); printf("\n"); } switch (p->md_type) { case EFI_MD_TYPE_CODE: case EFI_MD_TYPE_DATA: case EFI_MD_TYPE_BS_CODE: case EFI_MD_TYPE_BS_DATA: case EFI_MD_TYPE_FREE: /* * We're allowed to use any entry with these types. 
*/ break; default: continue; } if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE), physmap, physmap_idxp)) break; } } #ifdef FDT static void try_load_dtb(caddr_t kmdp) { vm_offset_t dtbp; dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t); if (dtbp == (vm_offset_t)NULL) { printf("ERROR loading DTB\n"); return; } if (OF_install(OFW_FDT, 0) == FALSE) panic("Cannot install FDT"); if (OF_init((void *)dtbp) != 0) panic("OF_init failed with the found device tree"); } #endif static bool bus_probe(void) { bool has_acpi, has_fdt; char *order, *env; has_acpi = has_fdt = false; #ifdef FDT has_fdt = (OF_peer(0) != 0); #endif #ifdef DEV_ACPI has_acpi = (acpi_find_table(ACPI_SIG_SPCR) != 0); #endif env = kern_getenv("kern.cfg.order"); if (env != NULL) { order = env; while (order != NULL) { if (has_acpi && strncmp(order, "acpi", 4) == 0 && (order[4] == ',' || order[4] == '\0')) { arm64_bus_method = ARM64_BUS_ACPI; break; } if (has_fdt && strncmp(order, "fdt", 3) == 0 && (order[3] == ',' || order[3] == '\0')) { arm64_bus_method = ARM64_BUS_FDT; break; } order = strchr(order, ','); } freeenv(env); /* If we set the bus method it is valid */ if (arm64_bus_method != ARM64_BUS_NONE) return (true); } /* If no order or an invalid order was set use the default */ if (arm64_bus_method == ARM64_BUS_NONE) { if (has_fdt) arm64_bus_method = ARM64_BUS_FDT; else if (has_acpi) arm64_bus_method = ARM64_BUS_ACPI; } /* * If no option was set the default is valid, otherwise we are * setting one to get cninit() working, then calling panic to tell * the user about the invalid bus setup. 
*/ return (env == NULL); } static void cache_setup(void) { int dcache_line_shift, icache_line_shift, dczva_line_shift; uint32_t ctr_el0; uint32_t dczid_el0; ctr_el0 = READ_SPECIALREG(ctr_el0); /* Read the log2 words in each D cache line */ dcache_line_shift = CTR_DLINE_SIZE(ctr_el0); /* Get the D cache line size */ dcache_line_size = sizeof(int) << dcache_line_shift; /* And the same for the I cache */ icache_line_shift = CTR_ILINE_SIZE(ctr_el0); icache_line_size = sizeof(int) << icache_line_shift; idcache_line_size = MIN(dcache_line_size, icache_line_size); dczid_el0 = READ_SPECIALREG(dczid_el0); /* Check if dc zva is not prohibited */ if (dczid_el0 & DCZID_DZP) dczva_line_size = 0; else { /* Same as with above calculations */ dczva_line_shift = DCZID_BS_SIZE(dczid_el0); dczva_line_size = sizeof(int) << dczva_line_shift; /* Change pagezero function */ pagezero = pagezero_cache; } } void initarm(struct arm64_bootparams *abp) { struct efi_map_header *efihdr; struct pcpu *pcpup; #ifdef FDT struct mem_region mem_regions[FDT_MEM_REGIONS]; int mem_regions_sz; #endif vm_offset_t lastaddr; caddr_t kmdp; vm_paddr_t mem_len; bool valid; int i; /* Set the module data location */ preload_metadata = (caddr_t)(uintptr_t)(abp->modulep); /* Find the kernel address */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); init_static_kenv(MD_FETCH(kmdp, MODINFOMD_ENVP, char *), 0); #ifdef FDT try_load_dtb(kmdp); #endif + + efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); /* Find the address to start allocating from */ lastaddr = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); /* Load the physical memory ranges */ physmap_idx = 0; efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr != NULL) add_efi_map_entries(efihdr, physmap, &physmap_idx); #ifdef FDT else { /* Grab physical memory regions information 
from device tree. */ if (fdt_get_mem_regions(mem_regions, &mem_regions_sz, NULL) != 0) panic("Cannot get physical memory regions"); add_fdt_mem_regions(mem_regions, mem_regions_sz, physmap, &physmap_idx); } #endif /* Print the memory map */ mem_len = 0; for (i = 0; i < physmap_idx; i += 2) { dump_avail[i] = physmap[i]; dump_avail[i + 1] = physmap[i + 1]; mem_len += physmap[i + 1] - physmap[i]; } dump_avail[i] = 0; dump_avail[i + 1] = 0; /* Set the pcpu data, this is needed by pmap_bootstrap */ pcpup = &__pcpu[0]; pcpu_init(pcpup, 0, sizeof(struct pcpu)); /* * Set the pcpu pointer with a backup in tpidr_el1 to be * loaded when entering the kernel from userland. */ __asm __volatile( "mov x18, %0 \n" "msr tpidr_el1, %0" :: "r"(pcpup)); PCPU_SET(curthread, &thread0); /* Do basic tuning, hz etc */ init_param1(); cache_setup(); pan_setup(); /* Bootstrap enough of pmap to enter the kernel proper */ pmap_bootstrap(abp->kern_l0pt, abp->kern_l1pt, KERNBASE - abp->kern_delta, lastaddr - KERNBASE); devmap_bootstrap(0, NULL); valid = bus_probe(); cninit(); if (!valid) panic("Invalid bus configuration: %s", kern_getenv("kern.cfg.order")); init_proc0(abp->kern_stack); msgbufinit(msgbufp, msgbufsize); mutex_init(); init_param2(physmem); dbg_init(); kdb_init(); pan_enable(); early_boot = 0; } void dbg_init(void) { /* Clear OS lock */ WRITE_SPECIALREG(OSLAR_EL1, 0); /* This permits DDB to use debug registers for watchpoints. */ dbg_monitor_init(); /* TODO: Eventually will need to initialize debug registers here. 
*/ } #ifdef DDB #include DB_SHOW_COMMAND(specialregs, db_show_spregs) { #define PRINT_REG(reg) \ db_printf(__STRING(reg) " = %#016lx\n", READ_SPECIALREG(reg)) PRINT_REG(actlr_el1); PRINT_REG(afsr0_el1); PRINT_REG(afsr1_el1); PRINT_REG(aidr_el1); PRINT_REG(amair_el1); PRINT_REG(ccsidr_el1); PRINT_REG(clidr_el1); PRINT_REG(contextidr_el1); PRINT_REG(cpacr_el1); PRINT_REG(csselr_el1); PRINT_REG(ctr_el0); PRINT_REG(currentel); PRINT_REG(daif); PRINT_REG(dczid_el0); PRINT_REG(elr_el1); PRINT_REG(esr_el1); PRINT_REG(far_el1); #if 0 /* ARM64TODO: Enable VFP before reading floating-point registers */ PRINT_REG(fpcr); PRINT_REG(fpsr); #endif PRINT_REG(id_aa64afr0_el1); PRINT_REG(id_aa64afr1_el1); PRINT_REG(id_aa64dfr0_el1); PRINT_REG(id_aa64dfr1_el1); PRINT_REG(id_aa64isar0_el1); PRINT_REG(id_aa64isar1_el1); PRINT_REG(id_aa64pfr0_el1); PRINT_REG(id_aa64pfr1_el1); PRINT_REG(id_afr0_el1); PRINT_REG(id_dfr0_el1); PRINT_REG(id_isar0_el1); PRINT_REG(id_isar1_el1); PRINT_REG(id_isar2_el1); PRINT_REG(id_isar3_el1); PRINT_REG(id_isar4_el1); PRINT_REG(id_isar5_el1); PRINT_REG(id_mmfr0_el1); PRINT_REG(id_mmfr1_el1); PRINT_REG(id_mmfr2_el1); PRINT_REG(id_mmfr3_el1); #if 0 /* Missing from llvm */ PRINT_REG(id_mmfr4_el1); #endif PRINT_REG(id_pfr0_el1); PRINT_REG(id_pfr1_el1); PRINT_REG(isr_el1); PRINT_REG(mair_el1); PRINT_REG(midr_el1); PRINT_REG(mpidr_el1); PRINT_REG(mvfr0_el1); PRINT_REG(mvfr1_el1); PRINT_REG(mvfr2_el1); PRINT_REG(revidr_el1); PRINT_REG(sctlr_el1); PRINT_REG(sp_el0); PRINT_REG(spsel); PRINT_REG(spsr_el1); PRINT_REG(tcr_el1); PRINT_REG(tpidr_el0); PRINT_REG(tpidr_el1); PRINT_REG(tpidrro_el0); PRINT_REG(ttbr0_el1); PRINT_REG(ttbr1_el1); PRINT_REG(vbar_el1); #undef PRINT_REG } DB_SHOW_COMMAND(vtop, db_show_vtop) { uint64_t phys; if (have_addr) { phys = arm64_address_translate_s1e1r(addr); db_printf("EL1 physical address reg (read): 0x%016lx\n", phys); phys = arm64_address_translate_s1e1w(addr); db_printf("EL1 physical address reg (write): 0x%016lx\n", phys); phys = 
arm64_address_translate_s1e0r(addr); db_printf("EL0 physical address reg (read): 0x%016lx\n", phys); phys = arm64_address_translate_s1e0w(addr); db_printf("EL0 physical address reg (write): 0x%016lx\n", phys); } else db_printf("show vtop \n"); } #endif Index: projects/runtime-coverage/sys/arm64/arm64/pmap.c =================================================================== --- projects/runtime-coverage/sys/arm64/arm64/pmap.c (revision 324497) +++ projects/runtime-coverage/sys/arm64/arm64/pmap.c (revision 324498) @@ -1,4839 +1,4834 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * This software was developed by Andrew Turner under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif /* * These are configured by the mair_el1 register. This is set up in locore.S */ #define DEVICE_MEMORY 0 #define UNCACHED_MEMORY 1 #define CACHED_MEMORY 2 #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page 
(after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; struct msgbuf *msgbufp = NULL; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. */ static struct md_page *pv_table; static struct md_page pv_dummy; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) extern pt_entry_t pagetable_dmap[]; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, "Are large page mappings enabled?"); /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); static pt_entry_t 
*pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. 
*/ #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set(table, mask) atomic_set_64(table, mask) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load(table) (*table) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } -#define pmap_l0_index(va) (((va) >> L0_SHIFT) & L0_ADDR_MASK) -#define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) -#define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) -#define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) - static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[pmap_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { pd_entry_t *l2; l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { pt_entry_t *l3; l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); return (&l3[pmap_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. 
*/ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == L1_BLOCK) { *level = 1; return (l1); } if (desc != L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == L2_BLOCK) { *level = 2; return (l2); } if (desc != L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) return (NULL); return (l3); } static inline bool pmap_superpages_enabled(void) { return (superpages_enabled != 0); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l0p, *l1p, *l2p; if (pmap->pm_l0 == NULL) return (false); l0p = pmap_l0(pmap, va); *l0 = l0p; if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) return (false); l1p = pmap_l0_to_l1(l0p, va); *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { *l2 = NULL; *l3 = NULL; return (true); } if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) return (false); l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { *l3 = 
NULL; return (true); } *l3 = pmap_l2_to_l3(l2p, va); return (true); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); } CTASSERT(L1_BLOCK == L2_BLOCK); /* * Checks if the page is dirty. We currently lack proper tracking of this on * arm64 so for now assume is a page mapped as rw was accessed it is. */ static inline int pmap_page_dirty(pt_entry_t pte) { return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) == (ATTR_AF | ATTR_AP(ATTR_AP_RW))); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); } static void pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) { vm_offset_t va; vm_paddr_t pa; u_int l1_slot; pa = dmap_phys_base = min_pa & ~L1_OFFSET; va = DMAP_MIN_ADDRESS; for (; va < DMAP_MAX_ADDRESS && pa < max_pa; pa += L1_SIZE, va += L1_SIZE, l1_slot++) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_load_store(&pagetable_dmap[l1_slot], (pa & ~L1_OFFSET) 
| ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L1_BLOCK); } /* Set the upper limit of the DMAP region */ dmap_phys_max = pa; dmap_max_addr = va; cpu_tlb_flushID(); } static vm_offset_t pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) { vm_offset_t l2pt; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(va); l2pt = l2_start; for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); pa = pmap_early_vtophys(l1pt, l2pt); pmap_load_store(&l1[l1_slot], (pa & ~Ln_TABLE_MASK) | L1_TABLE); l2pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l2_start, 0, l2pt - l2_start); return l2pt; } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l2pt, l3pt; vm_paddr_t pa; pd_entry_t *l2; u_int l2_slot; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); l2pt = (vm_offset_t)l2; l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pmap_load_store(&l2[l2_slot], (pa & ~Ln_TABLE_MASK) | L2_TABLE); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return l3pt; } /* * Bootstrap the system enough to run with virtual memory. 
*/ void pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot, avail_slot, map_slot, used_map_slot; uint64_t kern_delta; pt_entry_t *l2; vm_offset_t va, freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t pa, max_pa, min_pa; int i; kern_delta = KERNBASE - kernstart; physmem = 0; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; PMAP_LOCK_INIT(kernel_pmap); /* Assume the address we were loaded to is a valid physical address */ min_pa = max_pa = KERNBASE - kern_delta; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < (physmap_idx * 2); i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; if (physmap[i + 1] > max_pa) max_pa = physmap[i + 1]; } /* Create a direct map region early so we can use it for pa -> va */ pmap_bootstrap_dmap(l1pt, min_pa, max_pa); va = KERNBASE; pa = KERNBASE - kern_delta; /* * Start to initialise phys_avail by copying from physmap * up to the physical address KERNBASE points at. 
*/ map_slot = avail_slot = 0; for (; map_slot < (physmap_idx * 2) && avail_slot < (PHYS_AVAIL_SIZE - 2); map_slot += 2) { if (physmap[map_slot] == physmap[map_slot + 1]) continue; if (physmap[map_slot] <= pa && physmap[map_slot + 1] > pa) break; phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } /* Add the memory before the kernel */ if (physmap[avail_slot] < pa && avail_slot < (PHYS_AVAIL_SIZE - 2)) { phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = pa; physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } used_map_slot = map_slot; /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); /* Find how many pages we have mapped */ for (; l2_slot < Ln_ENTRIES; l2_slot++) { if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0) break; /* Check locore used L2 blocks */ KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK, ("Invalid bootstrap L2 table")); KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa, ("Incorrect PA in L2 table")); va += L2_SIZE; pa += L2_SIZE; } va = roundup2(va, L1_SIZE); freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */ freemempos = pmap_bootstrap_l2(l1pt, va, freemempos); /* And the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. 
*/ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; virtual_avail = roundup2(freemempos, L1_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); /* Finish initialising physmap */ map_slot = used_map_slot; for (; avail_slot < (PHYS_AVAIL_SIZE - 2) && map_slot < (physmap_idx * 2); map_slot += 2) { if (physmap[map_slot] == physmap[map_slot + 1]) continue; /* Have we used the current range? */ if (physmap[map_slot + 1] <= pa) continue; /* Do we need to split the entry? */ if (physmap[map_slot] < pa) { phys_avail[avail_slot] = pa; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; } else { phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; } physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } phys_avail[avail_slot] = 0; phys_avail[avail_slot + 1] = 0; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". */ Maxmem = atop(phys_avail[avail_slot - 1]); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. 
*/ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Invalidate a single TLB entry. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vaae1is, %0 \n" "dsb ish \n" "isb \n" : : "r"(va >> PAGE_SHIFT)); sched_unpin(); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; sched_pin(); dsb(ishst); for (addr = sva; addr < eva; addr += PAGE_SIZE) { __asm __volatile( "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT)); } __asm __volatile( "dsb ish \n" "isb \n"); sched_unpin(); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n"); sched_unpin(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. 
*/
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *ptep, entry;
	vm_paddr_t phys;
	int level;

	phys = 0;
	PMAP_LOCK(pmap);

	/*
	 * pmap_pte hands back the block or page entry mapping va, or NULL
	 * when nothing valid maps it; in the NULL case 0 is returned.
	 */
	ptep = pmap_pte(pmap, va, &level);
	if (ptep == NULL)
		goto done;

	entry = pmap_load(ptep);
	phys = entry & ~ATTR_MASK;

	/* Add the offset of va within the mapping found at this level. */
	if (level == 1) {
		KASSERT((entry & ATTR_DESCR_MASK) == L1_BLOCK,
		    ("pmap_extract: Invalid L1 pte found: %lx",
		    entry & ATTR_DESCR_MASK));
		phys |= (va & L1_OFFSET);
	} else if (level == 2) {
		KASSERT((entry & ATTR_DESCR_MASK) == L2_BLOCK,
		    ("pmap_extract: Invalid L2 pte found: %lx",
		    entry & ATTR_DESCR_MASK));
		phys |= (va & L2_OFFSET);
	} else if (level == 3) {
		KASSERT((entry & ATTR_DESCR_MASK) == L3_PAGE,
		    ("pmap_extract: Invalid L3 pte found: %lx",
		    entry & ATTR_DESCR_MASK));
		phys |= (va & L3_OFFSET);
	}

done:
	PMAP_UNLOCK(pmap);
	return (phys);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
*/
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *ptep, entry;
	vm_offset_t offset;
	vm_paddr_t locked_pa, target_pa;
	vm_page_t page;
	int level;

	locked_pa = 0;
	page = NULL;
	PMAP_LOCK(pmap);
retry:
	ptep = pmap_pte(pmap, va, &level);
	if (ptep == NULL)
		goto out;

	entry = pmap_load(ptep);

	KASSERT(level > 0 && level <= 3,
	    ("pmap_extract_and_hold: Invalid level %d", level));
	CTASSERT(L1_BLOCK == L2_BLOCK);
	KASSERT((level == 3 && (entry & ATTR_DESCR_MASK) == L3_PAGE) ||
	    (level < 3 && (entry & ATTR_DESCR_MASK) == L1_BLOCK),
	    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", level,
	     entry & ATTR_DESCR_MASK));

	/* A write request is only honoured by a read/write mapping. */
	if ((entry & ATTR_AP_RW_BIT) != ATTR_AP(ATTR_AP_RW) &&
	    (prot & VM_PROT_WRITE) != 0)
		goto out;

	/* Offset of va inside an L1 or L2 block; zero for an L3 page. */
	if (level == 1)
		offset = va & L1_OFFSET;
	else if (level == 2)
		offset = va & L2_OFFSET;
	else
		offset = 0;

	target_pa = (entry & ~ATTR_MASK) | offset;
	if (vm_page_pa_tryrelock(pmap, target_pa, &locked_pa))
		goto retry;
	page = PHYS_TO_VM_PAGE(target_pa);
	vm_page_hold(page);

out:
	PA_UNLOCK_COND(locked_pa);
	PMAP_UNLOCK(pmap);
	return (page);
}

/*
 * Translate a kernel virtual address to a physical address.  Addresses
 * inside the direct map are converted arithmetically; anything else is
 * looked up in the kernel pmap, yielding 0 when no valid entry is found.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pt_entry_t *ptep, entry;
	vm_paddr_t phys;
	int level;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return (DMAP_TO_PHYS(va));

	ptep = pmap_pte(kernel_pmap, va, &level);
	if (ptep == NULL)
		return (0);

	entry = pmap_load(ptep);
	phys = entry & ~ATTR_MASK;
	if (level == 1) {
		KASSERT((entry & ATTR_DESCR_MASK) == L1_BLOCK,
		    ("pmap_kextract: Invalid L1 pte found: %lx",
		    entry & ATTR_DESCR_MASK));
		phys |= (va & L1_OFFSET);
	} else if (level == 2) {
		KASSERT((entry & ATTR_DESCR_MASK) == L2_BLOCK,
		    ("pmap_kextract: Invalid L2 pte found: %lx",
		    entry & ATTR_DESCR_MASK));
		phys |= (va & L2_OFFSET);
	} else if (level == 3) {
		KASSERT((entry & ATTR_DESCR_MASK) == L3_PAGE,
		    ("pmap_kextract: Invalid L3 pte found: %lx",
		    entry & ATTR_DESCR_MASK));
		phys |= (va & L3_OFFSET);
	}
	return (phys);
}

/***************************************************
 * Low level mapping routines.....
***************************************************/

/*
 * Map the physical range starting at pa, size bytes long, at kernel
 * virtual address sva using L3 page entries with memory attribute index
 * "mode".  Device memory is additionally marked execute-never.  The
 * addresses and size must be page aligned, and an L3 table must already
 * exist for every page in the range.
 */
static void
pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
{
	pd_entry_t *l2e;
	pt_entry_t *l3e, bits;
	vm_offset_t cur;
	int level;

	KASSERT((pa & L3_OFFSET) == 0,
	   ("pmap_kenter: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	   ("pmap_kenter: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter: Mapping is not page-sized"));

	bits = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE;
	if (mode == DEVICE_MEMORY)
		bits |= ATTR_XN;

	for (cur = sva; size != 0;
	    cur += PAGE_SIZE, pa += PAGE_SIZE, size -= PAGE_SIZE) {
		l2e = pmap_pde(kernel_pmap, cur, &level);
		KASSERT(l2e != NULL,
		    ("pmap_kenter: Invalid page entry, va: 0x%lx", cur));
		KASSERT(level == 2, ("pmap_kenter: Invalid level %d", level));

		l3e = pmap_l2_to_l3(l2e, cur);
		pmap_load_store(l3e, (pa & ~L3_OFFSET) | bits);
	}
	pmap_invalidate_range(kernel_pmap, sva, cur);
}

/* Map a physical range into the kernel as device memory. */
void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{

	pmap_kenter(sva, size, pa, DEVICE_MEMORY);
}

/*
 * Remove a page from the kernel pagetables.
*/ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; int lvl; pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); pmap_load_clear(pte); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); pmap_load_clear(pte); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. 
*/ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pd_entry_t *pde; pt_entry_t *pte, pa; vm_offset_t va; vm_page_t m; int i, lvl; va = sva; for (i = 0; i < count; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_qenter: Invalid level %d", lvl)); m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | ATTR_IDX(m->md.pv_memattr) | L3_PAGE; if (m->md.pv_memattr == DEVICE_MEMORY) pa |= ATTR_XN; pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, pa); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); va = sva; while (count-- > 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); if (pte != NULL) { pmap_load_clear(pte); } va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. 
*/
static __inline void
pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
    boolean_t set_PG_ZERO)
{

	/* Set or clear PG_ZERO on the page as requested by the caller. */
	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
}

/*
 * Decrements a page table page's wire count, which is used to record the
 * number of valid page table entries within the page.  If the wire count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free)
{

	--m->wire_count;
	if (m->wire_count == 0) {
		_pmap_unwire_l3(pmap, va, m, free);
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * Unmap the page table page m whose wire count has reached zero.
 *
 * The level of m is derived from m->pindex: indices below NUL2E are L3
 * table pages, indices below NUL2E + NUL1E are L2 table pages, and
 * anything above is an L1 table page.  The entry that references m is
 * cleared, the pmap's resident count is decremented, one wire is dropped
 * from the parent table page (which may unmap it in turn through
 * pmap_unwire_l3), va is invalidated in the TLB, and m is queued on
 * "free" with PG_ZERO set.  The pmap lock must be held.
 */
static void
_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/*
	 * unmap the page table page
	 */
	if (m->pindex >= (NUL2E + NUL1E)) {
		/* l1 page */
		pd_entry_t *l0;

		l0 = pmap_l0(pmap, va);
		pmap_load_clear(l0);
	} else if (m->pindex >= NUL2E) {
		/* l2 page */
		pd_entry_t *l1;

		l1 = pmap_l1(pmap, va);
		pmap_load_clear(l1);
	} else {
		/* l3 page */
		pd_entry_t *l2;

		l2 = pmap_l2(pmap, va);
		pmap_load_clear(l2);
	}
	pmap_resident_count_dec(pmap, 1);
	if (m->pindex < NUL2E) {
		/* We just released an l3, unhold the matching l2 */
		pd_entry_t *l1, tl1;
		vm_page_t l2pg;

		l1 = pmap_l1(pmap, va);
		tl1 = pmap_load(l1);
		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
		pmap_unwire_l3(pmap, va, l2pg, free);
	} else if (m->pindex < (NUL2E + NUL1E)) {
		/* We just released an l2, unhold the matching l1 */
		pd_entry_t *l0, tl0;
		vm_page_t l1pg;

		l0 = pmap_l0(pmap, va);
		tl0 = pmap_load(l0);
		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
		pmap_unwire_l3(pmap, va, l1pg, free);
	}
	pmap_invalidate_page(pmap, va);

	/*
	 * This is a release store so that the ordinary store unmapping
	 * the page table page is globally performed before TLB shoot-
	 * down is begun.
	 */
	atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);

	/*
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 *
 * Nothing is done (and 0 is returned) for addresses at or above
 * VM_MAXUSER_ADDRESS.  Otherwise the result of pmap_unwire_l3() on the
 * page table page referenced by ptepde is returned.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
    struct spglist *free)
{
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
	return (pmap_unwire_l3(pmap, va, mpte, free));
}

/*
 * Initialise a pmap that shares the kernel pmap's L0 table: set up its
 * lock, zero its statistics and clear its pm_root.
 */
void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
	pmap->pm_l0 = kernel_pmap->pm_l0;
	pmap->pm_root.rt_root = 0;
}

/*
 * Initialise a pmap with a freshly allocated, zeroed L0 table page,
 * waiting for memory until the allocation succeeds.  Always returns 1.
 */
int
pmap_pinit(pmap_t pmap)
{
	vm_paddr_t l0phys;
	vm_page_t l0pt;

	/*
	 * allocate the l0 page
	 */
	while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
		VM_WAIT;

	l0phys = VM_PAGE_TO_PHYS(l0pt);
	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys);

	/* The allocator may hand back a page that is not pre-zeroed. */
	if ((l0pt->flags & PG_ZERO) == 0)
		pagezero(pmap->pm_l0);

	pmap->pm_root.rt_root = 0;
	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));

	return (1);
}

/*
 * This routine is called if the desired page table page does not exist.
 *
 * If page table page allocation fails, this routine may sleep before
 * returning NULL.  It sleeps only if a lock pointer was given.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 *
 * ptepindex selects the level of the new page: values at or above
 * NUL2E + NUL1E install an L1 table in an L0 slot, values at or above
 * NUL2E install an L2 table in an L1 slot, and lower values install an
 * L3 table in an L2 slot.  Missing parent tables are allocated by
 * recursion; existing parents gain one wire.  On success the pmap's
 * resident count is incremented and the new page is returned.
 */
static vm_page_t
_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
{
	vm_page_t m, l1pg, l2pg;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (lockp != NULL) {
			RELEASE_PV_LIST_LOCK(lockp);
			PMAP_UNLOCK(pmap);
			VM_WAIT;
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	if (ptepindex >= (NUL2E + NUL1E)) {
		/* New L1 table: hook it straight into the L0 slot. */
		pd_entry_t *l0;
		vm_pindex_t l0index;

		l0index = ptepindex - (NUL2E + NUL1E);
		l0 = &pmap->pm_l0[l0index];
		pmap_load_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE);
	} else if (ptepindex >= NUL2E) {
		/* New L2 table: needs an L1 table to hold its entry. */
		vm_pindex_t l0index, l1index;
		pd_entry_t *l0, *l1;
		pd_entry_t tl0;

		l1index = ptepindex - NUL2E;
		l0index = l1index >> L0_ENTRIES_SHIFT;

		l0 = &pmap->pm_l0[l0index];
		tl0 = pmap_load(l0);
		if (tl0 == 0) {
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
			    lockp) == NULL) {
				--m->wire_count;
				/* XXX: release mem barrier? */
				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
				vm_page_free_zero(m);
				return (NULL);
			}
		} else {
			/* The L1 table exists; it gains one more entry. */
			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
			l1pg->wire_count++;
		}

		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
		l1 = &l1[ptepindex & Ln_ADDR_MASK];
		pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
	} else {
		/* New L3 table: needs an L2 table to hold its entry. */
		vm_pindex_t l0index, l1index;
		pd_entry_t *l0, *l1, *l2;
		pd_entry_t tl0, tl1;

		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
		l0index = l1index >> L0_ENTRIES_SHIFT;

		l0 = &pmap->pm_l0[l0index];
		tl0 = pmap_load(l0);
		if (tl0 == 0) {
			/* recurse for allocating page dir */
			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
			    lockp) == NULL) {
				--m->wire_count;
				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
				vm_page_free_zero(m);
				return (NULL);
			}
			/* Reload: the recursion filled in the L0 entry. */
			tl0 = pmap_load(l0);
			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
			l1 = &l1[l1index & Ln_ADDR_MASK];
		} else {
			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
			l1 = &l1[l1index & Ln_ADDR_MASK];
			tl1 = pmap_load(l1);
			if (tl1 == 0) {
				/* recurse for allocating page dir */
				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
				    lockp) == NULL) {
					--m->wire_count;
					/* XXX: release mem barrier? */
					atomic_subtract_int(
					    &vm_cnt.v_wire_count, 1);
					vm_page_free_zero(m);
					return (NULL);
				}
			} else {
				/* The L2 table exists; it gains one more entry. */
				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
				l2pg->wire_count++;
			}
		}

		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
		l2 = &l2[ptepindex & Ln_ADDR_MASK];
		pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
	}

	pmap_resident_count_inc(pmap, 1);

	return (m);
}

/*
 * Return the L3 page table page that covers va, allocating it (and any
 * missing parent tables) when it does not exist.  An existing page gains
 * one wire.  When lockp is non-NULL a failed allocation is retried;
 * with a NULL lockp the function may return NULL.
 */
static vm_page_t
pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
{
	vm_pindex_t ptepindex;
	pd_entry_t *pde, tpde;
#ifdef INVARIANTS
	pt_entry_t *pte;
#endif
	vm_page_t m;
	int lvl;

	/*
	 * Calculate pagetable page index
	 */
	ptepindex = pmap_l2_pindex(va);
retry:
	/*
	 * Get the page directory entry
	 */
	pde = pmap_pde(pmap, va, &lvl);

	/*
	 * If the page table page is mapped, we just increment the hold count,
	 * and activate it. If we get a level 2 pde it will point to a level 3
	 * table.
	 */
	switch (lvl) {
	case -1:
		break;
	case 0:
#ifdef INVARIANTS
		pte = pmap_l0_to_l1(pde, va);
		KASSERT(pmap_load(pte) == 0,
		    ("pmap_alloc_l3: TODO: l0 superpages"));
#endif
		break;
	case 1:
#ifdef INVARIANTS
		pte = pmap_l1_to_l2(pde, va);
		KASSERT(pmap_load(pte) == 0,
		    ("pmap_alloc_l3: TODO: l1 superpages"));
#endif
		break;
	case 2:
		tpde = pmap_load(pde);
		if (tpde != 0) {
			m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
			m->wire_count++;
			return (m);
		}
		break;
	default:
		panic("pmap_alloc_l3: Invalid level %d", lvl);
	}

	/*
	 * Here if the pte page isn't mapped, or if it has been deallocated.
	 */
	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
	if (m == NULL && lockp != NULL)
		goto retry;

	return (m);
}

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
*
 * Both preconditions are asserted below: the resident count must be zero
 * and pm_root must be empty.  The only page freed here is the one backing
 * pmap->pm_l0.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));
	KASSERT(vm_radix_is_empty(&pmap->pm_root),
	    ("pmap_release: pmap has reserved page table page(s)"));

	/* pm_l0 is a direct-map address; convert it back to its vm_page. */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0));

	m->wire_count--;
	atomic_subtract_int(&vm_cnt.v_wire_count, 1);
	vm_page_free_zero(m);
}

/*
 * Sysctl handler for vm.kvm_size: reports
 * VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS.
 */
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_size, "LU", "Size of KVM");

/*
 * Sysctl handler for vm.kvm_free: reports
 * VM_MAX_KERNEL_ADDRESS - kernel_vm_end.
 */
static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_free, "LU", "Amount of KVM free");

/*
 * grow the number of kernel page table entries, if needed
 *
 * "addr" is rounded up to an L2_SIZE boundary and clamped to
 * kernel_map->max_offset.  kernel_vm_end is then advanced one L2_SIZE step
 * at a time until it reaches "addr", installing a freshly zeroed page
 * wherever the L1 or L2 entry is missing.  Allocation failure panics.
 * The caller must hold kernel_map->system_mtx (asserted).
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *l0, *l1, *l2;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	addr = roundup2(addr, L2_SIZE);
	/* "addr - 1" also catches addr having wrapped to 0. */
	if (addr - 1 >= kernel_map->max_offset)
		addr = kernel_map->max_offset;
	while (kernel_vm_end < addr) {
		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
		KASSERT(pmap_load(l0) != 0,
		    ("pmap_growkernel: No level 0 kernel entry"));

		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
		if (pmap_load(l1) == 0) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			if ((nkpg->flags & PG_ZERO) == 0)
				pmap_zero_page(nkpg);
			paddr = VM_PAGE_TO_PHYS(nkpg);
			pmap_load_store(l1, paddr | L1_TABLE);
			continue; /* try again */
		}
		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
		if ((pmap_load(l2) & ATTR_AF) != 0) {
			/* This L2 slot is already populated: just step past it. */
			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
				kernel_vm_end = kernel_map->max_offset;
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		if ((nkpg->flags & PG_ZERO) == 0)
			pmap_zero_page(nkpg);
		paddr = VM_PAGE_TO_PHYS(nkpg);
		pmap_load_store(l2, paddr | L2_TABLE);
		pmap_invalidate_page(kernel_pmap, kernel_vm_end);

		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
			kernel_vm_end = kernel_map->max_offset;
			break;
		}
	}
}


/***************************************************
 * page management routines.
 ***************************************************/

/*
 * A pv_chunk occupies exactly one page, which is what lets pv_to_chunk()
 * recover the chunk from any pv entry address by masking.
 */
CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 3);
CTASSERT(_NPCPV == 168);

/*
 * Return the pv_chunk containing "pv" by clearing the PAGE_MASK bits of
 * its address.
 */
static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
}

/* The pmap that owns the chunk a pv entry lives in. */
#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

/*
 * Bitmap values for a completely free chunk; a set bit means the slot is
 * free.  64 + 64 + 40 set bits = 168 = _NPCPV.
 */
#define PC_FREE0 0xfffffffffffffffful
#define PC_FREE1 0xfffffffffffffffful
#define PC_FREE2 0x000000fffffffffful

static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };

/* The PV_STATS sysctl counters below are compiled out. */
#if 0
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");
#endif
#endif /* 0 */

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.
 *
 * Returns NULL if PV entries were reclaimed from the specified pmap.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 *
 * Otherwise the return value is either the page of a chunk that became
 * entirely free, or a page table page taken from the local free list and
 * re-wired (wire_count = 1).  "locked_pmap" must be locked by the caller
 * and "lockp" must be non-NULL (both asserted).
 */
static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
{
	struct pch new_tail;
	struct pv_chunk *pc;
	struct md_page *pvh;
	pd_entry_t *pde;
	pmap_t pmap;
	pt_entry_t *pte, tpte;
	pv_entry_t pv;
	vm_offset_t va;
	vm_page_t m, m_pc;
	struct spglist free;
	uint64_t inuse;
	int bit, field, freed, lvl;

	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
	pmap = NULL;
	m_pc = NULL;
	SLIST_INIT(&free);
	/* Chunks that were examined are parked here and re-appended at the end. */
	TAILQ_INIT(&new_tail);
	mtx_lock(&pv_chunks_mutex);
	/* Stop as soon as a page table page lands on the local free list. */
	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) {
		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
		mtx_unlock(&pv_chunks_mutex);
		if (pmap != pc->pc_pmap) {
			if (pmap != NULL && pmap != locked_pmap)
				PMAP_UNLOCK(pmap);
			pmap = pc->pc_pmap;
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap) {
				RELEASE_PV_LIST_LOCK(lockp);
				PMAP_LOCK(pmap);
			} else if (pmap != locked_pmap &&
			    !PMAP_TRYLOCK(pmap)) {
				/* Could not lock this pmap; skip its chunk. */
				pmap = NULL;
				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
				mtx_lock(&pv_chunks_mutex);
				continue;
			}
		}

		/*
		 * Destroy every non-wired, 4 KB page mapping in the chunk.
		 */
		freed = 0;
		for (field = 0; field < _NPCM; field++) {
			/* "inuse" has a bit set for every allocated slot. */
			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
			    inuse != 0; inuse &= ~(1UL << bit)) {
				bit = ffsl(inuse) - 1;
				pv = &pc->pc_pventry[field * 64 + bit];
				va = pv->pv_va;
				pde = pmap_pde(pmap, va, &lvl);
				if (lvl != 2)
					continue;
				pte = pmap_l2_to_l3(pde, va);
				tpte = pmap_load(pte);
				if ((tpte & ATTR_SW_WIRED) != 0)
					continue;
				tpte = pmap_load_clear(pte);
				pmap_invalidate_page(pmap, va);
				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
				if (pmap_page_dirty(tpte))
					vm_page_dirty(m);
				if ((tpte & ATTR_AF) != 0)
					vm_page_aflag_set(m, PGA_REFERENCED);
				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (TAILQ_EMPTY(&m->md.pv_list) &&
				    (m->flags & PG_FICTITIOUS) == 0) {
					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						vm_page_aflag_clear(m,
						    PGA_WRITEABLE);
					}
				}
				/* Mark the slot free again. */
				pc->pc_map[field] |= 1UL << bit;
				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
				freed++;
			}
		}
		if (freed == 0) {
			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
			mtx_lock(&pv_chunks_mutex);
			continue;
		}
		/* Every freed mapping is for a 4 KB page. */
		pmap_resident_count_dec(pmap, freed);
		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
		    pc->pc_map[2] == PC_FREE2) {
			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
			/* Entire chunk is free; return it. */
			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
			dump_drop_page(m_pc->phys_addr);
			mtx_lock(&pv_chunks_mutex);
			break;
		}
		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
		mtx_lock(&pv_chunks_mutex);
		/* One freed pv entry in locked_pmap is sufficient. */
		if (pmap == locked_pmap)
			break;
	}
	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	if (pmap != NULL && pmap != locked_pmap)
		PMAP_UNLOCK(pmap);
	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->wire_count = 1;
		atomic_add_int(&vm_cnt.v_wire_count, 1);
	}
	pmap_free_zero_pages(&free);
	return (m_pc);
}

/*
 * free the pv_entry back to the free list
 *
 * The slot's bit is set in its chunk's bitmap.  A chunk that still has
 * allocated slots is moved to the head of the pmap's chunk list; a chunk
 * that became entirely free is unlinked and passed to free_pv_chunk().
 * "pmap" must be locked by the caller (asserted).
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
	pc = pv_to_chunk(pv);
	/* Slot number within the chunk, split into bitmap word and bit. */
	idx = pv - &pc->pc_pventry[0];
	field = idx / 64;
	bit = idx % 64;
	pc->pc_map[field] |= 1ul << bit;
	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
	    pc->pc_map[2] != PC_FREE2) {
		/* 98% of the time, pc is already at the head of the list. */
		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
		}
		return;
	}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

/*
 * Unlink "pc" from the global pv_chunks list (under pv_chunks_mutex) and
 * release the page that backs it.  The chunk is not removed from any
 * pmap's pm_pvchunk list here.
 */
static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	mtx_lock(&pv_chunks_mutex);
	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	mtx_unlock(&pv_chunks_mutex);
	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
	/* entire chunk is free, return it */
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
	dump_drop_page(m->phys_addr);
	vm_page_unwire(m, PQ_NONE);
	vm_page_free(m);
}

/*
 * Returns a new PV entry, allocating a new PV chunk from the system when
 * needed.  If this PV chunk allocation fails and a PV list lock pointer was
 * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
 * returned.
* * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. 
*/ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; int avail, free; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. 
*
 * A successful removal also increments pvh->pv_gen.  The entry itself is
 * not freed here.
 */
static __inline pv_entry_t
pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
			break;
		}
	}
	/* pv is NULL here when the loop ran off the end of the list. */
	return (pv);
}

/*
 * After demotion from a 2MB page mapping to 512 4KB page mappings,
 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
 * entries for each of the 4KB page mappings.
 *
 * No allocation happens here: the remaining entries are carved out of the
 * free slots of the chunks already on the pmap's chunk list, and a chunk
 * without any free slot at the head of that list trips the "missing
 * spare" assertion.  "pa" must be L2-aligned and "pmap" locked (both
 * asserted).
 */
static void
pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	struct pv_chunk *pc;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;
	int bit, field;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((pa & L2_OFFSET) == 0,
	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the 2mpage's pv entry for this mapping to the first
	 * page's pv list.  Once this transfer begins, the pv list lock
	 * must not be released until the last pv entry is reinstantiated.
	 */
	pvh = pa_to_pvh(pa);
	va = va & ~L2_OFFSET;
	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
	m->md.pv_gen++;
	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
	va_last = va + L2_SIZE - PAGE_SIZE;
	for (;;) {
		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
		for (field = 0; field < _NPCM; field++) {
			while (pc->pc_map[field]) {
				bit = ffsl(pc->pc_map[field]) - 1;
				pc->pc_map[field] &= ~(1ul << bit);
				pv = &pc->pc_pventry[field * 64 + bit];
				/* Advance va and m together, one 4KB page at a time. */
				va += PAGE_SIZE;
				pv->pv_va = va;
				m++;
				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
			    ("pmap_pv_demote_l2: page %p is not managed", m));
				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
				m->md.pv_gen++;
				if (va == va_last)
					goto out;
			}
		}
		/* This chunk is now full; rotate it to the tail. */
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
out:
	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
	}
	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
}

/*
 * First find and then destroy the pv entry for the specified pmap and virtual
 * address.  This operation can be performed on pv lists for either 4KB or 2MB
 * page mappings.
 *
 * The entry is expected to exist (asserted).
 */
static void
pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
{
	pv_entry_t pv;

	pv = pmap_pvh_remove(pvh, pmap, va);
	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
	free_pv_entry(pmap, pv);
}

/*
 * Conditionally create the PV entry for a 4KB page mapping if the required
 * memory can be allocated without resorting to reclamation.
 *
 * Returns TRUE when the entry was created and linked onto m's pv list,
 * FALSE when get_pv_entry() returned NULL.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp)
{
	pv_entry_t pv;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* Pass NULL instead of the lock pointer to disable reclamation. */
	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
		pv->pv_va = va;
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		return (TRUE);
	} else
		return (FALSE);
}

/*
 * pmap_remove_l2: do the things to unmap a level 2 superpage in a process
 *
 * "sva" must be L2-aligned and "pmap" must not be the kernel pmap (both
 * asserted).  The L2 entry is cleared and the range invalidated before
 * any bookkeeping.  For a managed mapping the superpage's pv entry is
 * freed and the dirty/referenced state of the old entry is propagated to
 * every 4KB page it covered.  A page table page saved in pm_root for this
 * address is queued on "free".  Returns the result of pmap_unuse_pt().
 */
static int
pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
{
	struct md_page *pvh;
	pt_entry_t old_l2;
	vm_offset_t eva, va;
	vm_page_t m, ml3;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
	old_l2 = pmap_load_clear(l2);
	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
	if (old_l2 & ATTR_SW_WIRED)
		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
	if (old_l2 & ATTR_SW_MANAGED) {
		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
		pvh = pa_to_pvh(old_l2 & ~ATTR_MASK);
		pmap_pvh_free(pvh, pmap, sva);
		eva = sva + L2_SIZE;
		for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
		    va < eva; va += PAGE_SIZE, m++) {
			if (pmap_page_dirty(old_l2))
				vm_page_dirty(m);
			if (old_l2 & ATTR_AF)
				vm_page_aflag_set(m, PGA_REFERENCED);
			if (TAILQ_EMPTY(&m->md.pv_list) &&
			    TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(m, PGA_WRITEABLE);
		}
	}
	KASSERT(pmap != kernel_pmap,
	    ("Attempting to remove an l2 kernel page"));
	ml3 = pmap_remove_pt_page(pmap, sva);
	if (ml3 != NULL) {
		pmap_resident_count_dec(pmap, 1);
		/*
		 * NOTE(review): the assertion text below names
		 * pmap_remove_pages although it lives in pmap_remove_l2.
		 */
		KASSERT(ml3->wire_count == NL3PG,
		    ("pmap_remove_pages: l3 page wire count error"));
		ml3->wire_count = 0;
		pmap_add_delayed_free_list(ml3, free, FALSE);
		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
	}
	return (pmap_unuse_pt(pmap, sva, l1e, free));
}

/*
 * pmap_remove_l3: do the things to unmap a page in a process
 *
 * The L3 entry is cleared and the page invalidated first.  For a managed
 * mapping the dirty/referenced state is pushed to the vm_page, the pv
 * entry is freed, and PGA_WRITEABLE is cleared once neither the page's
 * own pv list nor its superpage pv list has entries left.  Returns the
 * result of pmap_unuse_pt().
 */
static int
pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
{
	struct md_page *pvh;
	pt_entry_t old_l3;
	vm_page_t m;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	old_l3 = pmap_load_clear(l3);
	pmap_invalidate_page(pmap, va);
	if (old_l3 & ATTR_SW_WIRED)
		pmap->pm_stats.wired_count -= 1;
	pmap_resident_count_dec(pmap, 1);
	if (old_l3 & ATTR_SW_MANAGED) {
		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
		if (pmap_page_dirty(old_l3))
			vm_page_dirty(m);
		if (old_l3 & ATTR_AF)
			vm_page_aflag_set(m, PGA_REFERENCED);
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
		pmap_pvh_free(&m->md, pmap, va);
		if (TAILQ_EMPTY(&m->md.pv_list) &&
		    (m->flags & PG_FICTITIOUS) == 0) {
			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
			if (TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(m, PGA_WRITEABLE);
		}
	}
	return (pmap_unuse_pt(pmap, va, l2e, free));
}

/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 *
 *	The range is walked one L2 region at a time.  An L2 block that is
 *	fully inside the range is removed whole with pmap_remove_l2(); a
 *	partially covered block is demoted first and then handled page by
 *	page.  Page table pages released along the way are collected on a
 *	local list and freed after the pmap lock is dropped.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct rwlock *lock;
	vm_offset_t va, va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t l3_paddr, *l3;
	struct spglist free;

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	SLIST_INIT(&free);

	PMAP_LOCK(pmap);

	lock = NULL;
	for (; sva < eva; sva = va_next) {

		if (pmap->pm_stats.resident_count == 0)
			break;

		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			/* Nothing mapped under this L0 entry; skip it whole. */
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			/* Address wrapped around: finish the loop. */
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0) {
			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		/*
		 * Calculate index for next page table.
		 */
		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		if (l2 == NULL)
			continue;

		l3_paddr = pmap_load(l2);

		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
			if (sva + L2_SIZE == va_next && eva >= va_next) {
				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
				    &free, &lock);
				continue;
			} else if (pmap_demote_l2_locked(pmap, l2,
			    sva &~L2_OFFSET, &lock) == NULL)
				continue;
			/* Demotion succeeded; reload the (now table) entry. */
			l3_paddr = pmap_load(l2);
		}

		/*
		 * Weed out invalid mappings.
		 */
		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (va_next > eva)
			va_next = eva;

		/*
		 * "va" marks the start of a run of removed pages;
		 * va == va_next means no run is currently open.
		 */
		va = va_next;
		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
		    sva += L3_SIZE) {
			if (l3 == NULL)
				panic("l3 == NULL");
			if (pmap_load(l3) == 0) {
				if (va != va_next) {
					pmap_invalidate_range(pmap, va, sva);
					va = va_next;
				}
				continue;
			}
			if (va == va_next)
				va = sva;
			if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free,
			    &lock)) {
				/* Non-zero return: stop scanning this table. */
				sva += L3_SIZE;
				break;
			}
		}
		if (va != va_next)
			pmap_invalidate_range(pmap, va, sva);
	}
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	pmap_free_zero_pages(&free);
}

/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 *
 *		The page must be managed (asserted).  Superpage mappings
 *		found on the pvh list are demoted first; the resulting 4KB
 *		mappings are then removed one by one from m's own pv list.
 */
void
pmap_remove_all(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv;
	pmap_t pmap;
	struct rwlock *lock;
	pd_entry_t *pde, tpde;
	pt_entry_t *pte, tpte;
	vm_offset_t va;
	struct spglist free;
	int lvl, pvh_gen, md_gen;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_remove_all: page %p is not managed", m));
	SLIST_INIT(&free);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
retry:
	rw_wlock(lock);
	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			/*
			 * Drop the pv list lock to take the pmap lock, then
			 * use the generation count to detect a list change
			 * that happened in between.
			 */
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				rw_wunlock(lock);
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pte = pmap_pte(pmap, va, &lvl);
		KASSERT(pte != NULL,
		    ("pmap_remove_all: no page table entry found"));
		KASSERT(lvl == 2,
		    ("pmap_remove_all: invalid pte level %d", lvl));

		pmap_demote_l2_locked(pmap, pte, va, &lock);
		PMAP_UNLOCK(pmap);
	}
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				rw_wunlock(lock);
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pmap_resident_count_dec(pmap, 1);

		pde = pmap_pde(pmap, pv->pv_va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_remove_all: no page directory entry found"));
		KASSERT(lvl == 2,
		    ("pmap_remove_all: invalid pde level %d", lvl));
		tpde = pmap_load(pde);

		pte = pmap_l2_to_l3(pde, pv->pv_va);
		tpte = pmap_load(pte);
		pmap_load_clear(pte);
		pmap_invalidate_page(pmap, pv->pv_va);
		if (tpte & ATTR_SW_WIRED)
			pmap->pm_stats.wired_count--;
		if ((tpte & ATTR_AF) != 0)
			vm_page_aflag_set(m, PGA_REFERENCED);

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (pmap_page_dirty(tpte))
			vm_page_dirty(m);
		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	vm_page_aflag_clear(m, PGA_WRITEABLE);
	rw_wunlock(lock);
	pmap_free_zero_pages(&free);
}

/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
*/ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3p, l3, nbits; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { l3p = pmap_demote_l2(pmap, l2, sva); if (l3p == NULL) continue; } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_protect: Invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); if (!pmap_l3_valid(l3)) continue; nbits = 0; if ((prot & VM_PROT_WRITE) == 0) { if ((l3 & ATTR_SW_MANAGED) && pmap_page_dirty(l3)) { vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); } nbits |= ATTR_AP(ATTR_AP_RO); } if ((prot & VM_PROT_EXECUTE) == 0) nbits |= ATTR_XN; pmap_set(l3p, nbits); /* XXX: Use pmap_invalidate_range */ pmap_invalidate_page(pmap, sva); } } PMAP_UNLOCK(pmap); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. 
*
 * Returns whatever vm_radix_insert() returns; pmap_promote_l2() treats a
 * non-zero result as failure.  "pmap" must be locked (asserted).
 */
static __inline int
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_insert(&pmap->pm_root, mpte));
}

/*
 * Removes the page table page mapping the specified virtual address from the
 * specified pmap's collection of idle page table pages, and returns it.
 * Otherwise, returns NULL if there is no page table page corresponding to the
 * specified virtual address.
 *
 * The lookup key is pmap_l2_pindex(va).  "pmap" must be locked (asserted).
 */
static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
}

/*
 * Performs a break-before-make update of a pmap entry. This is needed when
 * either promoting or demoting pages to ensure the TLB doesn't get into an
 * inconsistent state.
 *
 * The sequence is: clear the entry, invalidate [va, va + size), store
 * "newpte".  It runs with interrupts disabled inside a critical section.
 * "pmap" must be locked (asserted).
 */
static void
pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
    vm_offset_t va, vm_size_t size)
{
	register_t intr;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Ensure we don't get switched out with the page table in an
	 * inconsistent state. We also need to ensure no interrupts fire
	 * as they may make use of an address we are about to invalidate.
	 */
	intr = intr_disable();
	critical_enter();

	/* Clear the old mapping */
	pmap_load_clear(pte);
	pmap_invalidate_range(pmap, va, va + size);

	/* Create the new mapping */
	pmap_load_store(pte, newpte);

	critical_exit();
	intr_restore(intr);
}

/*
 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
 * replace the many pv entries for the 4KB page mappings by a single pv entry
 * for the 2MB page mapping.
 *
 * "pa" must be L2-aligned (asserted).
 */
static void
pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	KASSERT((pa & L2_OFFSET) == 0,
	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the first page's pv entry for this mapping to the 2mpage's
	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
	 * a transfer avoids the possibility that get_pv_entry() calls
	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
	 * mappings that is being promoted.
	 */
	m = PHYS_TO_VM_PAGE(pa);
	va = va & ~L2_OFFSET;
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	pvh->pv_gen++;
	/* Free the remaining NPTEPG - 1 pv entries. */
	va_last = va + L2_SIZE - PAGE_SIZE;
	do {
		m++;
		va += PAGE_SIZE;
		pmap_pvh_free(&m->md, pmap, va);
	} while (va < va_last);
}

/*
 * Tries to promote the 512, contiguous 4KB page mappings that are within a
 * single level 2 table entry to a single 2MB page mapping.  For promotion
 * to occur, two conditions must be met: (1) the 4KB page mappings must map
 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
 * identical characteristics.
 *
 * Both conditions are checked with a single comparison per entry: each L3
 * entry must equal the first entry plus its offset within the 2MB region.
 * Every failure path bumps pmap_l2_p_failures and leaves the mappings
 * unchanged.  "pmap" must be locked (asserted).
 */
static void
pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
    struct rwlock **lockp)
{
	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
	vm_page_t mpte;
	vm_offset_t sva;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	sva = va & ~L2_OFFSET;
	firstl3 = pmap_l2_to_l3(l2, sva);
	newl2 = pmap_load(firstl3);

	/* Check the alignment is valid */
	if (((newl2 & ~ATTR_MASK) & L2_OFFSET) != 0) {
		atomic_add_long(&pmap_l2_p_failures, 1);
		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return;
	}

	/*
	 * Walk the table from the last entry down to the second, expecting
	 * each entry to be the first entry advanced by its page offset.
	 */
	pa = newl2 + L2_SIZE - PAGE_SIZE;
	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
		oldl3 = pmap_load(l3);
		if (oldl3 != pa) {
			atomic_add_long(&pmap_l2_p_failures, 1);
			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
			    " in pmap %p", va, pmap);
			return;
		}
		pa -= PAGE_SIZE;
	}

	/*
	 * Save the page table page in its current state until the L2
	 * mapping the superpage is demoted by pmap_demote_l2() or
	 * destroyed by pmap_remove_l3().
	 *
	 * NOTE(review): in this file it is pmap_remove_l2() that calls
	 * pmap_remove_pt_page(); confirm which routine is meant above.
	 */
	mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
	KASSERT(mpte >= vm_page_array &&
	    mpte < &vm_page_array[vm_page_array_size],
	    ("pmap_promote_l2: page table page is out of range"));
	KASSERT(mpte->pindex == pmap_l2_pindex(va),
	    ("pmap_promote_l2: page table page's pindex is wrong"));
	if (pmap_insert_pt_page(pmap, mpte)) {
		atomic_add_long(&pmap_l2_p_failures, 1);
		CTR2(KTR_PMAP,
		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
		    pmap);
		return;
	}

	if ((newl2 & ATTR_SW_MANAGED) != 0)
		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);

	/* Turn the first L3 entry's value into an L2 block descriptor. */
	newl2 &= ~ATTR_DESCR_MASK;
	newl2 |= L2_BLOCK;

	pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);

	atomic_add_long(&pmap_l2_promotions, 1);
	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
		    pmap);
}

/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
*/
/*
 * pmap_enter() builds a level 3 entry for page "m" from "prot" and "flags",
 * locates (or creates) the level 3 slot for "va" and installs the entry.
 *
 * Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when PMAP_ENTER_NOSLEEP
 * is set in "flags" and pmap_alloc_l3() returns NULL.  "psind" is unused.
 */
int
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    u_int flags, int8_t psind __unused)
{
	struct rwlock *lock;
	pd_entry_t *pde;
	pt_entry_t new_l3, orig_l3;
	pt_entry_t *l2, *l3;
	pv_entry_t pv;
	vm_paddr_t opa, pa, l1_pa, l2_pa, l3_pa;
	vm_page_t mpte, om, l1_m, l2_m, l3_m;
	boolean_t nosleep;
	int lvl;

	va = trunc_page(va);
	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
		VM_OBJECT_ASSERT_LOCKED(m->object);

	/* Compose the new level 3 entry from the page, prot and flags. */
	pa = VM_PAGE_TO_PHYS(m);
	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
	    L3_PAGE);
	if ((prot & VM_PROT_WRITE) == 0)
		new_l3 |= ATTR_AP(ATTR_AP_RO);
	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
		new_l3 |= ATTR_XN;
	if ((flags & PMAP_ENTER_WIRED) != 0)
		new_l3 |= ATTR_SW_WIRED;
	if (va < VM_MAXUSER_ADDRESS)
		new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;

	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);

	mpte = NULL;

	lock = NULL;
	PMAP_LOCK(pmap);

	/*
	 * If "va" is currently covered by an L2 block, demote it so that a
	 * single level 3 entry can be replaced.
	 */
	pde = pmap_pde(pmap, va, &lvl);
	if (pde != NULL && lvl == 1) {
		l2 = pmap_l1_to_l2(pde, va);
		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
		    (l3 = pmap_demote_l2_locked(pmap, l2, va & ~L2_OFFSET,
		    &lock)) != NULL) {
			l3 = &l3[pmap_l3_index(va)];
			if (va < VM_MAXUSER_ADDRESS) {
				mpte = PHYS_TO_VM_PAGE(
				    pmap_load(l2) & ~ATTR_MASK);
				mpte->wire_count++;
			}
			/*
			 * NOTE(review): "pde" still refers to the L1 entry on
			 * this path, yet it is later handed to
			 * pmap_promote_l2() -- confirm that is intended.
			 */
			goto havel3;
		}
	}

	if (va < VM_MAXUSER_ADDRESS) {
		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
		if (mpte == NULL && nosleep) {
			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
			if (lock != NULL)
				rw_wunlock(lock);
			PMAP_UNLOCK(pmap);
			return (KERN_RESOURCE_SHORTAGE);
		}
		pde = pmap_pde(pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_enter: Invalid page entry, va: 0x%lx", va));
		KASSERT(lvl == 2,
		    ("pmap_enter: Invalid level %d", lvl));

		l3 = pmap_l2_to_l3(pde, va);
	} else {
		/*
		 * If we get a level 2 pde it must point to a level 3 entry
		 * otherwise we will need to create the intermediate tables
		 */
		if (lvl < 2) {
			switch(lvl) {
			default:
			case -1:
				/* Get the l0 pde to update */
				pde = pmap_l0(pmap, va);
				KASSERT(pde != NULL, ("..."));

				l1_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
				    VM_ALLOC_ZERO);
				if (l1_m == NULL)
					panic("pmap_enter: l1 pte_m == NULL");
				if ((l1_m->flags & PG_ZERO) == 0)
					pmap_zero_page(l1_m);

				l1_pa = VM_PAGE_TO_PHYS(l1_m);
				pmap_load_store(pde, l1_pa | L0_TABLE);
				/* FALLTHROUGH */
			case 0:
				/*
				 * Get the l1 pde to update.  "pde" is an L0
				 * entry here, so the step down must use
				 * pmap_l0_to_l1(); pmap_l1_to_l2() would
				 * select the slot with the L2 index.
				 */
				pde = pmap_l0_to_l1(pde, va);
				KASSERT(pde != NULL, ("..."));

				l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
				    VM_ALLOC_ZERO);
				if (l2_m == NULL)
					panic("pmap_enter: l2 pte_m == NULL");
				if ((l2_m->flags & PG_ZERO) == 0)
					pmap_zero_page(l2_m);

				l2_pa = VM_PAGE_TO_PHYS(l2_m);
				pmap_load_store(pde, l2_pa | L1_TABLE);
				/* FALLTHROUGH */
			case 1:
				/* Get the l2 pde to update */
				pde = pmap_l1_to_l2(pde, va);

				l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
				    VM_ALLOC_ZERO);
				if (l3_m == NULL)
					panic("pmap_enter: l3 pte_m == NULL");
				if ((l3_m->flags & PG_ZERO) == 0)
					pmap_zero_page(l3_m);

				l3_pa = VM_PAGE_TO_PHYS(l3_m);
				pmap_load_store(pde, l3_pa | L2_TABLE);
				break;
			}
		}
		l3 = pmap_l2_to_l3(pde, va);
		pmap_invalidate_page(pmap, va);
	}
havel3:

	om = NULL;
	orig_l3 = pmap_load(l3);
	opa = orig_l3 & ~ATTR_MASK;

	/*
	 * Is the specified virtual address already mapped?
	 */
	if (pmap_l3_valid(orig_l3)) {
		/*
		 * Wiring change, just update stats. We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them. Hence, if a user page is wired,
		 * the PT page will be also.
		 */
		if ((flags & PMAP_ENTER_WIRED) != 0 &&
		    (orig_l3 & ATTR_SW_WIRED) == 0)
			pmap->pm_stats.wired_count++;
		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
		    (orig_l3 & ATTR_SW_WIRED) != 0)
			pmap->pm_stats.wired_count--;

		/*
		 * Remove the extra PT page reference.
		 */
		if (mpte != NULL) {
			mpte->wire_count--;
			KASSERT(mpte->wire_count > 0,
			    ("pmap_enter: missing reference to page table page,"
			     " va: 0x%lx", va));
		}

		/*
		 * Has the physical page changed?
		 */
		if (opa == pa) {
			/*
			 * No, might be a protection or wiring change.
			 */
			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
				new_l3 |= ATTR_SW_MANAGED;
				/*
				 * NOTE(review): this test masks with
				 * ATTR_AP(ATTR_AP_RW) while the equivalent
				 * test below masks with ATTR_AP_RW_BIT --
				 * confirm which one is intended.
				 */
				if ((new_l3 & ATTR_AP(ATTR_AP_RW)) ==
				    ATTR_AP(ATTR_AP_RW)) {
					vm_page_aflag_set(m, PGA_WRITEABLE);
				}
			}
			goto validate;
		}
	} else {
		/*
		 * Increment the counters.
		 */
		if ((new_l3 & ATTR_SW_WIRED) != 0)
			pmap->pm_stats.wired_count++;
		pmap_resident_count_inc(pmap, 1);
	}
	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0) {
		new_l3 |= ATTR_SW_MANAGED;
		pv = get_pv_entry(pmap, &lock);
		pv->pv_va = va;
		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
		m->md.pv_gen++;
		if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
			vm_page_aflag_set(m, PGA_WRITEABLE);
	}

	/*
	 * Update the L3 entry.
	 */
	if (orig_l3 != 0) {
validate:
		orig_l3 = pmap_load(l3);
		opa = orig_l3 & ~ATTR_MASK;

		if (opa != pa) {
			/* A different page was mapped here: replace it. */
			pmap_update_entry(pmap, l3, new_l3, va, PAGE_SIZE);
			if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
				om = PHYS_TO_VM_PAGE(opa);
				if (pmap_page_dirty(orig_l3))
					vm_page_dirty(om);
				if ((orig_l3 & ATTR_AF) != 0)
					vm_page_aflag_set(om, PGA_REFERENCED);
				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
				pmap_pvh_free(&om->md, pmap, va);
				if ((om->aflags & PGA_WRITEABLE) != 0 &&
				    TAILQ_EMPTY(&om->md.pv_list) &&
				    ((om->flags & PG_FICTITIOUS) != 0 ||
				    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
					vm_page_aflag_clear(om, PGA_WRITEABLE);
			}
		} else {
			/* Same page: only the attributes change. */
			pmap_load_store(l3, new_l3);
			pmap_invalidate_page(pmap, va);
			if (pmap_page_dirty(orig_l3) &&
			    (orig_l3 & ATTR_SW_MANAGED) != 0)
				vm_page_dirty(m);
		}
	} else {
		pmap_load_store(l3, new_l3);
	}

	pmap_invalidate_page(pmap, va);

	if (pmap != pmap_kernel()) {
		if (pmap == &curproc->p_vmspace->vm_pmap &&
		    (prot & VM_PROT_EXECUTE) != 0)
			cpu_icache_sync_range(va, PAGE_SIZE);

		/* Attempt a superpage promotion once the L3 page is full. */
		if ((mpte == NULL || mpte->wire_count == NL3PG) &&
		    pmap_superpages_enabled() &&
		    (m->flags & PG_FICTITIOUS) == 0 &&
		    vm_reserv_level_iffullpop(m) == 0) {
			pmap_promote_l2(pmap, pde, va, &lock);
		}
	}

	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
	return (KERN_SUCCESS);
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.  Not every virtual page between start and end
 * is mapped; only those for which a resident page exists with the
 * corresponding offset from m_start are mapped.
*/
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
    vm_page_t m_start, vm_prot_t prot)
{
	struct rwlock *lock;
	vm_offset_t va;
	vm_page_t m, mpte;
	vm_pindex_t diff, psize;

	VM_OBJECT_ASSERT_LOCKED(m_start->object);

	/* Size of [start, end) expressed in pages. */
	psize = atop(end - start);
	mpte = NULL;
	m = m_start;
	lock = NULL;
	PMAP_LOCK(pmap);
	/* Follow the listq chain while the page offset stays inside the range. */
	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
		va = start + ptoa(diff);
		/* The returned page table page is passed back in as a hint. */
		mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock);
		m = TAILQ_NEXT(m, listq);
	}
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
}

/*
 * this code makes some *MAJOR* assumptions:
 * 1. Current pmap & pmap exists.
 * 2. Not wired.
 * 3. Read access.
 * 4. No page table pages.
 * but is *MUCH* faster than pmap_enter...
 *
 * This wrapper takes the pmap lock, calls pmap_enter_quick_locked() with no
 * page table page hint, and drops whatever PV list lock that call acquired.
 */

void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
	struct rwlock *lock;

	lock = NULL;
	PMAP_LOCK(pmap);
	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
	if (lock != NULL)
		rw_wunlock(lock);
	PMAP_UNLOCK(pmap);
}

/*
 * Install a read-only level 3 mapping of "m" at "va" without sleeping.
 * The pmap lock must be held by the caller.
 *
 * "mpte" is an optional hint: the page table page used by a previous call.
 * Returns the page table page backing "va" when a user mapping was
 * installed; returns NULL for kernel addresses and whenever the function
 * gives up (L2 block present, allocation failure, slot already in use, or
 * PV entry insertion failure).
 */
static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
	struct spglist free;
	pd_entry_t *pde;
	pt_entry_t *l2, *l3;
	vm_paddr_t pa;
	int lvl;

	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
	    (m->oflags & VPO_UNMANAGED) != 0,
	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		vm_pindex_t l2pindex;

		/*
		 * Calculate pagetable page index
		 */
		l2pindex = pmap_l2_pindex(va);
		if (mpte && (mpte->pindex == l2pindex)) {
			/* The hint covers "va": just take another reference. */
			mpte->wire_count++;
		} else {
			/*
			 * Get the l2 entry
			 */
			pde = pmap_pde(pmap, va, &lvl);

			/*
			 * If the page table page is mapped, we just increment
			 * the hold count, and activate it.  Otherwise, we
			 * attempt to allocate a page table page.  If this
			 * attempt fails, we don't retry.  Instead, we give up.
			 */
			if (lvl == 1) {
				/* Never disturb an existing L2 block mapping. */
				l2 = pmap_l1_to_l2(pde, va);
				if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
				    L2_BLOCK)
					return (NULL);
			}
			if (lvl == 2 && pmap_load(pde) != 0) {
				mpte =
				    PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
				mpte->wire_count++;
			} else {
				/*
				 * Pass NULL instead of the PV list lock
				 * pointer, because we don't intend to sleep.
				 */
				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
				if (mpte == NULL)
					return (mpte);
			}
		}
		/* Reach the level 3 table through the direct map. */
		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
		l3 = &l3[pmap_l3_index(va)];
	} else {
		mpte = NULL;
		pde = pmap_pde(kernel_pmap, va, &lvl);
		KASSERT(pde != NULL,
		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
		     va));
		KASSERT(lvl == 2,
		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
		l3 = pmap_l2_to_l3(pde, va);
	}

	/* The slot is already in use: undo the reference taken above. */
	if (pmap_load(l3) != 0) {
		if (mpte != NULL) {
			mpte->wire_count--;
			mpte = NULL;
		}
		return (mpte);
	}

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0 &&
	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
		if (mpte != NULL) {
			SLIST_INIT(&free);
			if (pmap_unwire_l3(pmap, va, mpte, &free)) {
				pmap_invalidate_page(pmap, va);
				pmap_free_zero_pages(&free);
			}
			mpte = NULL;
		}
		return (mpte);
	}

	/*
	 * Increment counters
	 */
	pmap_resident_count_inc(pmap, 1);

	/* "pa" holds the complete level 3 entry from here on. */
	pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
	    ATTR_AP(ATTR_AP_RO) | L3_PAGE;
	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
		pa |= ATTR_XN;
	else if (va < VM_MAXUSER_ADDRESS)
		pa |= ATTR_PXN;

	/*
	 * Now validate mapping with RO protection
	 */
	if ((m->oflags & VPO_UNMANAGED) == 0)
		pa |= ATTR_SW_MANAGED;
	pmap_load_store(l3, pa);
	pmap_invalidate_page(pmap, va);
	return (mpte);
}

/*
 * This code maps large physical mmap regions into the
 * processor address space.  Note that some shortcuts
 * are taken, but the code works.
*/
/*
 * In this implementation the function only checks its preconditions (object
 * write-locked, device or SG type); it creates no mappings.
 */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
{

	VM_OBJECT_ASSERT_WLOCKED(object);
	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
	    ("pmap_object_init_pt: non-device object"));
}

/*
 *	Clear the wired attribute from the mappings for the specified range of
 *	addresses in the given pmap.  Every valid mapping within that range
 *	must have the wired attribute set.  In contrast, invalid mappings
 *	cannot have the wired attribute set, so they are ignored.
 *
 *	The wired attribute of the page table entry is not a hardware feature,
 *	so there is no need to invalidate any TLB entries.
 */
void
pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t va_next;
	pd_entry_t *l0, *l1, *l2;
	pt_entry_t *l3;

	PMAP_LOCK(pmap);
	for (; sva < eva; sva = va_next) {
		/* Skip a whole L0 region when its entry is empty. */
		l0 = pmap_l0(pmap, sva);
		if (pmap_load(l0) == 0) {
			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
			/* Address wrap-around: stop at eva. */
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		/* Likewise for an empty L1 entry. */
		l1 = pmap_l0_to_l1(l0, sva);
		if (pmap_load(l1) == 0) {
			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
			if (va_next < sva)
				va_next = eva;
			continue;
		}

		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
		if (va_next < sva)
			va_next = eva;

		l2 = pmap_l1_to_l2(l1, sva);
		if (pmap_load(l2) == 0)
			continue;

		/* An L2 block is demoted so that level 3 entries can be edited. */
		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
			l3 = pmap_demote_l2(pmap, l2, sva);
			if (l3 == NULL)
				continue;
		}
		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_unwire: Invalid l2 entry after demotion"));

		if (va_next > eva)
			va_next = eva;
		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
		    sva += L3_SIZE) {
			if (pmap_load(l3) == 0)
				continue;
			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
				panic("pmap_unwire: l3 %#jx is missing "
				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));

			/*
			 * ATTR_SW_WIRED is cleared with an atomic operation.
			 * The original comment used the names PG_W, PG_M and
			 * PG_A (presumably inherited from another
			 * architecture's pmap): although the pmap lock
			 * synchronizes access to the wired bit, other bits of
			 * the entry could be updated concurrently by another
			 * processor.
			 */
			atomic_clear_long(l3, ATTR_SW_WIRED);
			pmap->pm_stats.wired_count--;
		}
	}
	PMAP_UNLOCK(pmap);
}

/*
 *	Copy the range specified by src_addr/len
 *	from the source map to the range dst_addr/len
 *	in the destination map.
 *
 *	This routine is only advisory and need not do anything.
 *	This implementation does nothing.
 */

void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
}

/*
 *	pmap_zero_page zeros the specified hardware page.  The page is
 *	addressed through the direct map (PHYS_TO_DMAP) and cleared with
 *	pagezero().
 */
void
pmap_zero_page(vm_page_t m)
{
	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

	pagezero((void *)va);
}

/*
 *	pmap_zero_page_area zeros "size" bytes starting at byte offset "off"
 *	of the specified hardware page, addressed through the direct map.
 *	A request for the whole page uses pagezero(); anything else uses
 *	bzero().
 *
 *	off and size may not cover an area beyond a single hardware page.
 */
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

	if (off == 0 && size == PAGE_SIZE)
		pagezero((void *)va);
	else
		bzero((char *)va + off, size);
}

/*
 *	pmap_copy_page copies the specified (machine independent)
 *	page by mapping the page into virtual memory and using
 *	bcopy to copy the page, one machine dependent page at a
 *	time.
*/ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. 
*/
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	struct md_page *pvh;
	struct rwlock *lock;
	pv_entry_t pv;
	int loops = 0;
	boolean_t rv;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_page_exists_quick: page %p is not managed", m));
	rv = FALSE;
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
	/* First the page's own PV list ... */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		if (PV_PMAP(pv) == pmap) {
			rv = TRUE;
			break;
		}
		loops++;
		if (loops >= 16)
			break;
	}
	/*
	 * ... then, with whatever is left of the 16-entry budget, the PV
	 * list returned by pa_to_pvh() (skipped for fictitious pages).
	 */
	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			if (PV_PMAP(pv) == pmap) {
				rv = TRUE;
				break;
			}
			loops++;
			if (loops >= 16)
				break;
		}
	}
	rw_runlock(lock);
	return (rv);
}

/*
 *	pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.  Unmanaged pages always yield 0.
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
	struct rwlock *lock;
	struct md_page *pvh;
	pmap_t pmap;
	pt_entry_t *pte;
	pv_entry_t pv;
	int count, lvl, md_gen, pvh_gen;

	if ((m->oflags & VPO_UNMANAGED) != 0)
		return (0);
	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_rlock(lock);
restart:
	count = 0;
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
		pmap = PV_PMAP(pv);
		/*
		 * If the pmap lock cannot be taken immediately, drop the PV
		 * list lock, block on the pmap lock, retake the PV list lock
		 * and start over if the list generation changed meanwhile.
		 */
		if (!PMAP_TRYLOCK(pmap)) {
			md_gen = m->md.pv_gen;
			rw_runlock(lock);
			PMAP_LOCK(pmap);
			rw_rlock(lock);
			if (md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto restart;
			}
		}
		pte = pmap_pte(pmap, pv->pv_va, &lvl);
		if (pte != NULL &&
		    (pmap_load(pte) & ATTR_SW_WIRED) != 0)
			count++;
		PMAP_UNLOCK(pmap);
	}
	if ((m->flags & PG_FICTITIOUS) == 0) {
		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
			pmap = PV_PMAP(pv);
			/* Same protocol, now watching both generation counts. */
			if (!PMAP_TRYLOCK(pmap)) {
				md_gen = m->md.pv_gen;
				pvh_gen = pvh->pv_gen;
				rw_runlock(lock);
				PMAP_LOCK(pmap);
				rw_rlock(lock);
				if (md_gen != m->md.pv_gen ||
				    pvh_gen != pvh->pv_gen) {
					PMAP_UNLOCK(pmap);
					goto restart;
				}
			}
			pte = pmap_pte(pmap, pv->pv_va, &lvl);
			if (pte != NULL &&
			    (pmap_load(pte) & ATTR_SW_WIRED) != 0)
				count++;
			PMAP_UNLOCK(pmap);
		}
	}
	rw_runlock(lock);
	return (count);
}

/*
 * Destroy
all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, ml3, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx, lvl; vm_paddr_t pa; lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("Attempting to remove an unmapped page")); switch(lvl) { case 1: pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("Attempting to remove an invalid " "block: %lx", tpte)); tpte = pmap_load(pte); break; case 2: pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("Attempting to remove an invalid " "page: %lx", tpte)); break; default: panic( "Invalid page directory level: %d", lvl); } /* * We cannot remove wired 
pages from a process' mapping at this time */ if (tpte & ATTR_SW_WIRED) { allfree = 0; continue; } pa = tpte & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); pmap_load_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) { switch (lvl) { case 1: for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(m); break; case 2: vm_page_dirty(m); break; } } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; switch (lvl) { case 1: pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); pvh = pa_to_pvh(tpte & ~ATTR_MASK); TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { pmap_resident_count_dec(pmap,1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_pages: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, &free, FALSE); atomic_subtract_int( &vm_cnt.v_wire_count, 1); } break; case 2: pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh( VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } break; } pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { 
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * This is used to check if a page has been accessed or modified. As we * don't have a bit to see if it has been modified we have to assume it * has been if the page is read/write. */ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask, value; pmap_t pmap; int lvl, md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 3, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 2, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L2_BLOCK; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: 
rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; int lvl; rv = FALSE; PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL && pmap_load(pte) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pt_entry_t oldpte, *pte; vm_offset_t va; int lvl, md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. 
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; pte = pmap_pte(pmap, pv->pv_va, &lvl); if ((pmap_load(pte) & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) pmap_demote_l2_locked(pmap, pte, va & ~L2_OFFSET, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); retry: oldpte = pmap_load(pte); if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) { if (!atomic_cmpset_long(pte, oldpte, oldpte | ATTR_AP(ATTR_AP_RO))) goto retry; if ((oldpte & ATTR_AF) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. 
This opportunistic update can be
 *	performed at low cost and can eliminate the need for some future calls
 *	to pmap_is_modified().  However, since this function stops after
 *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
 *	dirty pages.  Those dirty pages will only be detected by a future call
 *	to pmap_is_modified().
 *
 *	Because safe_to_clear_referenced() reports FALSE, a reference is
 *	"cleared" here by removing the mapping rather than by clearing
 *	ATTR_AF in place.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf;
	pmap_t pmap;
	struct rwlock *lock;
	pd_entry_t *pde, tpde;
	pt_entry_t *pte, tpte;
	pt_entry_t *l3;
	vm_offset_t va;
	vm_paddr_t pa;
	int cleared, md_gen, not_cleared, lvl, pvh_gen;
	struct spglist free;
	bool demoted;

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_ts_referenced: page %p is not managed", m));
	SLIST_INIT(&free);
	cleared = 0;
	pa = VM_PAGE_TO_PHYS(m);
	lock = PHYS_TO_PV_LIST_LOCK(pa);
	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
	rw_wlock(lock);
retry:
	not_cleared = 0;
	/* Superpage (2MB) mappings first. */
	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
		goto small_mappings;
	pv = pvf;
	do {
		/* "pvf" marks where the rotation started; reset if it was removed. */
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		va = pv->pv_va;
		pde = pmap_pde(pmap, pv->pv_va, &lvl);
		KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found"));
		KASSERT(lvl == 1,
		    ("pmap_ts_referenced: invalid pde level %d", lvl));
		tpde = pmap_load(pde);
		KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE,
		    ("pmap_ts_referenced: found an invalid l1 table"));
		pte = pmap_l1_to_l2(pde, pv->pv_va);
		tpte = pmap_load(pte);
		if (pmap_page_dirty(tpte)) {
			/*
			 * Although "tpte" is mapping a 2MB page, because
			 * this function is called at a 4KB page granularity,
			 * we only update the 4KB page under test.
			 */
			vm_page_dirty(m);
		}
		if ((tpte & ATTR_AF) != 0) {
			/*
			 * Since this reference bit is shared by 512 4KB
			 * pages, it should not be cleared every time it is
			 * tested.  Apply a simple "hash" function on the
			 * physical page number, the virtual superpage number,
			 * and the pmap address to select one 4KB page out of
			 * the 512 on which testing the reference bit will
			 * result in clearing that reference bit.  This
			 * function is designed to avoid the selection of the
			 * same 4KB page for every 2MB page mapping.
			 *
			 * On demotion, a mapping that hasn't been referenced
			 * is simply destroyed.  To avoid the possibility of a
			 * subsequent page fault on a demoted wired mapping,
			 * always leave its reference bit set.  Moreover,
			 * since the superpage is wired, the current state of
			 * its reference bit won't affect page replacement.
			 */
			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
			    (tpte & ATTR_SW_WIRED) == 0) {
				if (safe_to_clear_referenced(pmap, tpte)) {
					/*
					 * TODO: We don't handle the access
					 * flag at all. We need to be able
					 * to set it in  the exception handler.
					 */
					panic("ARM64TODO: "
					    "safe_to_clear_referenced\n");
				} else if (pmap_demote_l2_locked(pmap, pte,
				    pv->pv_va, &lock) != NULL) {
					demoted = true;
					/* Point "va" at m's 4KB page inside the superpage. */
					va += VM_PAGE_TO_PHYS(m) -
					    (tpte & ~ATTR_MASK);
					l3 = pmap_l2_to_l3(pte, va);
					pmap_remove_l3(pmap, l3, va,
					    pmap_load(pte), NULL, &lock);
				} else
					demoted = true;

				/*
				 * Both non-panicking branches above set
				 * "demoted", so it is always true here.
				 */
				if (demoted) {
					/*
					 * The superpage mapping was removed
					 * entirely and therefore 'pv' is no
					 * longer valid.
					 */
					if (pvf == pv)
						pvf = NULL;
					pv = NULL;
				}
				cleared++;
				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
				    ("inconsistent pv lock %p %p for page %p",
				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
			pvh->pv_gen++;
		}
		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
			goto out;
	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
small_mappings:
	/* Then the 4KB mappings. */
	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
		goto out;
	pv = pvf;
	do {
		if (pvf == NULL)
			pvf = pv;
		pmap = PV_PMAP(pv);
		if (!PMAP_TRYLOCK(pmap)) {
			pvh_gen = pvh->pv_gen;
			md_gen = m->md.pv_gen;
			rw_wunlock(lock);
			PMAP_LOCK(pmap);
			rw_wlock(lock);
			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
				PMAP_UNLOCK(pmap);
				goto retry;
			}
		}
		pde = pmap_pde(pmap, pv->pv_va, &lvl);
		KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
		KASSERT(lvl == 2,
		    ("pmap_ts_referenced: invalid pde level %d", lvl));
		tpde = pmap_load(pde);
		KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
		    ("pmap_ts_referenced: found an invalid l2 table"));
		pte = pmap_l2_to_l3(pde, pv->pv_va);
		tpte = pmap_load(pte);
		if (pmap_page_dirty(tpte))
			vm_page_dirty(m);
		if ((tpte & ATTR_AF) != 0) {
			if (safe_to_clear_referenced(pmap, tpte)) {
				/*
				 * TODO: We don't handle the access flag
				 * at all. We need to be able to set it in
				 * the exception handler.
				 */
				panic("ARM64TODO: safe_to_clear_referenced\n");
			} else if ((tpte & ATTR_SW_WIRED) == 0) {
				/*
				 * Wired pages cannot be paged out so
				 * doing accessed bit emulation for
				 * them is wasted effort. We do the
				 * hard work for unwired pages only.
				 */
				pmap_remove_l3(pmap, pte, pv->pv_va, tpde,
				    &free, &lock);
				pmap_invalidate_page(pmap, pv->pv_va);
				cleared++;
				/* The mapping is gone, so "pv" must not be used again. */
				if (pvf == pv)
					pvf = NULL;
				pv = NULL;
				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
				    ("inconsistent pv lock %p %p for page %p",
				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
			} else
				not_cleared++;
		}
		PMAP_UNLOCK(pmap);
		/* Rotate the PV list if it has more than one entry. */
		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
			m->md.pv_gen++;
		}
	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
	    not_cleared < PMAP_TS_REFERENCED_MAX);
out:
	rw_wunlock(lock);
	pmap_free_zero_pages(&free);
	return (cleared + not_cleared);
}

/*
 *	Apply the given advice to the specified range of addresses within the
 *	given pmap.  Depending on the advice, clear the referenced and/or
 *	modified flags in each mapping and set the mapped page's dirty field.
 *
 *	This implementation is empty: the advice is ignored.
 */
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
}

/*
 *	Clear the modify bits on the specified physical page.
 *
 *	Only the preconditions are checked here; no page table entry is
 *	changed (see the ARM64TODO below).
 */
void
pmap_clear_modify(vm_page_t m)
{

	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
	    ("pmap_clear_modify: page %p is not managed", m));
	VM_OBJECT_ASSERT_WLOCKED(m->object);
	KASSERT(!vm_page_xbusied(m),
	    ("pmap_clear_modify: page %p is exclusive busied", m));

	/*
	 * If the page is not PGA_WRITEABLE, then no writeable mapping of it
	 * can exist.  If the object containing the page is locked and the
	 * page is not exclusive busied, then PGA_WRITEABLE cannot be
	 * concurrently set.
	 */
	if ((m->aflags & PGA_WRITEABLE) == 0)
		return;

	/* ARM64TODO: We lack support for tracking if a page is modified */
}

/*
 * Return the direct map address of physical address "pa"; "size" is not
 * used.
 */
void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{

	return ((void *)PHYS_TO_DMAP(pa));
}

/* Counterpart of pmap_mapbios(); nothing to undo here. */
void
pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
{
}

/*
 * Sets the memory attribute for the specified page.
 */
void
pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
{

	m->md.pv_memattr = ma;

	/*
	 * If "m" is a normal page, update its direct mapping.  This update
	 * can be relied upon to perform any cache operations that are
	 * required for data coherence.
	 */
	if ((m->flags & PG_FICTITIOUS) == 0 &&
	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
	    m->md.pv_memattr) != 0)
		panic("memory attribute change on the direct map failed");
}

/*
 * Changes the specified virtual address range's memory type to that given by
 * the parameter "mode".  The specified virtual address range must be
 * completely contained within either the direct map or the kernel map.  If
 * the virtual address range is contained within the kernel map, then the
 * memory type for each of the corresponding ranges of the direct map is also
 * changed.  (The corresponding ranges of the direct map are those ranges that
 * map the same physical pages as the specified virtual address range.)  These
 * changes to the direct map are necessary because Intel describes the
 * behavior of their processors as "undefined" if two or more mappings to the
 * same physical page have different memory types.
 *
 * Returns zero if the change completed successfully, and either EINVAL or
 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 * of the virtual address range was not mapped, and ENOMEM is returned if
 * there was insufficient memory available to complete the change.  In the
 * latter case, the memory type may have been changed on some part of the
 * virtual address range or the direct map.
*/ static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pt_entry_t l3, *pte, *newpte; int lvl; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); if (!VIRT_IN_DMAP(base)) return (EINVAL); for (tmpva = base; tmpva < base + size; ) { pte = pmap_pte(kernel_pmap, va, &lvl); if (pte == NULL) return (EINVAL); if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) { /* * We already have the correct attribute, * ignore this entry. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; break; case 2: tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; break; case 3: tmpva += PAGE_SIZE; break; } } else { /* * Split the entry to an level 3 table, then * set the new attribute. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: newpte = pmap_demote_l1(kernel_pmap, pte, tmpva & ~L1_OFFSET); if (newpte == NULL) return (EINVAL); pte = pmap_l1_to_l2(pte, tmpva); case 2: newpte = pmap_demote_l2(kernel_pmap, pte, tmpva & ~L2_OFFSET); if (newpte == NULL) return (EINVAL); pte = pmap_l2_to_l3(pte, tmpva); case 3: /* Update the entry */ l3 = pmap_load(pte); l3 &= ~ATTR_IDX_MASK; l3 |= ATTR_IDX(mode); if (mode == DEVICE_MEMORY) l3 |= ATTR_XN; pmap_update_entry(kernel_pmap, pte, l3, tmpva, PAGE_SIZE); /* * If moving to a non-cacheable entry flush * the cache. */ if (mode == VM_MEMATTR_UNCACHEABLE) cpu_dcache_wbinv_range(tmpva, L3_SIZE); break; } tmpva += PAGE_SIZE; } } return (0); } /* * Create an L2 table to map all addresses within an L1 mapping. 
*/ static pt_entry_t * pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) { pt_entry_t *l2, newl2, oldl1; vm_offset_t tmpl1; vm_paddr_t l2phys, phys; vm_page_t ml2; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl1 = pmap_load(l1); KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_demote_l1: Demoting a non-block entry")); KASSERT((va & L1_OFFSET) == 0, ("pmap_demote_l1: Invalid virtual address %#lx", va)); KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, ("pmap_demote_l1: Level 1 table shouldn't be managed")); tmpl1 = 0; if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { tmpl1 = kva_alloc(PAGE_SIZE); if (tmpl1 == 0) return (NULL); } if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" " in pmap %p", va, pmap); return (NULL); } l2phys = VM_PAGE_TO_PHYS(ml2); l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); /* Address the range points at */ phys = oldl1 & ~ATTR_MASK; /* The attributed from the old l1 table to be copied */ newl2 = oldl1 & ATTR_MASK; /* Create the new entries */ for (i = 0; i < Ln_ENTRIES; i++) { l2[i] = newl2 | phys; phys += L2_SIZE; } KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); if (tmpl1 != 0) { pmap_kenter(tmpl1, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY); l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); } pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); if (tmpl1 != 0) { pmap_kremove(tmpl1); kva_free(tmpl1, PAGE_SIZE); } return (l2); } /* * Create an L3 table to map all addresses within an L2 mapping. 
*/ static pt_entry_t * pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *l3, newl3, oldl2; vm_offset_t tmpl2; vm_paddr_t l3phys, phys; vm_page_t ml3; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); l3 = NULL; oldl2 = pmap_load(l2); KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_demote_l2: Demoting a non-block entry")); KASSERT((va & L2_OFFSET) == 0, ("pmap_demote_l2: Invalid virtual address %#lx", va)); tmpl2 = 0; if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { tmpl2 = kva_alloc(PAGE_SIZE); if (tmpl2 == 0) return (NULL); } if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (ml3 == NULL) { CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if (va < VM_MAXUSER_ADDRESS) pmap_resident_count_inc(pmap, 1); } l3phys = VM_PAGE_TO_PHYS(ml3); l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); /* Address the range points at */ phys = oldl2 & ~ATTR_MASK; /* The attributed from the old l2 table to be copied */ newl3 = (oldl2 & (ATTR_MASK & ~ATTR_DESCR_MASK)) | L3_PAGE; /* * If the page table page is new, initialize it. */ if (ml3->wire_count == 1) { for (i = 0; i < Ln_ENTRIES; i++) { l3[i] = newl3 | phys; phys += L3_SIZE; } } KASSERT(l3[0] == ((oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE), ("Invalid l3 page (%lx != %lx)", l3[0], (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE)); /* * Map the temporary page so we don't lose access to the l2 table. */ if (tmpl2 != 0) { pmap_kenter(tmpl2, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY); l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); } /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. 
Otherwise, the state * of the L2 and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l2() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); /* * Demote the PV entry. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" " in pmap %p %lx", va, pmap, l3[0]); fail: if (tmpl2 != 0) { pmap_kremove(tmpl2); kva_free(tmpl2, PAGE_SIZE); } return (l3); } static pt_entry_t * pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { struct rwlock *lock; pt_entry_t *l3; lock = NULL; l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (l3); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *l1p, l1; pd_entry_t *l2p, l2; pt_entry_t *l3p, l3; vm_paddr_t pa; bool managed; int val; PMAP_LOCK(pmap); retry: pa = 0; val = 0; managed = false; l1p = pmap_l1(pmap, addr); if (l1p == NULL) /* No l1 */ goto done; l1 = pmap_load(l1p); if ((l1 & ATTR_DESCR_MASK) == L1_INVAL) goto done; if ((l1 & ATTR_DESCR_MASK) == L1_BLOCK) { pa = (l1 & ~ATTR_MASK) | (addr & L1_OFFSET); managed = (l1 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; val = MINCORE_SUPER | MINCORE_INCORE; if (pmap_page_dirty(l1)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((l1 & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; goto done; } l2p = pmap_l1_to_l2(l1p, addr); if (l2p == NULL) /* No l2 */ goto done; l2 = pmap_load(l2p); if ((l2 & ATTR_DESCR_MASK) == L2_INVAL) goto done; if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK) { pa = (l2 & ~ATTR_MASK) | (addr & 
L2_OFFSET); managed = (l2 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; val = MINCORE_SUPER | MINCORE_INCORE; if (pmap_page_dirty(l2)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((l2 & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; goto done; } l3p = pmap_l2_to_l3(l2p, addr); if (l3p == NULL) /* No l3 */ goto done; l3 = pmap_load(l2p); if ((l3 & ATTR_DESCR_MASK) == L3_INVAL) goto done; if ((l3 & ATTR_DESCR_MASK) == L3_PAGE) { pa = (l3 & ~ATTR_MASK) | (addr & L3_OFFSET); managed = (l3 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; val = MINCORE_INCORE; if (pmap_page_dirty(l3)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((l3 & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } done: if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0); __asm __volatile("msr ttbr0_el1, %0" : : "r"(td->td_proc->p_md.md_l0addr)); pmap_invalidate_all(pmap); critical_exit(); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { if (va >= VM_MIN_KERNEL_ADDRESS) { cpu_icache_sync_range(va, sz); } else { u_int len, offset; vm_paddr_t pa; /* Find the length of data in this page to flush */ offset = va & PAGE_MASK; len = imin(PAGE_SIZE - offset, sz); while (sz != 0) { /* Extract the physical address & find it in the DMAP */ pa = pmap_extract(pmap, va); if (pa != 0) cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); /* Move to the next page */ sz -= len; va += len; /* Set the length for the next iteration */ len = imin(PAGE_SIZE, sz); } } } int pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) 
{ #ifdef SMP uint64_t par; #endif switch (ESR_ELx_EXCEPTION(esr)) { case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: break; default: return (KERN_FAILURE); } #ifdef SMP PMAP_LOCK(pmap); switch (esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: /* Ask the MMU to check the address */ if (pmap == kernel_pmap) par = arm64_address_translate_s1e1r(far); else par = arm64_address_translate_s1e0r(far); /* * If the translation was successful the address was invalid * due to a break-before-make sequence. We can unlock and * return success to the trap handler. */ if (PAR_SUCCESS(par)) { PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } break; default: break; } PMAP_UNLOCK(pmap); #endif return (KERN_FAILURE); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. 
* \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(!PHYS_IN_DMAP(paddr))) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); } } } Index: projects/runtime-coverage/sys/arm64/include/efi.h =================================================================== --- projects/runtime-coverage/sys/arm64/include/efi.h (revision 324497) +++ projects/runtime-coverage/sys/arm64/include/efi.h (revision 324498) @@ -1,12 +1,44 @@ /*- - * This file is in the public domain since it's just boilerplate. + * Copyright (c) 2017 Andrew Turner + * All rights reserved. 
* + * This software was developed by SRI International and the University of + * Cambridge Computer Laboratory under DARPA/AFRL contract FA8750-10-C-0237 + * ("CTSRD"), as part of the DARPA CRASH research programme. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * * $FreeBSD$ */ #ifndef __ARM64_INCLUDE_EFI_H_ #define __ARM64_INCLUDE_EFI_H_ #define EFIABI_ATTR + +#ifdef _KERNEL +#define EFI_TIME_LOCK() +#define EFI_TIME_UNLOCK() +#define EFI_TIME_OWNED() +#endif #endif /* __ARM64_INCLUDE_EFI_H_ */ Index: projects/runtime-coverage/sys/arm64/include/fpu.h =================================================================== --- projects/runtime-coverage/sys/arm64/include/fpu.h (nonexistent) +++ projects/runtime-coverage/sys/arm64/include/fpu.h (revision 324498) @@ -0,0 +1,6 @@ +/*- + * This file is in the public domain. + * + * $FreeBSD$ + */ +#include Property changes on: projects/runtime-coverage/sys/arm64/include/fpu.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: projects/runtime-coverage/sys/arm64/include/pte.h =================================================================== --- projects/runtime-coverage/sys/arm64/include/pte.h (revision 324497) +++ projects/runtime-coverage/sys/arm64/include/pte.h (revision 324498) @@ -1,123 +1,128 @@ /*- * Copyright (c) 2014 Andrew Turner * Copyright (c) 2014-2015 The FreeBSD Foundation * All rights reserved. * * This software was developed by Andrew Turner under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PTE_H_ #define _MACHINE_PTE_H_ #ifndef LOCORE typedef uint64_t pd_entry_t; /* page directory entry */ typedef uint64_t pt_entry_t; /* page table entry */ #endif /* Block and Page attributes */ /* TODO: Add the upper attributes */ #define ATTR_MASK_H UINT64_C(0xfff0000000000000) #define ATTR_MASK_L UINT64_C(0x0000000000000fff) #define ATTR_MASK (ATTR_MASK_H | ATTR_MASK_L) /* Bits 58:55 are reserved for software */ #define ATTR_SW_MANAGED (1UL << 56) #define ATTR_SW_WIRED (1UL << 55) #define ATTR_UXN (1UL << 54) #define ATTR_PXN (1UL << 53) #define ATTR_XN (ATTR_PXN | ATTR_UXN) #define ATTR_CONTIGUOUS (1UL << 52) #define ATTR_DBM (1UL << 51) #define ATTR_nG (1 << 11) #define ATTR_AF (1 << 10) #define ATTR_SH(x) ((x) << 8) #define ATTR_SH_MASK ATTR_SH(3) #define ATTR_SH_NS 0 /* Non-shareable */ #define ATTR_SH_OS 2 /* Outer-shareable */ #define ATTR_SH_IS 3 /* Inner-shareable */ #define ATTR_AP_RW_BIT (1 << 7) #define ATTR_AP(x) ((x) << 6) #define ATTR_AP_MASK ATTR_AP(3) #define ATTR_AP_RW (0 << 1) #define ATTR_AP_RO (1 << 1) #define ATTR_AP_USER (1 << 0) #define ATTR_NS (1 << 5) #define ATTR_IDX(x) ((x) << 2) #define ATTR_IDX_MASK (7 << 2) #define ATTR_DEFAULT (ATTR_AF | 
ATTR_SH(ATTR_SH_IS)) #define ATTR_DESCR_MASK 3 /* Level 0 table, 512GiB per entry */ #define L0_SHIFT 39 #define L0_SIZE (1ul << L0_SHIFT) #define L0_OFFSET (L0_SIZE - 1ul) #define L0_INVAL 0x0 /* An invalid address */ /* 0x1 Level 0 doesn't support block translation */ /* 0x2 also marks an invalid address */ #define L0_TABLE 0x3 /* A next-level table */ /* Level 1 table, 1GiB per entry */ #define L1_SHIFT 30 #define L1_SIZE (1 << L1_SHIFT) #define L1_OFFSET (L1_SIZE - 1) #define L1_INVAL L0_INVAL #define L1_BLOCK 0x1 #define L1_TABLE L0_TABLE /* Level 2 table, 2MiB per entry */ #define L2_SHIFT 21 #define L2_SIZE (1 << L2_SHIFT) #define L2_OFFSET (L2_SIZE - 1) #define L2_INVAL L1_INVAL #define L2_BLOCK L1_BLOCK #define L2_TABLE L1_TABLE #define L2_BLOCK_MASK UINT64_C(0xffffffe00000) /* Level 3 table, 4KiB per entry */ #define L3_SHIFT 12 #define L3_SIZE (1 << L3_SHIFT) #define L3_OFFSET (L3_SIZE - 1) #define L3_SHIFT 12 #define L3_INVAL 0x0 /* 0x1 is reserved */ /* 0x2 also marks an invalid address */ #define L3_PAGE 0x3 #define L0_ENTRIES_SHIFT 9 #define L0_ENTRIES (1 << L0_ENTRIES_SHIFT) #define L0_ADDR_MASK (L0_ENTRIES - 1) #define Ln_ENTRIES_SHIFT 9 #define Ln_ENTRIES (1 << Ln_ENTRIES_SHIFT) #define Ln_ADDR_MASK (Ln_ENTRIES - 1) #define Ln_TABLE_MASK ((1 << 12) - 1) +#define pmap_l0_index(va) (((va) >> L0_SHIFT) & L0_ADDR_MASK) +#define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) +#define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) +#define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) + #endif /* !_MACHINE_PTE_H_ */ /* End of pte.h */ Index: projects/runtime-coverage/sys/conf/files.arm64 =================================================================== --- projects/runtime-coverage/sys/conf/files.arm64 (revision 324497) +++ projects/runtime-coverage/sys/conf/files.arm64 (revision 324498) @@ -1,214 +1,215 @@ # $FreeBSD$ cloudabi64_vdso.o optional compat_cloudabi64 \ dependency "$S/contrib/cloudabi/cloudabi_vdso_aarch64.S" \ 
compile-with "${CC} -x assembler-with-cpp -shared -nostdinc -nostdlib -Wl,-T$S/compat/cloudabi/cloudabi_vdso.lds $S/contrib/cloudabi/cloudabi_vdso_aarch64.S -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "cloudabi64_vdso.o" # cloudabi64_vdso_blob.o optional compat_cloudabi64 \ dependency "cloudabi64_vdso.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf64-littleaarch64 --binary-architecture aarch64 cloudabi64_vdso.o ${.TARGET}" \ no-implicit-rule \ clean "cloudabi64_vdso_blob.o" # arm/allwinner/a10_ehci.c optional ehci aw_ehci fdt arm/allwinner/a10_gpio.c optional gpio aw_gpio fdt arm/allwinner/a10_mmc.c optional mmc aw_mmc fdt arm/allwinner/a64/a64_padconf.c optional soc_allwinner_a64 fdt arm/allwinner/a64/a64_r_padconf.c optional soc_allwinner_a64 fdt arm/allwinner/aw_ccu.c optional aw_ccu fdt arm/allwinner/aw_nmi.c optional aw_nmi fdt \ compile-with "${NORMAL_C} -I$S/gnu/dts/include" arm/allwinner/aw_reset.c optional aw_ccu fdt arm/allwinner/aw_rsb.c optional aw_rsb fdt arm/allwinner/aw_rtc.c optional aw_rtc fdt arm/allwinner/aw_sid.c optional aw_sid fdt arm/allwinner/aw_thermal.c optional aw_thermal fdt arm/allwinner/aw_usbphy.c optional ehci aw_usbphy fdt arm/allwinner/aw_wdog.c optional aw_wdog fdt arm/allwinner/axp81x.c optional axp81x fdt arm/allwinner/clk/aw_ahbclk.c optional aw_ccu fdt arm/allwinner/clk/aw_apbclk.c optional aw_ccu fdt arm/allwinner/clk/aw_axiclk.c optional aw_ccu fdt arm/allwinner/clk/aw_cpuclk.c optional aw_ccu fdt arm/allwinner/clk/aw_gate.c optional aw_ccu fdt arm/allwinner/clk/aw_modclk.c optional aw_ccu fdt arm/allwinner/clk/aw_pll.c optional aw_ccu fdt \ compile-with "${NORMAL_C} -I$S/gnu/dts/include" arm/allwinner/clk/aw_thsclk.c optional aw_ccu fdt arm/allwinner/clk/aw_usbclk.c optional aw_ccu fdt arm/allwinner/clkng/aw_ccung.c optional aw_ccu fdt arm/allwinner/clkng/aw_clk_nkmp.c optional aw_ccu fdt arm/allwinner/clkng/aw_clk_nm.c optional aw_ccu fdt arm/allwinner/clkng/aw_clk_prediv_mux.c optional aw_ccu 
fdt arm/allwinner/clkng/ccu_a64.c optional aw_ccu fdt arm/allwinner/clkng/ccu_h3.c optional aw_ccu fdt arm/allwinner/clkng/ccu_sun8i_r.c optional aw_ccu fdt arm/allwinner/if_awg.c optional awg fdt arm/annapurna/alpine/alpine_ccu.c optional al_ccu fdt arm/annapurna/alpine/alpine_nb_service.c optional al_nb_service fdt arm/annapurna/alpine/alpine_pci.c optional al_pci fdt arm/annapurna/alpine/alpine_pci_msix.c optional al_pci fdt arm/annapurna/alpine/alpine_serdes.c optional al_serdes fdt \ no-depend \ compile-with "${CC} -c -o ${.TARGET} ${CFLAGS} -I$S/contrib/alpine-hal -I$S/contrib/alpine-hal/eth ${PROF} ${.IMPSRC}" arm/arm/generic_timer.c standard arm/arm/gic.c standard arm/arm/gic_fdt.c optional fdt arm/arm/pmu.c standard arm/broadcom/bcm2835/bcm2835_audio.c optional sound vchiq fdt \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" arm/broadcom/bcm2835/bcm2835_bsc.c optional bcm2835_bsc soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_cpufreq.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_dma.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_fbd.c optional vt soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_ft5406.c optional evdev bcm2835_ft5406 soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_gpio.c optional gpio soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_intr.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_mbox.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_rng.c optional random soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_sdhci.c optional sdhci soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_spi.c optional bcm2835_spi soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_vcio.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2835_wdog.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm2836.c optional soc_brcm_bcm2837 fdt arm/broadcom/bcm2835/bcm283x_dwc_fdt.c optional dwcotg fdt soc_brcm_bcm2837 arm/mv/armada38x/armada38x_rtc.c optional mv_rtc 
fdt arm64/acpica/acpi_machdep.c optional acpi arm64/acpica/OsdEnvironment.c optional acpi arm64/acpica/acpi_wakeup.c optional acpi arm64/acpica/pci_cfgreg.c optional acpi pci arm64/arm64/autoconf.c standard arm64/arm64/bus_machdep.c standard arm64/arm64/bus_space_asm.S standard arm64/arm64/busdma_bounce.c standard arm64/arm64/busdma_machdep.c standard arm64/arm64/bzero.S standard arm64/arm64/clock.c standard arm64/arm64/copyinout.S standard arm64/arm64/copystr.c standard arm64/arm64/cpufunc_asm.S standard arm64/arm64/db_disasm.c optional ddb arm64/arm64/db_interface.c optional ddb arm64/arm64/db_trace.c optional ddb arm64/arm64/debug_monitor.c optional ddb arm64/arm64/disassem.c optional ddb arm64/arm64/dump_machdep.c standard +arm64/arm64/efirt_machdep.c optional efirt arm64/arm64/elf_machdep.c standard arm64/arm64/exception.S standard arm64/arm64/gicv3_its.c optional intrng fdt arm64/arm64/gic_v3.c standard arm64/arm64/gic_v3_fdt.c optional fdt arm64/arm64/identcpu.c standard arm64/arm64/in_cksum.c optional inet | inet6 arm64/arm64/locore.S standard no-obj arm64/arm64/machdep.c standard arm64/arm64/mem.c standard arm64/arm64/memcpy.S standard arm64/arm64/memmove.S standard arm64/arm64/minidump_machdep.c standard arm64/arm64/mp_machdep.c optional smp arm64/arm64/nexus.c standard arm64/arm64/ofw_machdep.c optional fdt arm64/arm64/pmap.c standard arm64/arm64/stack_machdep.c optional ddb | stack arm64/arm64/support.S standard arm64/arm64/swtch.S standard arm64/arm64/sys_machdep.c standard arm64/arm64/trap.c standard arm64/arm64/uio_machdep.c standard arm64/arm64/uma_machdep.c standard arm64/arm64/undefined.c standard arm64/arm64/unwind.c optional ddb | kdtrace_hooks | stack arm64/arm64/vfp.c standard arm64/arm64/vm_machdep.c standard arm64/cavium/thunder_pcie_fdt.c optional soc_cavm_thunderx pci fdt arm64/cavium/thunder_pcie_pem.c optional soc_cavm_thunderx pci arm64/cavium/thunder_pcie_pem_fdt.c optional soc_cavm_thunderx pci fdt arm64/cavium/thunder_pcie_common.c 
optional soc_cavm_thunderx pci arm64/cloudabi64/cloudabi64_sysvec.c optional compat_cloudabi64 contrib/vchiq/interface/compat/vchi_bsd.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -Wno-unused -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_arm.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -Wno-unused -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_connected.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_core.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_kmod.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_shim.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" contrib/vchiq/interface/vchiq_arm/vchiq_util.c optional vchiq soc_brcm_bcm2837 \ compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" crypto/armv8/armv8_crypto.c optional armv8crypto armv8_crypto_wrap.o optional armv8crypto \ dependency "$S/crypto/armv8/armv8_crypto_wrap.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc:N-mgeneral-regs-only} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -march=armv8-a+crypto ${.IMPSRC}" \ no-implicit-rule \ clean "armv8_crypto_wrap.o" 
crypto/blowfish/bf_enc.c optional crypto | ipsec | ipsec_support crypto/des/des_enc.c optional crypto | ipsec | ipsec_support | netsmb dev/acpica/acpi_if.m optional acpi dev/ahci/ahci_generic.c optional ahci dev/axgbe/if_axgbe.c optional axgbe dev/axgbe/xgbe-desc.c optional axgbe dev/axgbe/xgbe-dev.c optional axgbe dev/axgbe/xgbe-drv.c optional axgbe dev/axgbe/xgbe-mdio.c optional axgbe dev/cpufreq/cpufreq_dt.c optional cpufreq fdt dev/iicbus/twsi/a10_twsi.c optional twsi fdt dev/iicbus/twsi/twsi.c optional twsi fdt dev/hwpmc/hwpmc_arm64.c optional hwpmc dev/hwpmc/hwpmc_arm64_md.c optional hwpmc dev/mbox/mbox_if.m optional soc_brcm_bcm2837 dev/mmc/host/dwmmc.c optional dwmmc fdt dev/mmc/host/dwmmc_hisi.c optional dwmmc fdt soc_hisi_hi6220 dev/neta/if_mvneta_fdt.c optional neta fdt dev/neta/if_mvneta.c optional neta mdio mii dev/ofw/ofw_cpu.c optional fdt dev/ofw/ofwpci.c optional fdt pci dev/pci/pci_host_generic.c optional pci dev/pci/pci_host_generic_fdt.c optional pci fdt dev/psci/psci.c optional psci dev/psci/psci_arm64.S optional psci dev/uart/uart_cpu_arm64.c optional uart dev/uart/uart_dev_pl011.c optional uart pl011 dev/usb/controller/dwc_otg_hisi.c optional dwcotg fdt soc_hisi_hi6220 dev/usb/controller/ehci_mv.c optional ehci_mv fdt dev/usb/controller/generic_ehci.c optional ehci acpi dev/usb/controller/generic_ohci.c optional ohci fdt dev/usb/controller/generic_usb_if.m optional ohci fdt dev/usb/controller/xhci_mv.c optional xhci_mv fdt dev/vnic/mrml_bridge.c optional vnic fdt dev/vnic/nic_main.c optional vnic pci dev/vnic/nicvf_main.c optional vnic pci pci_iov dev/vnic/nicvf_queues.c optional vnic pci pci_iov dev/vnic/thunder_bgx_fdt.c optional vnic fdt dev/vnic/thunder_bgx.c optional vnic pci dev/vnic/thunder_mdio_fdt.c optional vnic fdt dev/vnic/thunder_mdio.c optional vnic dev/vnic/lmac_if.m optional inet | inet6 | vnic kern/kern_clocksource.c standard kern/msi_if.m optional intrng kern/pic_if.m optional intrng kern/subr_devmap.c standard 
kern/subr_intr.c optional intrng libkern/bcmp.c standard libkern/ffs.c standard libkern/ffsl.c standard libkern/ffsll.c standard libkern/fls.c standard libkern/flsl.c standard libkern/flsll.c standard libkern/memset.c standard libkern/arm64/crc32c_armv8.S standard cddl/contrib/opensolaris/common/atomic/aarch64/opensolaris_atomic.S optional zfs | dtrace compile-with "${CDDL_C}" cddl/dev/dtrace/aarch64/dtrace_asm.S optional dtrace compile-with "${DTRACE_S}" cddl/dev/dtrace/aarch64/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}" cddl/dev/fbt/aarch64/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${FBT_C}" Index: projects/runtime-coverage/sys/conf/options.arm64 =================================================================== --- projects/runtime-coverage/sys/conf/options.arm64 (revision 324497) +++ projects/runtime-coverage/sys/conf/options.arm64 (revision 324498) @@ -1,17 +1,21 @@ # $FreeBSD$ ARM64 opt_global.h INTRNG opt_global.h SOCDEV_PA opt_global.h SOCDEV_VA opt_global.h THUNDERX_PASS_1_1_ERRATA opt_global.h VFP opt_global.h +# EFI Runtime services support +EFIRT opt_efirt.h + +# Devices DEV_PSCI opt_platform.h # SoC Support SOC_ALLWINNER_A64 opt_soc.h SOC_ALLWINNER_H5 opt_soc.h SOC_BRCM_BCM2837 opt_soc.h SOC_CAVM_THUNDERX opt_soc.h SOC_HISI_HI6220 opt_soc.h Index: projects/runtime-coverage/sys/contrib/rdma/krping/krping.c =================================================================== --- projects/runtime-coverage/sys/contrib/rdma/krping/krping.c (revision 324497) +++ projects/runtime-coverage/sys/contrib/rdma/krping/krping.c (revision 324498) @@ -1,3348 +1,3425 @@ /* * Copyright (c) 2005 Ammasso, Inc. All rights reserved. * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "krping.h" #include "getopt.h" extern int krping_debug; #define DEBUG_LOG(cb, x...) if (krping_debug) log(LOG_INFO, x) #define PRINTF(cb, x...) 
log(LOG_INFO, x) #define BIND_INFO 1 MODULE_AUTHOR("Steve Wise"); MODULE_DESCRIPTION("RDMA ping client/server"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(krping, 1); MODULE_DEPEND(krping, linuxkpi, 1, 1, 1); static __inline uint64_t get_cycles(void) { uint32_t low, high; __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); return (low | ((u_int64_t)high << 32)); } typedef uint64_t cycles_t; enum mem_type { DMA = 1, FASTREG = 2, MW = 3, MR = 4 }; static const struct krping_option krping_opts[] = { {"count", OPT_INT, 'C'}, {"size", OPT_INT, 'S'}, {"addr", OPT_STRING, 'a'}, + {"addr6", OPT_STRING, 'A'}, {"port", OPT_INT, 'p'}, {"verbose", OPT_NOPARAM, 'v'}, {"validate", OPT_NOPARAM, 'V'}, {"server", OPT_NOPARAM, 's'}, {"client", OPT_NOPARAM, 'c'}, {"mem_mode", OPT_STRING, 'm'}, {"server_inv", OPT_NOPARAM, 'I'}, {"wlat", OPT_NOPARAM, 'l'}, {"rlat", OPT_NOPARAM, 'L'}, {"bw", OPT_NOPARAM, 'B'}, {"duplex", OPT_NOPARAM, 'd'}, {"txdepth", OPT_INT, 'T'}, {"poll", OPT_NOPARAM, 'P'}, {"local_dma_lkey", OPT_NOPARAM, 'Z'}, {"read_inv", OPT_NOPARAM, 'R'}, {"fr", OPT_INT, 'f'}, {NULL, 0, 0} }; #define htonll(x) cpu_to_be64((x)) #define ntohll(x) cpu_to_be64((x)) static struct mutex krping_mutex; /* * List of running krping threads. */ static LIST_HEAD(krping_cbs); /* * krping "ping/pong" loop: * client sends source rkey/addr/len * server receives source rkey/add/len * server rdma reads "ping" data from source * server sends "go ahead" on rdma read completion * client sends sink rkey/addr/len * server receives sink rkey/addr/len * server rdma writes "pong" data to sink * server sends "go ahead" on rdma write completion * */ /* * These states are used to signal events between the completion handler * and the main client or server thread. * * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV, * and RDMA_WRITE_COMPLETE for each ping. 
*/ enum test_state { IDLE = 1, CONNECT_REQUEST, ADDR_RESOLVED, ROUTE_RESOLVED, CONNECTED, RDMA_READ_ADV, RDMA_READ_COMPLETE, RDMA_WRITE_ADV, RDMA_WRITE_COMPLETE, ERROR }; struct krping_rdma_info { uint64_t buf; uint32_t rkey; uint32_t size; }; /* * Default max buffer size for IO... */ #define RPING_BUFSIZE 128*1024 #define RPING_SQ_DEPTH 64 /* * Control block struct. */ struct krping_cb { void *cookie; int server; /* 0 iff client */ struct ib_cq *cq; struct ib_pd *pd; struct ib_qp *qp; enum mem_type mem; struct ib_mr *dma_mr; struct ib_fast_reg_page_list *page_list; int page_list_len; struct ib_send_wr fastreg_wr; struct ib_send_wr invalidate_wr; struct ib_mr *fastreg_mr; int server_invalidate; int read_inv; u8 key; struct ib_mw *mw; struct ib_mw_bind bind_attr; struct ib_recv_wr rq_wr; /* recv work request record */ struct ib_sge recv_sgl; /* recv single SGE */ struct krping_rdma_info recv_buf;/* malloc'd buffer */ u64 recv_dma_addr; DECLARE_PCI_UNMAP_ADDR(recv_mapping) struct ib_mr *recv_mr; struct ib_send_wr sq_wr; /* send work requrest record */ struct ib_sge send_sgl; struct krping_rdma_info send_buf;/* single send buf */ u64 send_dma_addr; DECLARE_PCI_UNMAP_ADDR(send_mapping) struct ib_mr *send_mr; struct ib_send_wr rdma_sq_wr; /* rdma work request record */ struct ib_sge rdma_sgl; /* rdma single SGE */ char *rdma_buf; /* used as rdma sink */ u64 rdma_dma_addr; DECLARE_PCI_UNMAP_ADDR(rdma_mapping) struct ib_mr *rdma_mr; uint32_t remote_rkey; /* remote guys RKEY */ uint64_t remote_addr; /* remote guys TO */ uint32_t remote_len; /* remote guys LEN */ char *start_buf; /* rdma read src */ u64 start_dma_addr; DECLARE_PCI_UNMAP_ADDR(start_mapping) struct ib_mr *start_mr; enum test_state state; /* used for cond/signalling */ wait_queue_head_t sem; struct krping_stats stats; uint16_t port; /* dst port in NBO */ - struct in_addr addr; /* dst addr in NBO */ + union { + struct in_addr v4; + struct in6_addr v6; + } addr; /* dst addr in NBO */ + int addr_type; /* AF_INET 
or AF_INET6 */ char *addr_str; /* dst addr string */ int verbose; /* verbose logging */ int count; /* ping count */ int size; /* ping data size */ int validate; /* validate ping data */ int wlat; /* run wlat test */ int rlat; /* run rlat test */ int bw; /* run bw test */ int duplex; /* run bw full duplex test */ int poll; /* poll or block for rlat test */ int txdepth; /* SQ depth */ int local_dma_lkey; /* use 0 for lkey */ int frtest; /* fastreg test */ int testnum; /* CM stuff */ struct rdma_cm_id *cm_id; /* connection on client side,*/ /* listener on server side. */ struct rdma_cm_id *child_cm_id; /* connection on server side */ struct list_head list; }; static int krping_cma_event_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { int ret; struct krping_cb *cb = cma_id->context; DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event, cma_id, (cma_id == cb->cm_id) ? "parent" : "child"); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: cb->state = ADDR_RESOLVED; ret = rdma_resolve_route(cma_id, 2000); if (ret) { PRINTF(cb, "rdma_resolve_route error %d\n", ret); wake_up_interruptible(&cb->sem); } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: cb->state = ROUTE_RESOLVED; cb->child_cm_id = cma_id; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_CONNECT_REQUEST: if (cb->state == IDLE) { cb->state = CONNECT_REQUEST; cb->child_cm_id = cma_id; } else { PRINTF(cb, "Received connection request in wrong state" " (%d)\n", cb->state); } DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id); wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ESTABLISHED: DEBUG_LOG(cb, "ESTABLISHED\n"); if (!cb->server) { cb->state = CONNECTED; } wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: PRINTF(cb, "cma event %d, error %d\n", event->event, event->status); cb->state = ERROR; 
wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DISCONNECTED: PRINTF(cb, "DISCONNECT EVENT...\n"); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: PRINTF(cb, "cma detected device removal!!!!\n"); break; default: PRINTF(cb, "oof bad type!\n"); wake_up_interruptible(&cb->sem); break; } return 0; } static int server_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } cb->remote_rkey = ntohl(cb->recv_buf.rkey); cb->remote_addr = ntohll(cb->recv_buf.buf); cb->remote_len = ntohl(cb->recv_buf.size); DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n", cb->remote_rkey, (unsigned long long)cb->remote_addr, cb->remote_len); if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) cb->state = RDMA_READ_ADV; else cb->state = RDMA_WRITE_ADV; return 0; } static int client_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } if (cb->state == RDMA_READ_ADV) cb->state = RDMA_WRITE_ADV; else cb->state = RDMA_WRITE_COMPLETE; return 0; } static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) { struct krping_cb *cb = ctx; struct ib_wc wc; struct ib_recv_wr *bad_wr; int ret; BUG_ON(cb->cq != cq); if (cb->state == ERROR) { PRINTF(cb, "cq completion in ERROR state\n"); return; } if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { if (wc.status) { if (wc.status == IB_WC_WR_FLUSH_ERR) { DEBUG_LOG(cb, "cq flushed\n"); continue; } else { PRINTF(cb, "cq completion failed with " "wr_id %jx status %d opcode %d vender_err %x\n", (uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err); goto error; } } switch (wc.opcode) { case IB_WC_SEND: DEBUG_LOG(cb, "send completion\n"); cb->stats.send_bytes += 
cb->send_sgl.length; cb->stats.send_msgs++; break; case IB_WC_RDMA_WRITE: DEBUG_LOG(cb, "rdma write completion\n"); cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.write_msgs++; cb->state = RDMA_WRITE_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RDMA_READ: DEBUG_LOG(cb, "rdma read completion\n"); cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.read_msgs++; cb->state = RDMA_READ_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RECV: DEBUG_LOG(cb, "recv completion\n"); cb->stats.recv_bytes += sizeof(cb->recv_buf); cb->stats.recv_msgs++; if (cb->wlat || cb->rlat || cb->bw || cb->frtest) ret = server_recv(cb, &wc); else ret = cb->server ? server_recv(cb, &wc) : client_recv(cb, &wc); if (ret) { PRINTF(cb, "recv wc error: %d\n", ret); goto error; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "post recv error: %d\n", ret); goto error; } wake_up_interruptible(&cb->sem); break; default: PRINTF(cb, "%s:%d Unexpected opcode %d, Shutting down\n", __func__, __LINE__, wc.opcode); goto error; } } if (ret) { PRINTF(cb, "poll error %d\n", ret); goto error; } return; error: cb->state = ERROR; wake_up_interruptible(&cb->sem); } static int krping_accept(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; DEBUG_LOG(cb, "accepting client connection request\n"); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; ret = rdma_accept(cb->child_cm_id, &conn_param); if (ret) { PRINTF(cb, "rdma_accept error: %d\n", ret); return ret; } if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); return -1; } } return 0; } static void krping_setup_wr(struct krping_cb *cb) { cb->recv_sgl.addr = cb->recv_dma_addr; cb->recv_sgl.length = sizeof cb->recv_buf; if (cb->local_dma_lkey) cb->recv_sgl.lkey = 
cb->qp->device->local_dma_lkey; else if (cb->mem == DMA) cb->recv_sgl.lkey = cb->dma_mr->lkey; else cb->recv_sgl.lkey = cb->recv_mr->lkey; cb->rq_wr.sg_list = &cb->recv_sgl; cb->rq_wr.num_sge = 1; cb->send_sgl.addr = cb->send_dma_addr; cb->send_sgl.length = sizeof cb->send_buf; if (cb->local_dma_lkey) cb->send_sgl.lkey = cb->qp->device->local_dma_lkey; else if (cb->mem == DMA) cb->send_sgl.lkey = cb->dma_mr->lkey; else cb->send_sgl.lkey = cb->send_mr->lkey; cb->sq_wr.opcode = IB_WR_SEND; cb->sq_wr.send_flags = IB_SEND_SIGNALED; cb->sq_wr.sg_list = &cb->send_sgl; cb->sq_wr.num_sge = 1; if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { cb->rdma_sgl.addr = cb->rdma_dma_addr; if (cb->mem == MR) cb->rdma_sgl.lkey = cb->rdma_mr->lkey; cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; cb->rdma_sq_wr.num_sge = 1; } switch(cb->mem) { case FASTREG: /* * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR. * both unsignaled. The client uses them to reregister * the rdma buffers with a new key each iteration. 
*/ cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR; cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; cb->fastreg_wr.wr.fast_reg.length = cb->size; cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list; cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len; cb->invalidate_wr.next = &cb->fastreg_wr; cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; break; case MW: cb->bind_attr.wr_id = 0xabbaabba; cb->bind_attr.send_flags = 0; /* unsignaled */ #ifdef BIND_INFO cb->bind_attr.bind_info.length = cb->size; #else cb->bind_attr.length = cb->size; #endif break; default: break; } } static int krping_setup_buffers(struct krping_cb *cb) { int ret; struct ib_phys_buf buf; u64 iovbase; DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb); cb->recv_dma_addr = ib_dma_map_single(cb->pd->device, &cb->recv_buf, sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr); cb->send_dma_addr = ib_dma_map_single(cb->pd->device, &cb->send_buf, sizeof(cb->send_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr); if (cb->mem == DMA) { cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE); if (IS_ERR(cb->dma_mr)) { DEBUG_LOG(cb, "reg_dmamr failed\n"); ret = PTR_ERR(cb->dma_mr); goto bail; } } else { if (!cb->local_dma_lkey) { buf.addr = cb->recv_dma_addr; buf.size = sizeof cb->recv_buf; DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->recv_dma_addr; cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_LOCAL_WRITE, &iovbase); if (IS_ERR(cb->recv_mr)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->recv_mr); goto bail; } buf.addr = cb->send_dma_addr; buf.size = sizeof cb->send_buf; DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->send_dma_addr; cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 0, &iovbase); if (IS_ERR(cb->send_mr)) { DEBUG_LOG(cb, "send_buf 
reg_mr failed\n"); ret = PTR_ERR(cb->send_mr); goto bail; } } } cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->rdma_buf) { DEBUG_LOG(cb, "rdma_buf malloc failed\n"); ret = -ENOMEM; goto bail; } cb->rdma_dma_addr = ib_dma_map_single(cb->pd->device, cb->rdma_buf, cb->size, DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr); if (cb->mem != DMA) { switch (cb->mem) { case FASTREG: cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; cb->page_list = ib_alloc_fast_reg_page_list( cb->pd->device, cb->page_list_len); if (IS_ERR(cb->page_list)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->page_list); goto bail; } cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, cb->page_list->max_page_list_len); if (IS_ERR(cb->fastreg_mr)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->fastreg_mr); goto bail; } DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p" " page_list_len %u\n", cb->fastreg_mr->rkey, cb->page_list, cb->page_list_len); break; case MW: cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1); if (IS_ERR(cb->mw)) { DEBUG_LOG(cb, "recv_buf alloc_mw failed\n"); ret = PTR_ERR(cb->mw); goto bail; } DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey); /*FALLTHROUGH*/ case MR: buf.addr = cb->rdma_dma_addr; buf.size = cb->size; iovbase = cb->rdma_dma_addr; cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE, &iovbase); if (IS_ERR(cb->rdma_mr)) { DEBUG_LOG(cb, "rdma_buf reg_mr failed\n"); ret = PTR_ERR(cb->rdma_mr); goto bail; } DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n", (uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey); break; default: ret = -EINVAL; goto bail; break; } } if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { cb->start_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->start_buf) { DEBUG_LOG(cb, "start_buf malloc failed\n"); ret = -ENOMEM; goto bail; } cb->start_dma_addr = ib_dma_map_single(cb->pd->device, 
cb->start_buf, cb->size, DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr); if (cb->mem == MR || cb->mem == MW) { unsigned flags = IB_ACCESS_REMOTE_READ; if (cb->wlat || cb->rlat || cb->bw || cb->frtest) { flags |= IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; } buf.addr = cb->start_dma_addr; buf.size = cb->size; DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->start_dma_addr; cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, flags, &iovbase); if (IS_ERR(cb->start_mr)) { DEBUG_LOG(cb, "start_buf reg_mr failed\n"); ret = PTR_ERR(cb->start_mr); goto bail; } } } krping_setup_wr(cb); DEBUG_LOG(cb, "allocated & registered buffers...\n"); return 0; bail: if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr)) ib_dereg_mr(cb->fastreg_mr); if (cb->mw && !IS_ERR(cb->mw)) ib_dealloc_mw(cb->mw); if (cb->rdma_mr && !IS_ERR(cb->rdma_mr)) ib_dereg_mr(cb->rdma_mr); if (cb->page_list && !IS_ERR(cb->page_list)) ib_free_fast_reg_page_list(cb->page_list); if (cb->dma_mr && !IS_ERR(cb->dma_mr)) ib_dereg_mr(cb->dma_mr); if (cb->recv_mr && !IS_ERR(cb->recv_mr)) ib_dereg_mr(cb->recv_mr); if (cb->send_mr && !IS_ERR(cb->send_mr)) ib_dereg_mr(cb->send_mr); if (cb->rdma_buf) kfree(cb->rdma_buf); if (cb->start_buf) kfree(cb->start_buf); return ret; } static void krping_free_buffers(struct krping_cb *cb) { DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb); if (cb->dma_mr) ib_dereg_mr(cb->dma_mr); if (cb->send_mr) ib_dereg_mr(cb->send_mr); if (cb->recv_mr) ib_dereg_mr(cb->recv_mr); if (cb->rdma_mr) ib_dereg_mr(cb->rdma_mr); if (cb->start_mr) ib_dereg_mr(cb->start_mr); if (cb->fastreg_mr) ib_dereg_mr(cb->fastreg_mr); if (cb->mw) ib_dealloc_mw(cb->mw); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, recv_mapping), sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, send_mapping), sizeof(cb->send_buf), DMA_BIDIRECTIONAL); 
dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, rdma_mapping), cb->size, DMA_BIDIRECTIONAL); kfree(cb->rdma_buf); if (cb->start_buf) { dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, start_mapping), cb->size, DMA_BIDIRECTIONAL); kfree(cb->start_buf); } } static int krping_create_qp(struct krping_cb *cb) { struct ib_qp_init_attr init_attr; int ret; memset(&init_attr, 0, sizeof(init_attr)); init_attr.cap.max_send_wr = cb->txdepth; init_attr.cap.max_recv_wr = 2; init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = cb->cq; init_attr.recv_cq = cb->cq; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; if (cb->server) { ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->child_cm_id->qp; } else { ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->cm_id->qp; } return ret; } static void krping_free_qp(struct krping_cb *cb) { ib_destroy_qp(cb->qp); ib_destroy_cq(cb->cq); ib_dealloc_pd(cb->pd); } static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id) { int ret; cb->pd = ib_alloc_pd(cm_id->device); if (IS_ERR(cb->pd)) { PRINTF(cb, "ib_alloc_pd failed\n"); return PTR_ERR(cb->pd); } DEBUG_LOG(cb, "created pd %p\n", cb->pd); strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name)); cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, cb, cb->txdepth * 2, 0); if (IS_ERR(cb->cq)) { PRINTF(cb, "ib_create_cq failed\n"); ret = PTR_ERR(cb->cq); goto err1; } DEBUG_LOG(cb, "created cq %p\n", cb->cq); if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); if (ret) { PRINTF(cb, "ib_create_cq failed\n"); goto err2; } } ret = krping_create_qp(cb); if (ret) { PRINTF(cb, "krping_create_qp failed: %d\n", ret); goto err2; } DEBUG_LOG(cb, "created qp %p\n", cb->qp); return 0; err2: ib_destroy_cq(cb->cq); err1: ib_dealloc_pd(cb->pd); return ret; } /* * return the 
(possibly rebound) rkey for the rdma buffer. * FASTREG mode: invalidate and rebind via fastreg wr. * MW mode: rebind the MW. * other modes: just return the mr rkey. */ static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv) { u32 rkey = 0xffffffff; u64 p; struct ib_send_wr *bad_wr; int i; int ret; switch (cb->mem) { case FASTREG: cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey; /* * Update the fastreg key. */ ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key); cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey; /* * Update the fastreg WR with new buf info. */ if (buf == (u64)cb->start_dma_addr) cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ; else cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; cb->fastreg_wr.wr.fast_reg.iova_start = buf; p = (u64)(buf & PAGE_MASK); for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; i++, p += PAGE_SIZE) { cb->page_list->page_list[i] = p; DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p); } DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u" " iova_start %jx page_list_len %u\n", post_inv, cb->fastreg_wr.wr.fast_reg.rkey, cb->fastreg_wr.wr.fast_reg.page_shift, (unsigned)cb->fastreg_wr.wr.fast_reg.length, (uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start, cb->fastreg_wr.wr.fast_reg.page_list_len); if (post_inv) ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr); else ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); cb->state = ERROR; } rkey = cb->fastreg_mr->rkey; break; case MW: /* * Update the MW with new buf info. 
*/ if (buf == (u64)cb->start_dma_addr) { #ifdef BIND_INFO cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ; cb->bind_attr.bind_info.mr = cb->start_mr; #else cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ; cb->bind_attr.mr = cb->start_mr; #endif } else { #ifdef BIND_INFO cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE; cb->bind_attr.bind_info.mr = cb->rdma_mr; #else cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE; cb->bind_attr.mr = cb->rdma_mr; #endif } #ifdef BIND_INFO cb->bind_attr.bind_info.addr = buf; #else cb->bind_attr.addr = buf; #endif DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n", #ifdef BIND_INFO cb->mw->rkey, (uintmax_t)buf, cb->bind_attr.bind_info.mr->rkey); #else cb->mw->rkey, buf, cb->bind_attr.mr->rkey); #endif ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr); if (ret) { PRINTF(cb, "bind mw error %d\n", ret); cb->state = ERROR; } else rkey = cb->mw->rkey; break; case MR: if (buf == (u64)cb->start_dma_addr) rkey = cb->start_mr->rkey; else rkey = cb->rdma_mr->rkey; break; case DMA: rkey = cb->dma_mr->rkey; break; default: PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__); cb->state = ERROR; break; } return rkey; } static void krping_format_send(struct krping_cb *cb, u64 buf) { struct krping_rdma_info *info = &cb->send_buf; u32 rkey; /* * Client side will do fastreg or mw bind before * advertising the rdma buffer. Server side * sends have no data. 
*/ if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate); info->buf = htonll(buf); info->rkey = htonl(rkey); info->size = htonl(cb->size); DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n", (unsigned long long)buf, rkey, cb->size); } } static void krping_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr, inv; int ret; while (1) { /* Wait for client's Start STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV); if (cb->state != RDMA_READ_ADV) { PRINTF(cb, "wait for RDMA_READ_ADV state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received sink adv\n"); cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->remote_len; cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1); /* Issue RDMA Read. */ if (cb->read_inv) cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV; else { cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; if (cb->mem == FASTREG) { /* * Immediately follow the read with a * fenced LOCAL_INV. 
*/ cb->rdma_sq_wr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.ex.invalidate_rkey = cb->fastreg_mr->rkey; inv.send_flags = IB_SEND_FENCE; } } ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } cb->rdma_sq_wr.next = NULL; DEBUG_LOG(cb, "server posted rdma read req \n"); /* Wait for read completion */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_COMPLETE); if (cb->state != RDMA_READ_COMPLETE) { PRINTF(cb, "wait for RDMA_READ_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received read complete\n"); /* Display data in recv buf */ if (cb->verbose) { if (strlen(cb->rdma_buf) > 128) { char msgbuf[128]; strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); PRINTF(cb, "server ping data stripped: %s\n", msgbuf); } else PRINTF(cb, "server ping data: %s\n", cb->rdma_buf); } /* Tell client to continue */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } DEBUG_LOG(cb, "server posted go ahead\n"); /* Wait for client's RDMA STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { PRINTF(cb, "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received sink adv\n"); /* RDMA Write echo data */ cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; if (cb->local_dma_lkey) cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey; else cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0); DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n", cb->rdma_sq_wr.sg_list->lkey, (unsigned 
long long)cb->rdma_sq_wr.sg_list->addr, cb->rdma_sq_wr.sg_list->length); ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for completion */ ret = wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { PRINTF(cb, "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG(cb, "server rdma write complete \n"); cb->state = CONNECTED; /* Tell client to begin again */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } DEBUG_LOG(cb, "server posted go ahead\n"); } } static void rlat_test(struct krping_cb *cb) { int scnt; int iters = cb->count; struct timeval start_tv, stop_tv; int ret; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; scnt = 0; cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; microtime(&start_tv); if (!cb->poll) { cb->state = RDMA_READ_ADV; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } while (scnt < iters) { cb->state = RDMA_READ_ADV; ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "Couldn't post send: ret=%d scnt %d\n", ret, scnt); return; } do { if (!cb->poll) { wait_event_interruptible(cb->sem, cb->state != RDMA_READ_ADV); if (cb->state == RDMA_READ_COMPLETE) { ne = 1; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } else { ne = -1; } } else ne = ib_poll_cq(cb->cq, 1, &wc); if (cb->state == ERROR) { PRINTF(cb, "state == ERROR...bailing scnt %d\n", scnt); return; } } while (ne == 0); if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (cb->poll && wc.status != IB_WC_SUCCESS) { PRINTF(cb, 
"Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } ++scnt; } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size); } static void wlat_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; volatile char *poll_buf = (char *) cb->start_buf; char *buf = (char *)cb->rdma_buf; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters || 
rcnt < iters) { /* Wait till buffer changes. */ if (rcnt < iters && !(scnt < 1 && !cb->server)) { ++rcnt; while (*poll_buf != (char)rcnt) { if (cb->state == ERROR) { PRINTF(cb, "state = ERROR, bailing\n"); return; } } } if (scnt < iters) { struct ib_send_wr *bad_wr; *buf = (char)scnt+1; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); scnt++; } if (ccnt < iters) { struct ib_wc wc; int ne; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ++ccnt; if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); PRINTF(cb, "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void 
bw_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters) { while (scnt < iters && scnt - ccnt < cb->txdepth) { struct ib_send_wr *bad_wr; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); ++scnt; } if (ccnt < iters) { int ne; struct ib_wc wc; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while 
(ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ccnt += 1; if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void krping_rlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_wlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin 
waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } wlat_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_bw_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } if (cb->duplex) bw_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static int fastreg_supported(struct krping_cb *cb, int server) { struct ib_device *dev = server?cb->child_cm_id->device: cb->cm_id->device; struct ib_device_attr attr; int ret; ret = ib_query_device(dev, &attr); if (ret) { PRINTF(cb, "ib_query_device failed ret %d\n", ret); return 0; } if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n", (unsigned long long)attr.device_cap_flags); return 0; } DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n", (uintmax_t)attr.device_cap_flags); return 1; } static int krping_bind_server(struct krping_cb *cb) { - struct 
sockaddr_in sin; + union { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } sin; int ret; memset(&sin, 0, sizeof(sin)); - sin.sin_len = sizeof sin; - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = cb->addr.s_addr; - sin.sin_port = cb->port; + switch (cb->addr_type) { + case AF_INET: + sin.v4.sin_len = sizeof sin.v4; + sin.v4.sin_family = AF_INET; + sin.v4.sin_addr = cb->addr.v4; + sin.v4.sin_port = cb->port; + break; + case AF_INET6: + sin.v6.sin6_len = sizeof sin.v6; + sin.v6.sin6_family = AF_INET6; + sin.v6.sin6_addr = cb->addr.v6; + sin.v6.sin6_port = cb->port; + break; + default: + return (-EINVAL); + } + ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); if (ret) { PRINTF(cb, "rdma_bind_addr error %d\n", ret); return ret; } DEBUG_LOG(cb, "rdma_bind_addr successful\n"); DEBUG_LOG(cb, "rdma_listen\n"); ret = rdma_listen(cb->cm_id, 3); if (ret) { PRINTF(cb, "rdma_listen failed: %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST); if (cb->state != CONNECT_REQUEST) { PRINTF(cb, "wait for CONNECT_REQUEST state %d\n", cb->state); return -1; } if (cb->mem == FASTREG && !fastreg_supported(cb, 1)) return -EINVAL; return 0; } /* * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads * complete. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. 
*/ static void krping_fr_test5(struct krping_cb *cb) { struct ib_fast_reg_page_list **pl; struct ib_send_wr *fr, *read, *bad; struct ib_wc wc; struct ib_sge *sgl; u8 key = 0; struct ib_mr **mr; u8 **buf; dma_addr_t *dma_addr; int i; int ret; int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; time_t start; int count = 0; int scnt; int depth = cb->txdepth >> 1; if (!depth) { PRINTF(cb, "txdepth must be > 1 for this test!\n"); return; } pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); read = kzalloc(sizeof *read * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth); buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) { PRINTF(cb, "kzalloc failed\n"); goto err1; } for (scnt = 0; scnt < depth; scnt++) { pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl[scnt])) { PRINTF(cb, "alloc_fr_page_list failed %ld\n", PTR_ERR(pl[scnt])); goto err2; } DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr[scnt])) { PRINTF(cb, "alloc_fr failed %ld\n", PTR_ERR(mr[scnt])); goto err2; } DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); ib_update_fast_reg_key(mr[scnt], ++key); buf[scnt] = kmalloc(cb->size, GFP_KERNEL); if 
(!buf[scnt]) { PRINTF(cb, "kmalloc failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); dma_addr[scnt] = ib_dma_map_single(cb->pd->device, buf[scnt], cb->size, DMA_BIDIRECTIONAL); if (dma_mapping_error(cb->pd->device->dma_device, dma_addr[scnt])) { PRINTF(cb, "dma_map failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); for (i=0; ipage_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); } sgl[scnt].lkey = mr[scnt]->rkey; sgl[scnt].length = cb->size; sgl[scnt].addr = (u64)buf[scnt]; DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n", __func__, scnt, sgl[scnt].lkey, sgl[scnt].length, (uintmax_t)sgl[scnt].addr); fr[scnt].opcode = IB_WR_FAST_REG_MR; fr[scnt].wr_id = scnt; fr[scnt].send_flags = 0; fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; fr[scnt].wr.fast_reg.length = cb->size; fr[scnt].wr.fast_reg.page_list = pl[scnt]; fr[scnt].wr.fast_reg.page_list_len = plen; fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; fr[scnt].next = &read[scnt]; read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV; read[scnt].wr_id = scnt; read[scnt].send_flags = IB_SEND_SIGNALED; read[scnt].wr.rdma.rkey = cb->remote_rkey; read[scnt].wr.rdma.remote_addr = cb->remote_addr; read[scnt].num_sge = 1; read[scnt].sg_list = &sgl[scnt]; ret = ib_post_send(cb->qp, &fr[scnt], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } start = time_uptime; DEBUG_LOG(cb, "%s starting IO.\n", __func__); while (!cb->count || cb->server || count < cb->count) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "%s pausing 1 tick! 
count %u\n", __func__, count); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, 1); if (cb->state == ERROR) break; start = time_uptime; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u wr_id %ju " "opcode %d\n", wc.status, (uintmax_t)wc.wr_id, wc.opcode); goto err2; } count++; if (count == cb->count) break; ib_update_fast_reg_key(mr[wc.wr_id], ++key); fr[wc.wr_id].wr.fast_reg.rkey = mr[wc.wr_id]->rkey; sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey; ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } DEBUG_LOG(cb, "%s done!\n", __func__); err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u " "opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mrs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (mr[scnt]) { ib_dereg_mr(mr[scnt]); DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); } } DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (buf[scnt]) { dma_unmap_single(cb->pd->device->dma_device, dma_addr[scnt], cb->size, DMA_BIDIRECTIONAL); kfree(buf[scnt]); DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); } } DEBUG_LOG(cb, "destroying fr page lists!\n"); for (scnt = 0; scnt < depth; scnt++) { if (pl[scnt]) { DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); ib_free_fast_reg_page_list(pl[scnt]); } } err1: if (pl) kfree(pl); if (mr) kfree(mr); if (fr) kfree(fr); if (read) kfree(read); if 
(sgl) kfree(sgl); if (buf) kfree(buf); if (dma_addr) kfree(dma_addr); } static void krping_fr_test_server(struct krping_cb *cb) { DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test5_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } if (cb->duplex) krping_fr_test5(cb); DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test5_client(struct krping_cb *cb) { struct ib_send_wr *bad; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to server */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, 
(uintmax_t)cb->remote_addr); return krping_fr_test5(cb); } /* * sq-depth worth of write + fastreg + inv, reposting them as the invs * complete. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. * If a count is given, then the last IO will have a bogus lkey in the * write work request. This reproduces a fw bug where the connection * will get stuck if a fastreg is processed while the ulptx is failing * the bad write. */ static void krping_fr_test6(struct krping_cb *cb) { struct ib_fast_reg_page_list **pl; struct ib_send_wr *fr, *write, *inv, *bad; struct ib_wc wc; struct ib_sge *sgl; u8 key = 0; struct ib_mr **mr; u8 **buf; dma_addr_t *dma_addr; int i; int ret; int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt; int depth = cb->txdepth / 3; if (!depth) { PRINTF(cb, "txdepth must be > 3 for this test!\n"); return; } pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); write = kzalloc(sizeof *write * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, write, sizeof *write * depth); inv = kzalloc(sizeof *inv * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth); buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) { PRINTF(cb, "kzalloc 
failed\n"); goto err1; } for (scnt = 0; scnt < depth; scnt++) { pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl[scnt])) { PRINTF(cb, "alloc_fr_page_list failed %ld\n", PTR_ERR(pl[scnt])); goto err2; } DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr[scnt])) { PRINTF(cb, "alloc_fr failed %ld\n", PTR_ERR(mr[scnt])); goto err2; } DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); ib_update_fast_reg_key(mr[scnt], ++key); buf[scnt] = kmalloc(cb->size, GFP_KERNEL); if (!buf[scnt]) { PRINTF(cb, "kmalloc failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); dma_addr[scnt] = ib_dma_map_single(cb->pd->device, buf[scnt], cb->size, DMA_BIDIRECTIONAL); if (dma_mapping_error(cb->pd->device->dma_device, dma_addr[scnt])) { PRINTF(cb, "dma_map failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); for (i=0; ipage_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); } write[scnt].opcode = IB_WR_RDMA_WRITE; write[scnt].wr_id = scnt; write[scnt].wr.rdma.rkey = cb->remote_rkey; write[scnt].wr.rdma.remote_addr = cb->remote_addr; write[scnt].num_sge = 1; write[scnt].sg_list = &cb->rdma_sgl; write[scnt].sg_list->length = cb->size; write[scnt].next = &fr[scnt]; fr[scnt].opcode = IB_WR_FAST_REG_MR; fr[scnt].wr_id = scnt; fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; fr[scnt].wr.fast_reg.length = cb->size; fr[scnt].wr.fast_reg.page_list = pl[scnt]; fr[scnt].wr.fast_reg.page_list_len = plen; fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; fr[scnt].next = &inv[scnt]; inv[scnt].opcode = IB_WR_LOCAL_INV; inv[scnt].send_flags = 
IB_SEND_SIGNALED; inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey; ret = ib_post_send(cb->qp, &write[scnt], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } start = time_uptime; DEBUG_LOG(cb, "%s starting IO.\n", __func__); while (!cb->count || cb->server || count < cb->count) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, count); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, 1); if (cb->state == ERROR) break; start = time_uptime; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u wr_id %ju " "opcode %d\n", wc.status, (uintmax_t)wc.wr_id, wc.opcode); goto err2; } count++; if (count == (cb->count -1)) cb->rdma_sgl.lkey = 0x00dead; if (count == cb->count) break; ib_update_fast_reg_key(mr[wc.wr_id], ++key); fr[wc.wr_id].wr.fast_reg.rkey = mr[wc.wr_id]->rkey; inv[wc.wr_id].ex.invalidate_rkey = mr[wc.wr_id]->rkey; ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } else if (krping_sigpending()){ PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } DEBUG_LOG(cb, "%s done!\n", __func__); err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u " "opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mrs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (mr[scnt]) { ib_dereg_mr(mr[scnt]); DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); } } DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (buf[scnt]) { dma_unmap_single(cb->pd->device->dma_device, 
dma_addr[scnt], cb->size, DMA_BIDIRECTIONAL); kfree(buf[scnt]); DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); } } DEBUG_LOG(cb, "destroying fr page lists!\n"); for (scnt = 0; scnt < depth; scnt++) { if (pl[scnt]) { DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); ib_free_fast_reg_page_list(pl[scnt]); } } err1: if (pl) kfree(pl); if (mr) kfree(mr); if (fr) kfree(fr); if (write) kfree(write); if (inv) kfree(inv); if (sgl) kfree(sgl); if (buf) kfree(buf); if (dma_addr) kfree(dma_addr); } static void krping_fr_test6_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } if (cb->duplex) krping_fr_test6(cb); DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test6_client(struct krping_cb *cb) { struct ib_send_wr *bad; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to server */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); 
return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); return krping_fr_test6(cb); } static void krping_run_server(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_server(cb); if (ret) return; ret = krping_setup_qp(cb, cb->child_cm_id); if (ret) { PRINTF(cb, "setup_qp failed: %d\n", ret); goto err0; } ret = krping_setup_buffers(cb); if (ret) { PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_accept(cb); if (ret) { PRINTF(cb, "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_server(cb); else if (cb->rlat) krping_rlat_test_server(cb); else if (cb->bw) krping_bw_test_server(cb); else if (cb->frtest) { switch (cb->testnum) { case 1: case 2: case 3: case 4: krping_fr_test_server(cb); break; case 5: krping_fr_test5_server(cb); break; case 6: krping_fr_test6_server(cb); break; default: PRINTF(cb, "unknown fr test %d\n", cb->testnum); goto err2; break; } } else krping_test_server(cb); rdma_disconnect(cb->child_cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); err0: rdma_destroy_id(cb->child_cm_id); } static void krping_test_client(struct krping_cb *cb) { int ping, start, cc, i, ret; struct ib_send_wr *bad_wr; unsigned char c; start = 65; for (ping = 0; !cb->count || ping < cb->count; ping++) { cb->state = RDMA_READ_ADV; /* Put some ascii text in the buffer. 
*/ cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); for (i = cc, c = start; i < cb->size; i++) { cb->start_buf[i] = c; c++; if (c > 122) c = 65; } start++; if (start > 122) start = 65; cb->start_buf[cb->size - 1] = 0; krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); break; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for server to ACK */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { PRINTF(cb, "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } krping_format_send(cb, cb->rdma_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for the server to say the RDMA Write is complete. */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { PRINTF(cb, "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } if (cb->validate) if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { PRINTF(cb, "data mismatch!\n"); break; } if (cb->verbose) { if (strlen(cb->rdma_buf) > 128) { char msgbuf[128]; strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); PRINTF(cb, "ping data stripped: %s\n", msgbuf); } else PRINTF(cb, "ping data: %s\n", cb->rdma_buf); } #ifdef SLOW_KRPING wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); #endif } } static void krping_rlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, 
"poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } #if 0 { int i; struct timeval start, stop; time_t sec; suseconds_t usec; unsigned long long elapsed; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = 0; cb->rdma_sq_wr.num_sge = 0; microtime(&start); for (i=0; i < 100000; i++) { if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send\n"); return; } do { ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } microtime(&stop); if (stop.tv_usec < start.tv_usec) { stop.tv_usec += 1000000; stop.tv_sec -= 1; } sec = stop.tv_sec - start.tv_sec; usec = stop.tv_usec - start.tv_usec; elapsed = sec * 1000000 + usec; PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed); } #endif rlat_test(cb); } static void krping_wlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin 
waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } wlat_test(cb); } static void krping_bw_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } bw_test(cb); } /* * fastreg 2 valid different mrs and verify the completions. */ static void krping_fr_test1(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, *bad; struct ib_wc wc; struct ib_mr *mr1, *mr2; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err1; } mr2 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr2)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err2; } for (i=0; ipage_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = 
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } fr.wr.fast_reg.rkey = mr2->rkey; DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 2); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr2!\n"); ib_dereg_mr(mr2); err2: DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); } /* * fastreg the same mr twice, 2nd one should produce error cqe. 
*/ static void krping_fr_test2(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, *bad; struct ib_wc wc; struct ib_mr *mr1; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err1; } for (i=0; ipage_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? 
"fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 2); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); } /* * fastreg pipelined in a loop as fast as we can until the user interrupts. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. */ static void krping_fr_test3(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, inv, *bad; struct ib_wc wc; u8 key = 0; struct ib_mr *mr; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err1; } for (i=0; ipage_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.send_flags = IB_SEND_SIGNALED; DEBUG_LOG(cb, "fr_test: stag 
index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth); start = time_uptime; while (1) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); if (cb->state == ERROR) break; start = time_uptime; } while (scnt < (cb->txdepth>>1)) { ib_update_fast_reg_key(mr, ++key); fr.wr.fast_reg.rkey = mr->rkey; inv.ex.invalidate_rkey = mr->rkey; size = arc4random() % cb->size; if (size == 0) size = cb->size; plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list_len = plen; ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } scnt+=2; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u\n", wc.status); goto err2; } count++; scnt--; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "fr_test: done!\n"); ib_dereg_mr(mr); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); } /* * fastreg 1 and invalidate 1 mr and verify completion. 
*/ static void krping_fr_test4(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, inv, *bad; struct ib_wc wc; struct ib_mr *mr1; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err1; } for (i=0; ipage_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; fr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.ex.invalidate_rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? 
"fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 1); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); } static void krping_fr_test(struct krping_cb *cb) { switch (cb->testnum) { case 1: krping_fr_test1(cb); break; case 2: krping_fr_test2(cb); break; case 3: krping_fr_test3(cb); break; case 4: krping_fr_test4(cb); break; case 5: krping_fr_test5_client(cb); break; case 6: krping_fr_test6_client(cb); break; default: PRINTF(cb, "Unkown frtest num %u\n", cb->testnum); break; } } static int krping_connect_client(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.retry_count = 10; ret = rdma_connect(cb->cm_id, &conn_param); if (ret) { PRINTF(cb, "rdma_connect error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); return -1; } DEBUG_LOG(cb, "rdma_connect successful\n"); return 0; } static int krping_bind_client(struct krping_cb *cb) { - struct sockaddr_in sin; + union { + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } sin; int ret; memset(&sin, 0, sizeof(sin)); - sin.sin_len = sizeof sin; - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = cb->addr.s_addr; - sin.sin_port = cb->port; + switch (cb->addr_type) 
{ + case AF_INET: + sin.v4.sin_len = sizeof sin.v4; + sin.v4.sin_family = AF_INET; + sin.v4.sin_addr = cb->addr.v4; + sin.v4.sin_port = cb->port; + break; + case AF_INET6: + sin.v6.sin6_len = sizeof sin.v6; + sin.v6.sin6_family = AF_INET6; + sin.v6.sin6_addr = cb->addr.v6; + sin.v6.sin6_port = cb->port; + break; + default: + return (-EINVAL); + } + ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, 2000); if (ret) { PRINTF(cb, "rdma_resolve_addr error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED); if (cb->state != ROUTE_RESOLVED) { PRINTF(cb, "addr/route resolution did not resolve: state %d\n", cb->state); return -EINTR; } if (cb->mem == FASTREG && !fastreg_supported(cb, 0)) return -EINVAL; DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n"); return 0; } static void krping_run_client(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_client(cb); if (ret) return; ret = krping_setup_qp(cb, cb->cm_id); if (ret) { PRINTF(cb, "setup_qp failed: %d\n", ret); return; } ret = krping_setup_buffers(cb); if (ret) { PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_connect_client(cb); if (ret) { PRINTF(cb, "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_client(cb); else if (cb->rlat) krping_rlat_test_client(cb); else if (cb->bw) krping_bw_test_client(cb); else if (cb->frtest) krping_fr_test(cb); else krping_test_client(cb); rdma_disconnect(cb->cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); } +static uint16_t +krping_get_ipv6_scope_id(char *name) +{ + struct ifnet *ifp; + uint16_t retval; + + if (name == NULL) + return (0); + ifp = ifunit_ref(name); + if (ifp == NULL) + return (0); + retval = ifp->if_index; + if_rele(ifp); + return (retval); +} + int krping_doit(char *cmd, void *cookie) { 
struct krping_cb *cb; int op; int ret = 0; char *optarg; + char *scope; unsigned long optint; cb = kzalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return -ENOMEM; mutex_lock(&krping_mutex); list_add_tail(&cb->list, &krping_cbs); mutex_unlock(&krping_mutex); cb->cookie = cookie; cb->server = -1; cb->state = IDLE; cb->size = 64; cb->txdepth = RPING_SQ_DEPTH; cb->mem = DMA; + cb->addr_type = AF_INET; init_waitqueue_head(&cb->sem); while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, &optint)) != 0) { switch (op) { case 'a': cb->addr_str = optarg; - DEBUG_LOG(cb, "ipaddr (%s)\n", optarg); - if (!inet_aton(optarg, &cb->addr)) { + cb->addr_type = AF_INET; + DEBUG_LOG(cb, "ipv4addr (%s)\n", optarg); + if (inet_pton(AF_INET, optarg, &cb->addr) != 1) { PRINTF(cb, "bad addr string %s\n", optarg); ret = EINVAL; + } + break; + case 'A': + cb->addr_str = optarg; + cb->addr_type = AF_INET6; + DEBUG_LOG(cb, "ipv6addr (%s)\n", optarg); + scope = strstr(optarg, "%"); + /* extract scope ID, if any */ + if (scope != NULL) + *scope++ = 0; + /* extract IPv6 network address */ + if (inet_pton(AF_INET6, optarg, &cb->addr) != 1) { + PRINTF(cb, "bad addr string %s\n", + optarg); + ret = EINVAL; + } else if (IN6_IS_SCOPE_LINKLOCAL(&cb->addr.v6) || + IN6_IS_ADDR_MC_INTFACELOCAL(&cb->addr.v6)) { + uint16_t scope_id = krping_get_ipv6_scope_id(scope); + DEBUG_LOG(cb, "ipv6 scope ID = %d\n", scope_id); + cb->addr.v6.s6_addr[2] = scope_id >> 8; + cb->addr.v6.s6_addr[3] = scope_id & 0xFF; } break; case 'p': cb->port = htons(optint); DEBUG_LOG(cb, "port %d\n", (int)optint); break; case 'P': cb->poll = 1; DEBUG_LOG(cb, "server\n"); break; case 's': cb->server = 1; DEBUG_LOG(cb, "server\n"); break; case 'c': cb->server = 0; DEBUG_LOG(cb, "client\n"); break; case 'S': cb->size = optint; if ((cb->size < 1) || (cb->size > RPING_BUFSIZE)) { PRINTF(cb, "Invalid size %d " "(valid range is 1 to %d)\n", cb->size, RPING_BUFSIZE); ret = EINVAL; } else DEBUG_LOG(cb, "size %d\n", (int)optint); 
break; case 'C': cb->count = optint; if (cb->count < 0) { PRINTF(cb, "Invalid count %d\n", cb->count); ret = EINVAL; } else DEBUG_LOG(cb, "count %d\n", (int) cb->count); break; case 'v': cb->verbose++; DEBUG_LOG(cb, "verbose\n"); break; case 'V': cb->validate++; DEBUG_LOG(cb, "validate data\n"); break; case 'l': cb->wlat++; break; case 'L': cb->rlat++; break; case 'B': cb->bw++; break; case 'd': cb->duplex++; break; case 'm': if (!strncmp(optarg, "dma", 3)) cb->mem = DMA; else if (!strncmp(optarg, "fastreg", 7)) cb->mem = FASTREG; else if (!strncmp(optarg, "mw", 2)) cb->mem = MW; else if (!strncmp(optarg, "mr", 2)) cb->mem = MR; else { PRINTF(cb, "unknown mem mode %s. " "Must be dma, fastreg, mw, or mr\n", optarg); ret = -EINVAL; break; } break; case 'I': cb->server_invalidate = 1; break; case 'T': cb->txdepth = optint; DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth); break; case 'Z': cb->local_dma_lkey = 1; DEBUG_LOG(cb, "using local dma lkey\n"); break; case 'R': cb->read_inv = 1; DEBUG_LOG(cb, "using read-with-inv\n"); break; case 'f': cb->frtest = 1; cb->testnum = optint; DEBUG_LOG(cb, "fast-reg test!\n"); break; default: PRINTF(cb, "unknown opt %s\n", optarg); ret = -EINVAL; break; } } if (ret) goto out; if (cb->server == -1) { PRINTF(cb, "must be either client or server\n"); ret = -EINVAL; goto out; } if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) { PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n"); ret = -EINVAL; goto out; } if (cb->server_invalidate && cb->mem != FASTREG) { PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n"); ret = -EINVAL; goto out; } if (cb->read_inv && cb->mem != FASTREG) { PRINTF(cb, "read_inv only valid with fastreg mem_mode\n"); ret = -EINVAL; goto out; } if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) { PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n"); ret = -EINVAL; goto out; } cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC); if 
(IS_ERR(cb->cm_id)) { ret = PTR_ERR(cb->cm_id); PRINTF(cb, "rdma_create_id error %d\n", ret); goto out; } DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id); if (cb->server) krping_run_server(cb); else krping_run_client(cb); DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id); rdma_destroy_id(cb->cm_id); out: mutex_lock(&krping_mutex); list_del(&cb->list); mutex_unlock(&krping_mutex); kfree(cb); return ret; } void krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg) { struct krping_cb *cb; mutex_lock(&krping_mutex); list_for_each_entry(cb, &krping_cbs, list) (*f)(cb->pd ? &cb->stats : NULL, arg); mutex_unlock(&krping_mutex); } void krping_init(void) { mutex_init(&krping_mutex); } Index: projects/runtime-coverage/sys/dev/hyperv/include/hyperv.h =================================================================== --- projects/runtime-coverage/sys/dev/hyperv/include/hyperv.h (revision 324497) +++ projects/runtime-coverage/sys/dev/hyperv/include/hyperv.h (revision 324498) @@ -1,96 +1,97 @@ /*- * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _HYPERV_H_ #define _HYPERV_H_ #ifdef _KERNEL #include #include #define MSR_HV_TIME_REF_COUNT 0x40000020 #define CPUID_HV_MSR_TIME_REFCNT 0x0002 /* MSR_HV_TIME_REF_COUNT */ #define CPUID_HV_MSR_SYNIC 0x0004 /* MSRs for SynIC */ #define CPUID_HV_MSR_SYNTIMER 0x0008 /* MSRs for SynTimer */ #define CPUID_HV_MSR_APIC 0x0010 /* MSR_HV_{EOI,ICR,TPR} */ #define CPUID_HV_MSR_HYPERCALL 0x0020 /* MSR_HV_GUEST_OS_ID * MSR_HV_HYPERCALL */ #define CPUID_HV_MSR_VP_INDEX 0x0040 /* MSR_HV_VP_INDEX */ #define CPUID_HV_MSR_REFERENCE_TSC 0x0200 /* MSR_HV_REFERENCE_TSC */ #define CPUID_HV_MSR_GUEST_IDLE 0x0400 /* MSR_HV_GUEST_IDLE */ #ifndef NANOSEC #define NANOSEC 1000000000ULL #endif #define HYPERV_TIMER_NS_FACTOR 100ULL #define HYPERV_TIMER_FREQ (NANOSEC / HYPERV_TIMER_NS_FACTOR) #endif /* _KERNEL */ #define HYPERV_REFTSC_DEVNAME "hv_tsc" /* * Hyper-V Reference TSC */ struct hyperv_reftsc { volatile uint32_t tsc_seq; volatile uint32_t tsc_rsvd1; volatile uint64_t tsc_scale; volatile int64_t tsc_ofs; } __packed __aligned(PAGE_SIZE); #ifdef CTASSERT CTASSERT(sizeof(struct hyperv_reftsc) == PAGE_SIZE); #endif #ifdef _KERNEL struct hyperv_guid { uint8_t hv_guid[16]; } __packed; #define HYPERV_GUID_STRLEN 40 typedef uint64_t (*hyperv_tc64_t)(void); int hyperv_guid2str(const struct hyperv_guid *, char *, size_t); /* * hyperv_tc64 could be NULL, if there were no suitable Hyper-V * specific timecounter. 
*/ extern hyperv_tc64_t hyperv_tc64; extern u_int hyperv_features; /* CPUID_HV_MSR_ */ +extern u_int hyperv_ver_major; #endif /* _KERNEL */ #endif /* _HYPERV_H_ */ Index: projects/runtime-coverage/sys/dev/hyperv/netvsc/hn_nvs.c =================================================================== --- projects/runtime-coverage/sys/dev/hyperv/netvsc/hn_nvs.c (revision 324497) +++ projects/runtime-coverage/sys/dev/hyperv/netvsc/hn_nvs.c (revision 324498) @@ -1,741 +1,746 @@ /*- * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Network Virtualization Service. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int hn_nvs_conn_chim(struct hn_softc *); static int hn_nvs_conn_rxbuf(struct hn_softc *); static void hn_nvs_disconn_chim(struct hn_softc *); static void hn_nvs_disconn_rxbuf(struct hn_softc *); static int hn_nvs_conf_ndis(struct hn_softc *, int); static int hn_nvs_init_ndis(struct hn_softc *); static int hn_nvs_doinit(struct hn_softc *, uint32_t); static int hn_nvs_init(struct hn_softc *); static const void *hn_nvs_xact_execute(struct hn_softc *, struct vmbus_xact *, void *, int, size_t *, uint32_t); static void hn_nvs_sent_none(struct hn_nvs_sendctx *, struct hn_softc *, struct vmbus_channel *, const void *, int); struct hn_nvs_sendctx hn_nvs_sendctx_none = HN_NVS_SENDCTX_INITIALIZER(hn_nvs_sent_none, NULL); static const uint32_t hn_nvs_version[] = { HN_NVS_VERSION_5, HN_NVS_VERSION_4, HN_NVS_VERSION_2, HN_NVS_VERSION_1 }; static const void * hn_nvs_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, void *req, int reqlen, size_t *resplen0, uint32_t type) { struct hn_nvs_sendctx sndc; size_t resplen, min_resplen = *resplen0; const struct hn_nvs_hdr *hdr; int error; KASSERT(min_resplen >= sizeof(*hdr), ("invalid minimum response len %zu", min_resplen)); /* * Execute the xact setup by the caller. */ hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact); vmbus_xact_activate(xact); error = hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_RC, req, reqlen, &sndc); if (error) { vmbus_xact_deactivate(xact); return (NULL); } hdr = vmbus_chan_xact_wait(sc->hn_prichan, xact, &resplen, HN_CAN_SLEEP(sc)); /* * Check this NVS response message. 
*/ if (resplen < min_resplen) { if_printf(sc->hn_ifp, "invalid NVS resp len %zu\n", resplen); return (NULL); } if (hdr->nvs_type != type) { if_printf(sc->hn_ifp, "unexpected NVS resp 0x%08x, " "expect 0x%08x\n", hdr->nvs_type, type); return (NULL); } /* All pass! */ *resplen0 = resplen; return (hdr); } static __inline int hn_nvs_req_send(struct hn_softc *sc, void *req, int reqlen) { return (hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_NONE, req, reqlen, &hn_nvs_sendctx_none)); } static int hn_nvs_conn_rxbuf(struct hn_softc *sc) { struct vmbus_xact *xact = NULL; struct hn_nvs_rxbuf_conn *conn; const struct hn_nvs_rxbuf_connresp *resp; size_t resp_len; uint32_t status; int error, rxbuf_size; /* * Limit RXBUF size for old NVS. */ if (sc->hn_nvs_ver <= HN_NVS_VERSION_2) rxbuf_size = HN_RXBUF_SIZE_COMPAT; else rxbuf_size = HN_RXBUF_SIZE; /* * Connect the RXBUF GPADL to the primary channel. * * NOTE: * Only primary channel has RXBUF connected to it. Sub-channels * just share this RXBUF. */ error = vmbus_chan_gpadl_connect(sc->hn_prichan, sc->hn_rxbuf_dma.hv_paddr, rxbuf_size, &sc->hn_rxbuf_gpadl); if (error) { if_printf(sc->hn_ifp, "rxbuf gpadl conn failed: %d\n", error); goto cleanup; } /* * Connect RXBUF to NVS. 
*/ xact = vmbus_xact_get(sc->hn_xact, sizeof(*conn)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs rxbuf conn\n"); error = ENXIO; goto cleanup; } conn = vmbus_xact_req_data(xact); conn->nvs_type = HN_NVS_TYPE_RXBUF_CONN; conn->nvs_gpadl = sc->hn_rxbuf_gpadl; conn->nvs_sig = HN_NVS_RXBUF_SIG; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, conn, sizeof(*conn), &resp_len, HN_NVS_TYPE_RXBUF_CONNRESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec nvs rxbuf conn failed\n"); error = EIO; goto cleanup; } status = resp->nvs_status; vmbus_xact_put(xact); xact = NULL; if (status != HN_NVS_STATUS_OK) { if_printf(sc->hn_ifp, "nvs rxbuf conn failed: %x\n", status); error = EIO; goto cleanup; } sc->hn_flags |= HN_FLAG_RXBUF_CONNECTED; return (0); cleanup: if (xact != NULL) vmbus_xact_put(xact); hn_nvs_disconn_rxbuf(sc); return (error); } static int hn_nvs_conn_chim(struct hn_softc *sc) { struct vmbus_xact *xact = NULL; struct hn_nvs_chim_conn *chim; const struct hn_nvs_chim_connresp *resp; size_t resp_len; uint32_t status, sectsz; int error; /* * Connect chimney sending buffer GPADL to the primary channel. * * NOTE: * Only primary channel has chimney sending buffer connected to it. * Sub-channels just share this chimney sending buffer. 
*/ error = vmbus_chan_gpadl_connect(sc->hn_prichan, sc->hn_chim_dma.hv_paddr, HN_CHIM_SIZE, &sc->hn_chim_gpadl); if (error) { if_printf(sc->hn_ifp, "chim gpadl conn failed: %d\n", error); goto cleanup; } /* * Connect chimney sending buffer to NVS */ xact = vmbus_xact_get(sc->hn_xact, sizeof(*chim)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs chim conn\n"); error = ENXIO; goto cleanup; } chim = vmbus_xact_req_data(xact); chim->nvs_type = HN_NVS_TYPE_CHIM_CONN; chim->nvs_gpadl = sc->hn_chim_gpadl; chim->nvs_sig = HN_NVS_CHIM_SIG; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, chim, sizeof(*chim), &resp_len, HN_NVS_TYPE_CHIM_CONNRESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec nvs chim conn failed\n"); error = EIO; goto cleanup; } status = resp->nvs_status; sectsz = resp->nvs_sectsz; vmbus_xact_put(xact); xact = NULL; if (status != HN_NVS_STATUS_OK) { if_printf(sc->hn_ifp, "nvs chim conn failed: %x\n", status); error = EIO; goto cleanup; } if (sectsz == 0 || sectsz % sizeof(uint32_t) != 0) { /* * Can't use chimney sending buffer; done! */ if (sectsz == 0) { if_printf(sc->hn_ifp, "zero chimney sending buffer " "section size\n"); } else { if_printf(sc->hn_ifp, "misaligned chimney sending " "buffers, section size: %u\n", sectsz); } sc->hn_chim_szmax = 0; sc->hn_chim_cnt = 0; sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; return (0); } sc->hn_chim_szmax = sectsz; sc->hn_chim_cnt = HN_CHIM_SIZE / sc->hn_chim_szmax; if (HN_CHIM_SIZE % sc->hn_chim_szmax != 0) { if_printf(sc->hn_ifp, "chimney sending sections are " "not properly aligned\n"); } if (sc->hn_chim_cnt % LONG_BIT != 0) { if_printf(sc->hn_ifp, "discard %d chimney sending sections\n", sc->hn_chim_cnt % LONG_BIT); } sc->hn_chim_bmap_cnt = sc->hn_chim_cnt / LONG_BIT; sc->hn_chim_bmap = malloc(sc->hn_chim_bmap_cnt * sizeof(u_long), M_DEVBUF, M_WAITOK | M_ZERO); /* Done! 
*/ sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; if (bootverbose) { if_printf(sc->hn_ifp, "chimney sending buffer %d/%d\n", sc->hn_chim_szmax, sc->hn_chim_cnt); } return (0); cleanup: if (xact != NULL) vmbus_xact_put(xact); hn_nvs_disconn_chim(sc); return (error); } static void hn_nvs_disconn_rxbuf(struct hn_softc *sc) { int error; if (sc->hn_flags & HN_FLAG_RXBUF_CONNECTED) { struct hn_nvs_rxbuf_disconn disconn; /* * Disconnect RXBUF from NVS. */ memset(&disconn, 0, sizeof(disconn)); disconn.nvs_type = HN_NVS_TYPE_RXBUF_DISCONN; disconn.nvs_sig = HN_NVS_RXBUF_SIG; /* NOTE: No response. */ error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); if (error) { if_printf(sc->hn_ifp, "send nvs rxbuf disconn failed: %d\n", error); /* * Fine for a revoked channel, since the hypervisor * does not drain TX bufring for a revoked channel. */ if (!vmbus_chan_is_revoked(sc->hn_prichan)) sc->hn_flags |= HN_FLAG_RXBUF_REF; } sc->hn_flags &= ~HN_FLAG_RXBUF_CONNECTED; /* * Wait for the hypervisor to receive this NVS request. * * NOTE: * The TX bufring will not be drained by the hypervisor, * if the primary channel is revoked. */ while (!vmbus_chan_tx_empty(sc->hn_prichan) && !vmbus_chan_is_revoked(sc->hn_prichan)) pause("waittx", 1); /* * Linger long enough for NVS to disconnect RXBUF. */ pause("lingtx", (200 * hz) / 1000); } if (sc->hn_rxbuf_gpadl != 0) { /* * Disconnect RXBUF from primary channel. */ error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, sc->hn_rxbuf_gpadl); if (error) { if_printf(sc->hn_ifp, "rxbuf gpadl disconn failed: %d\n", error); sc->hn_flags |= HN_FLAG_RXBUF_REF; } sc->hn_rxbuf_gpadl = 0; } } static void hn_nvs_disconn_chim(struct hn_softc *sc) { int error; if (sc->hn_flags & HN_FLAG_CHIM_CONNECTED) { struct hn_nvs_chim_disconn disconn; /* * Disconnect chimney sending buffer from NVS. */ memset(&disconn, 0, sizeof(disconn)); disconn.nvs_type = HN_NVS_TYPE_CHIM_DISCONN; disconn.nvs_sig = HN_NVS_CHIM_SIG; /* NOTE: No response. 
*/ error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); if (error) { if_printf(sc->hn_ifp, "send nvs chim disconn failed: %d\n", error); /* * Fine for a revoked channel, since the hypervisor * does not drain TX bufring for a revoked channel. */ if (!vmbus_chan_is_revoked(sc->hn_prichan)) sc->hn_flags |= HN_FLAG_CHIM_REF; } sc->hn_flags &= ~HN_FLAG_CHIM_CONNECTED; /* * Wait for the hypervisor to receive this NVS request. * * NOTE: * The TX bufring will not be drained by the hypervisor, * if the primary channel is revoked. */ while (!vmbus_chan_tx_empty(sc->hn_prichan) && !vmbus_chan_is_revoked(sc->hn_prichan)) pause("waittx", 1); /* * Linger long enough for NVS to disconnect chimney * sending buffer. */ pause("lingtx", (200 * hz) / 1000); } if (sc->hn_chim_gpadl != 0) { /* * Disconnect chimney sending buffer from primary channel. */ error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, sc->hn_chim_gpadl); if (error) { if_printf(sc->hn_ifp, "chim gpadl disconn failed: %d\n", error); sc->hn_flags |= HN_FLAG_CHIM_REF; } sc->hn_chim_gpadl = 0; } if (sc->hn_chim_bmap != NULL) { free(sc->hn_chim_bmap, M_DEVBUF); sc->hn_chim_bmap = NULL; sc->hn_chim_bmap_cnt = 0; } } static int hn_nvs_doinit(struct hn_softc *sc, uint32_t nvs_ver) { struct vmbus_xact *xact; struct hn_nvs_init *init; const struct hn_nvs_init_resp *resp; size_t resp_len; uint32_t status; xact = vmbus_xact_get(sc->hn_xact, sizeof(*init)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs init\n"); return (ENXIO); } init = vmbus_xact_req_data(xact); init->nvs_type = HN_NVS_TYPE_INIT; init->nvs_ver_min = nvs_ver; init->nvs_ver_max = nvs_ver; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, init, sizeof(*init), &resp_len, HN_NVS_TYPE_INIT_RESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec init failed\n"); vmbus_xact_put(xact); return (EIO); } status = resp->nvs_status; vmbus_xact_put(xact); if (status != HN_NVS_STATUS_OK) { if (bootverbose) { /* * Caller may try another NVS version, 
and will log * error if there are no more NVS versions to try, * so don't bark out loud here. */ if_printf(sc->hn_ifp, "nvs init failed for ver 0x%x\n", nvs_ver); } return (EINVAL); } return (0); } /* * Configure MTU and enable VLAN. */ static int hn_nvs_conf_ndis(struct hn_softc *sc, int mtu) { struct hn_nvs_ndis_conf conf; int error; memset(&conf, 0, sizeof(conf)); conf.nvs_type = HN_NVS_TYPE_NDIS_CONF; conf.nvs_mtu = mtu + ETHER_HDR_LEN; conf.nvs_caps = HN_NVS_NDIS_CONF_VLAN; if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) conf.nvs_caps |= HN_NVS_NDIS_CONF_SRIOV; /* NOTE: No response. */ error = hn_nvs_req_send(sc, &conf, sizeof(conf)); if (error) { if_printf(sc->hn_ifp, "send nvs ndis conf failed: %d\n", error); return (error); } if (bootverbose) if_printf(sc->hn_ifp, "nvs ndis conf done\n"); sc->hn_caps |= HN_CAP_MTU | HN_CAP_VLAN; return (0); } static int hn_nvs_init_ndis(struct hn_softc *sc) { struct hn_nvs_ndis_init ndis; int error; memset(&ndis, 0, sizeof(ndis)); ndis.nvs_type = HN_NVS_TYPE_NDIS_INIT; ndis.nvs_ndis_major = HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver); ndis.nvs_ndis_minor = HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver); /* NOTE: No response. */ error = hn_nvs_req_send(sc, &ndis, sizeof(ndis)); if (error) if_printf(sc->hn_ifp, "send nvs ndis init failed: %d\n", error); return (error); } static int hn_nvs_init(struct hn_softc *sc) { int i, error; if (device_is_attached(sc->hn_dev)) { /* * NVS version and NDIS version MUST NOT be changed. */ if (bootverbose) { if_printf(sc->hn_ifp, "reinit NVS version 0x%x, " "NDIS version %u.%u\n", sc->hn_nvs_ver, HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); } error = hn_nvs_doinit(sc, sc->hn_nvs_ver); if (error) { if_printf(sc->hn_ifp, "reinit NVS version 0x%x " "failed: %d\n", sc->hn_nvs_ver, error); return (error); } goto done; } /* * Find the supported NVS version and set NDIS version accordingly. 
*/ for (i = 0; i < nitems(hn_nvs_version); ++i) { error = hn_nvs_doinit(sc, hn_nvs_version[i]); if (!error) { sc->hn_nvs_ver = hn_nvs_version[i]; /* Set NDIS version according to NVS version. */ sc->hn_ndis_ver = HN_NDIS_VERSION_6_30; if (sc->hn_nvs_ver <= HN_NVS_VERSION_4) sc->hn_ndis_ver = HN_NDIS_VERSION_6_1; if (bootverbose) { if_printf(sc->hn_ifp, "NVS version 0x%x, " "NDIS version %u.%u\n", sc->hn_nvs_ver, HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); } goto done; } } if_printf(sc->hn_ifp, "no NVS available\n"); return (ENXIO); done: if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) sc->hn_caps |= HN_CAP_HASHVAL; return (0); } int hn_nvs_attach(struct hn_softc *sc, int mtu) { int error; + if (hyperv_ver_major >= 10) { + /* UDP 4-tuple hash is enforced. */ + sc->hn_caps |= HN_CAP_UDPHASH; + } + /* * Initialize NVS. */ error = hn_nvs_init(sc); if (error) return (error); if (sc->hn_nvs_ver >= HN_NVS_VERSION_2) { /* * Configure NDIS before initializing it. */ error = hn_nvs_conf_ndis(sc, mtu); if (error) return (error); } /* * Initialize NDIS. */ error = hn_nvs_init_ndis(sc); if (error) return (error); /* * Connect RXBUF. */ error = hn_nvs_conn_rxbuf(sc); if (error) return (error); /* * Connect chimney sending buffer. */ error = hn_nvs_conn_chim(sc); if (error) { hn_nvs_disconn_rxbuf(sc); return (error); } return (0); } void hn_nvs_detach(struct hn_softc *sc) { /* NOTE: there are no requests to stop the NVS. 
*/ hn_nvs_disconn_rxbuf(sc); hn_nvs_disconn_chim(sc); } void hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc, struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, const void *data, int dlen) { vmbus_xact_wakeup(sndc->hn_cbarg, data, dlen); } static void hn_nvs_sent_none(struct hn_nvs_sendctx *sndc __unused, struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, const void *data __unused, int dlen __unused) { /* EMPTY */ } int hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch0) { struct vmbus_xact *xact; struct hn_nvs_subch_req *req; const struct hn_nvs_subch_resp *resp; int error, nsubch_req; uint32_t nsubch; size_t resp_len; nsubch_req = *nsubch0; KASSERT(nsubch_req > 0, ("invalid # of sub-channels %d", nsubch_req)); xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs subch alloc\n"); return (ENXIO); } req = vmbus_xact_req_data(xact); req->nvs_type = HN_NVS_TYPE_SUBCH_REQ; req->nvs_op = HN_NVS_SUBCH_OP_ALLOC; req->nvs_nsubch = nsubch_req; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, req, sizeof(*req), &resp_len, HN_NVS_TYPE_SUBCH_RESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec nvs subch alloc failed\n"); error = EIO; goto done; } if (resp->nvs_status != HN_NVS_STATUS_OK) { if_printf(sc->hn_ifp, "nvs subch alloc failed: %x\n", resp->nvs_status); error = EIO; goto done; } nsubch = resp->nvs_nsubch; if (nsubch > nsubch_req) { if_printf(sc->hn_ifp, "%u subchans are allocated, " "requested %d\n", nsubch, nsubch_req); nsubch = nsubch_req; } *nsubch0 = nsubch; error = 0; done: vmbus_xact_put(xact); return (error); } int hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) { return hn_nvs_send_rndis_sglist(chan, HN_NVS_RNDIS_MTYPE_CTRL, sndc, gpa, gpa_cnt); } void hn_nvs_set_datapath(struct hn_softc *sc, uint32_t path) { struct hn_nvs_datapath dp; memset(&dp, 0, sizeof(dp)); dp.nvs_type = 
HN_NVS_TYPE_SET_DATAPATH; dp.nvs_active_path = path; hn_nvs_req_send(sc, &dp, sizeof(dp)); } Index: projects/runtime-coverage/sys/dev/hyperv/netvsc/if_hn.c =================================================================== --- projects/runtime-coverage/sys/dev/hyperv/netvsc/if_hn.c (revision 324497) +++ projects/runtime-coverage/sys/dev/hyperv/netvsc/if_hn.c (revision 324498) @@ -1,7470 +1,7523 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_hn.h" #include "opt_inet6.h" #include "opt_inet.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmbus_if.h" #define HN_IFSTART_SUPPORT #define HN_RING_CNT_DEF_MAX 8 #define HN_VFMAP_SIZE_DEF 8 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ /* YYY should get it from the underlying channel */ #define HN_TX_DESC_CNT 512 #define HN_RNDIS_PKT_LEN \ (sizeof(struct rndis_packet_msg) + \ HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE #define HN_TX_DATA_BOUNDARY PAGE_SIZE #define HN_TX_DATA_MAXSIZE IP_MAXPACKET #define HN_TX_DATA_SEGSIZE PAGE_SIZE /* -1 for RNDIS packet message */ #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) #define HN_DIRECT_TX_SIZE_DEF 128 #define HN_EARLY_TXEOF_THRESH 8 #define HN_PKTBUF_LEN_DEF (16 * 1024) #define HN_LROENT_CNT_DEF 128 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) #define HN_LRO_ACKCNT_DEF 1 #define HN_LOCK_INIT(sc) \ sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) #define HN_LOCK(sc) \ do { \ while (sx_try_xlock(&(sc)->hn_lock) == 0) \ DELAY(1000); \ } while (0) #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) #define HN_CSUM_IP_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) #define HN_CSUM_IP6_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) #define HN_PKTSIZE_MIN(align) \ roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ HN_RNDIS_PKT_LEN, (align)) #define HN_PKTSIZE(m, align) \ roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) #ifdef RSS #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) #else #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) #endif struct hn_txdesc { #ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; #endif STAILQ_ENTRY(hn_txdesc) agg_link; /* Aggregated txdescs, in sending order. */ STAILQ_HEAD(, hn_txdesc) agg_list; /* The oldest packet, if transmission aggregation happens. 
*/ struct mbuf *m; struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ struct hn_nvs_sendctx send_ctx; uint32_t chim_index; int chim_size; bus_dmamap_t data_dmap; bus_addr_t rndis_pkt_paddr; struct rndis_packet_msg *rndis_pkt; bus_dmamap_t rndis_pkt_dmap; }; #define HN_TXD_FLAG_ONLIST 0x0001 #define HN_TXD_FLAG_DMAMAP 0x0002 #define HN_TXD_FLAG_ONAGG 0x0004 struct hn_rxinfo { uint32_t vlan_info; uint32_t csum_info; uint32_t hash_info; uint32_t hash_value; }; struct hn_rxvf_setarg { struct hn_rx_ring *rxr; struct ifnet *vf_ifp; }; #define HN_RXINFO_VLAN 0x0001 #define HN_RXINFO_CSUM 0x0002 #define HN_RXINFO_HASHINF 0x0004 #define HN_RXINFO_HASHVAL 0x0008 #define HN_RXINFO_ALL \ (HN_RXINFO_VLAN | \ HN_RXINFO_CSUM | \ HN_RXINFO_HASHINF | \ HN_RXINFO_HASHVAL) #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff #define HN_NDIS_RXCSUM_INFO_INVALID 0 #define HN_NDIS_HASH_INFO_INVALID 0 static int hn_probe(device_t); static int hn_attach(device_t); static int hn_detach(device_t); static int hn_shutdown(device_t); static void hn_chan_callback(struct vmbus_channel *, void *); static void hn_init(void *); static int hn_ioctl(struct ifnet *, u_long, caddr_t); #ifdef HN_IFSTART_SUPPORT static void hn_start(struct ifnet *); #endif static int hn_transmit(struct ifnet *, struct mbuf *); static void hn_xmit_qflush(struct ifnet *); static int hn_ifmedia_upd(struct ifnet *); static void hn_ifmedia_sts(struct ifnet *, struct ifmediareq *); static void hn_ifnet_event(void *, struct ifnet *, int); static void hn_ifaddr_event(void *, struct ifnet *); static void hn_ifnet_attevent(void *, struct ifnet *); static void hn_ifnet_detevent(void *, struct ifnet *); static void hn_ifnet_lnkevent(void *, struct ifnet *, int); static bool hn_ismyvf(const struct hn_softc *, const struct ifnet *); static void hn_rxvf_change(struct hn_softc *, struct ifnet *, bool); static void hn_rxvf_set(struct hn_softc *, struct ifnet *); static void hn_rxvf_set_task(void *, int); static void 
hn_xpnt_vf_input(struct ifnet *, struct mbuf *); static int hn_xpnt_vf_iocsetflags(struct hn_softc *); static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, struct ifreq *); static void hn_xpnt_vf_saveifflags(struct hn_softc *); static bool hn_xpnt_vf_isready(struct hn_softc *); static void hn_xpnt_vf_setready(struct hn_softc *); static void hn_xpnt_vf_init_taskfunc(void *, int); static void hn_xpnt_vf_init(struct hn_softc *); static void hn_xpnt_vf_setenable(struct hn_softc *); static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); static void hn_vf_rss_fixup(struct hn_softc *, bool); static void hn_vf_rss_restore(struct hn_softc *); static int hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *); static void hn_rndis_rx_data(struct hn_rx_ring *, const void *, int); static void hn_rndis_rx_status(struct hn_softc *, const void *, int); static void hn_rndis_init_fixat(struct hn_softc *, int); static void hn_nvs_handle_notify(struct hn_softc *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_handle_comp(struct hn_softc *, struct vmbus_channel *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, struct vmbus_channel *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *, uint64_t); #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); #if __FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); #else static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); static int 
hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); #ifndef RSS static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); static void hn_stop(struct hn_softc *, bool); static void hn_init_locked(struct hn_softc *); static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *); static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *); static int hn_attach_subchans(struct hn_softc *); static void hn_detach_allchans(struct hn_softc *); static void hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *); static void hn_set_ring_inuse(struct hn_softc *, int); static int hn_synth_attach(struct hn_softc *, int); static void hn_synth_detach(struct hn_softc *); static int hn_synth_alloc_subchans(struct hn_softc *, int *); static bool hn_synth_attachable(const struct hn_softc *); static void hn_suspend(struct hn_softc *); static void hn_suspend_data(struct hn_softc *); static void hn_suspend_mgmt(struct hn_softc *); static void hn_resume(struct hn_softc *); static void hn_resume_data(struct hn_softc *); static void hn_resume_mgmt(struct hn_softc *); static void hn_suspend_mgmt_taskfunc(void *, int); static void hn_chan_drain(struct hn_softc 
*, struct vmbus_channel *); static void hn_disable_rx(struct hn_softc *); static void hn_drain_rxtx(struct hn_softc *, int); static void hn_polling(struct hn_softc *, u_int); static void hn_chan_polling(struct vmbus_channel *, u_int); static void hn_mtu_change_fixup(struct hn_softc *); static void hn_update_link_status(struct hn_softc *); static void hn_change_network(struct hn_softc *); static void hn_link_taskfunc(void *, int); static void hn_netchg_init_taskfunc(void *, int); static void hn_netchg_status_taskfunc(void *, int); static void hn_link_status(struct hn_softc *); static int hn_create_rx_data(struct hn_softc *, int); static void hn_destroy_rx_data(struct hn_softc *); static int hn_check_iplen(const struct mbuf *, int); +static void hn_rxpkt_proto(const struct mbuf *, int *, int *); static int hn_set_rxfilter(struct hn_softc *, uint32_t); static int hn_rxfilter_config(struct hn_softc *); static int hn_rss_reconfig(struct hn_softc *); static void hn_rss_ind_fixup(struct hn_softc *); static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); static int hn_rxpkt(struct hn_rx_ring *, const void *, int, const struct hn_rxinfo *); static uint32_t hn_rss_type_fromndis(uint32_t); static uint32_t hn_rss_type_tondis(uint32_t); static int hn_tx_ring_create(struct hn_softc *, int); static void hn_tx_ring_destroy(struct hn_tx_ring *); static int hn_create_tx_data(struct hn_softc *, int); static void hn_fixup_tx_data(struct hn_softc *); +static void hn_fixup_rx_data(struct hn_softc *); static void hn_destroy_tx_data(struct hn_softc *); static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); static void hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *); static int hn_encap(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *); static void hn_set_chim_size(struct hn_softc *, int); static void hn_set_tso_maxsize(struct hn_softc *, int, int); static bool 
hn_tx_ring_pending(struct hn_tx_ring *); static void hn_tx_ring_qflush(struct hn_tx_ring *); static void hn_resume_tx(struct hn_softc *, int); static void hn_set_txagg(struct hn_softc *); static void *hn_try_txagg(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *, int); static int hn_get_txswq_depth(const struct hn_tx_ring *); static void hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *, struct vmbus_channel *, const void *, int); static int hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *); static int hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *); static int hn_xmit(struct hn_tx_ring *, int); static void hn_xmit_taskfunc(void *, int); static void hn_xmit_txeof(struct hn_tx_ring *); static void hn_xmit_txeof_taskfunc(void *, int); #ifdef HN_IFSTART_SUPPORT static int hn_start_locked(struct hn_tx_ring *, int); static void hn_start_taskfunc(void *, int); static void hn_start_txeof(struct hn_tx_ring *); static void hn_start_txeof_taskfunc(void *, int); #endif SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V network interface"); /* Trust tcp segements verification on host side. */ static int hn_trust_hosttcp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, &hn_trust_hosttcp, 0, "Trust tcp segement verification on host side, " "when csum info is missing (global setting)"); /* Trust udp datagrams verification on host side. */ static int hn_trust_hostudp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, &hn_trust_hostudp, 0, "Trust udp datagram verification on host side, " "when csum info is missing (global setting)"); /* Trust ip packets verification on host side. */ static int hn_trust_hostip = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, &hn_trust_hostip, 0, "Trust ip packet verification on host side, " "when csum info is missing (global setting)"); /* * Offload UDP/IPv4 checksum. 
*/ static int hn_enable_udp4cs = 1; SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); /* * Offload UDP/IPv6 checksum. */ static int hn_enable_udp6cs = 1; SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); /* Stats. */ static counter_u64_t hn_udpcs_fixup; SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, &hn_udpcs_fixup, "# of UDP checksum fixup"); /* * See hn_set_hlen(). * * This value is for Azure. For Hyper-V, set this above * 65536 to disable UDP datagram checksum fixup. */ static int hn_udpcs_fixup_mtu = 1420; SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); /* Limit TSO burst size */ static int hn_tso_maxlen = IP_MAXPACKET; SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, &hn_tso_maxlen, 0, "TSO burst limit"); /* Limit chimney send size */ static int hn_tx_chimney_size = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, &hn_tx_chimney_size, 0, "Chimney send packet size limit"); /* Limit the size of packet for direct transmission */ static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); /* # of LRO entries per RX ring */ #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, &hn_lro_entry_count, 0, "LRO entry count"); #endif #endif static int hn_tx_taskq_cnt = 1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); #define HN_TX_TASKQ_M_INDEP 0 #define HN_TX_TASKQ_M_GLOBAL 1 #define HN_TX_TASKQ_M_EVTTQ 2 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, &hn_tx_taskq_mode, 0, "TX 
taskqueue modes: " "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); #ifndef HN_USE_TXDESC_BUFRING static int hn_use_txdesc_bufring = 0; #else static int hn_use_txdesc_bufring = 1; #endif SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); #ifdef HN_IFSTART_SUPPORT /* Use ifnet.if_start instead of ifnet.if_transmit */ static int hn_use_if_start = 0; SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, &hn_use_if_start, 0, "Use if_start TX method"); #endif /* # of channels to use */ static int hn_chan_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, &hn_chan_cnt, 0, "# of channels to use; each channel has one RX ring and one TX ring"); /* # of transmit rings to use */ static int hn_tx_ring_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, &hn_tx_ring_cnt, 0, "# of TX rings to use"); /* Software TX ring deptch */ static int hn_tx_swq_depth = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ #if __FreeBSD_version >= 1100095 static u_int hn_lro_mbufq_depth = 0; SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); #endif /* Packet transmission aggregation size limit */ static int hn_tx_agg_size = -1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); /* Packet transmission aggregation count limit */ static int hn_tx_agg_pkts = -1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); /* VF list */ SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING, 0, 0, hn_vflist_sysctl, "A", "VF list"); /* VF mapping */ SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING, 0, 0, hn_vfmap_sysctl, "A", 
"VF mapping"); /* Transparent VF */ static int hn_xpnt_vf = 0; SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, &hn_xpnt_vf, 0, "Transparent VF mod"); /* Accurate BPF support for Transparent VF */ static int hn_xpnt_vf_accbpf = 0; SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); /* Extra wait for transparent VF attach routing; unit seconds. */ static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, &hn_xpnt_vf_attwait, 0, "Extra wait for transparent VF attach routing; unit: seconds"); static u_int hn_cpu_index; /* next CPU for channel */ static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ static struct rmlock hn_vfmap_lock; static int hn_vfmap_size; static struct ifnet **hn_vfmap; #ifndef RSS static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa }; #endif /* !RSS */ static const struct hyperv_guid hn_guid = { .hv_guid = { 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } }; static device_method_t hn_methods[] = { /* Device interface */ DEVMETHOD(device_probe, hn_probe), DEVMETHOD(device_attach, hn_attach), DEVMETHOD(device_detach, hn_detach), DEVMETHOD(device_shutdown, hn_shutdown), DEVMETHOD_END }; static driver_t hn_driver = { "hn", hn_methods, sizeof(struct hn_softc) }; static devclass_t hn_devclass; DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0); MODULE_VERSION(hn, 1); MODULE_DEPEND(hn, vmbus, 1, 1, 1); #if __FreeBSD_version >= 1100099 static void hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) { int i; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; } 
#endif static int hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0, ("invalid rndis sglist txd")); return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); } static int hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) { struct hn_nvs_rndis rndis; KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && txd->chim_size > 0, ("invalid rndis chim txd")); rndis.nvs_type = HN_NVS_TYPE_RNDIS; rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; rndis.nvs_chim_idx = txd->chim_index; rndis.nvs_chim_sz = txd->chim_size; return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, &rndis, sizeof(rndis), &txd->send_ctx)); } static __inline uint32_t hn_chim_alloc(struct hn_softc *sc) { int i, bmap_cnt = sc->hn_chim_bmap_cnt; u_long *bmap = sc->hn_chim_bmap; uint32_t ret = HN_NVS_CHIM_IDX_INVALID; for (i = 0; i < bmap_cnt; ++i) { int idx; idx = ffsl(~bmap[i]); if (idx == 0) continue; --idx; /* ffsl is 1-based */ KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, ("invalid i %d and idx %d", i, idx)); if (atomic_testandset_long(&bmap[i], idx)) continue; ret = i * LONG_BIT + idx; break; } return (ret); } static __inline void hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) { u_long mask; uint32_t idx; idx = chim_idx / LONG_BIT; KASSERT(idx < sc->hn_chim_bmap_cnt, ("invalid chimney index 0x%x", chim_idx)); mask = 1UL << (chim_idx % LONG_BIT); KASSERT(sc->hn_chim_bmap[idx] & mask, ("index bitmap 0x%lx, chimney index %u, " "bitmap idx %d, bitmask 0x%lx", sc->hn_chim_bmap[idx], chim_idx, idx, mask)); atomic_clear_long(&sc->hn_chim_bmap[idx], mask); } #if defined(INET6) || defined(INET) #define PULLUP_HDR(m, len) \ do { \ if (__predict_false((m)->m_len < (len))) { \ (m) = m_pullup((m), (len)); \ if ((m) == NULL) \ return (NULL); \ } \ } while (0) /* * NOTE: If this function failed, the m_head would be freed. 
*/ static __inline struct mbuf * hn_tso_fixup(struct mbuf *m_head) { struct ether_vlan_header *evl; struct tcphdr *th; int ehlen; KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); PULLUP_HDR(m_head, sizeof(*evl)); evl = mtod(m_head, struct ether_vlan_header *); if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehlen = ETHER_HDR_LEN; m_head->m_pkthdr.l2hlen = ehlen; #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip; int iphlen; PULLUP_HDR(m_head, ehlen + sizeof(*ip)); ip = mtodo(m_head, ehlen); iphlen = ip->ip_hl << 2; m_head->m_pkthdr.l3hlen = iphlen; PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); th = mtodo(m_head, ehlen + iphlen); ip->ip_len = 0; ip->ip_sum = 0; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { struct ip6_hdr *ip6; PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); ip6 = mtodo(m_head, ehlen); if (ip6->ip6_nxt != IPPROTO_TCP) { m_freem(m_head); return (NULL); } m_head->m_pkthdr.l3hlen = sizeof(*ip6); PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); th = mtodo(m_head, ehlen + sizeof(*ip6)); ip6->ip6_plen = 0; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); } #endif return (m_head); } /* * NOTE: If this function failed, the m_head would be freed. 
*/ static __inline struct mbuf * hn_set_hlen(struct mbuf *m_head) { const struct ether_vlan_header *evl; int ehlen; PULLUP_HDR(m_head, sizeof(*evl)); evl = mtod(m_head, const struct ether_vlan_header *); if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehlen = ETHER_HDR_LEN; m_head->m_pkthdr.l2hlen = ehlen; #ifdef INET if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { const struct ip *ip; int iphlen; PULLUP_HDR(m_head, ehlen + sizeof(*ip)); ip = mtodo(m_head, ehlen); iphlen = ip->ip_hl << 2; m_head->m_pkthdr.l3hlen = iphlen; /* * UDP checksum offload does not work in Azure, if the * following conditions meet: * - sizeof(IP hdr + UDP hdr + payload) > 1420. * - IP_DF is not set in the IP hdr. * * Fallback to software checksum for these UDP datagrams. */ if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && (ntohs(ip->ip_off) & IP_DF) == 0) { uint16_t off = ehlen + iphlen; counter_u64_add(hn_udpcs_fixup, 1); PULLUP_HDR(m_head, off + sizeof(struct udphdr)); *(uint16_t *)(m_head->m_data + off + m_head->m_pkthdr.csum_data) = in_cksum_skip( m_head, m_head->m_pkthdr.len, off); m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { const struct ip6_hdr *ip6; PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); ip6 = mtodo(m_head, ehlen); if (ip6->ip6_nxt != IPPROTO_TCP) { m_freem(m_head); return (NULL); } m_head->m_pkthdr.l3hlen = sizeof(*ip6); } #endif return (m_head); } /* * NOTE: If this function failed, the m_head would be freed. 
*/ static __inline struct mbuf * hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) { const struct tcphdr *th; int ehlen, iphlen; *tcpsyn = 0; ehlen = m_head->m_pkthdr.l2hlen; iphlen = m_head->m_pkthdr.l3hlen; PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); th = mtodo(m_head, ehlen + iphlen); if (th->th_flags & TH_SYN) *tcpsyn = 1; return (m_head); } #undef PULLUP_HDR #endif /* INET6 || INET */ static int hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) { int error = 0; HN_LOCK_ASSERT(sc); if (sc->hn_rx_filter != filter) { error = hn_rndis_set_rxfilter(sc, filter); if (!error) sc->hn_rx_filter = filter; } return (error); } static int hn_rxfilter_config(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; uint32_t filter; HN_LOCK_ASSERT(sc); /* * If the non-transparent mode VF is activated, we don't know how * its RX filter is configured, so stick the synthetic device in * the promiscous mode. */ if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { filter = NDIS_PACKET_TYPE_PROMISCUOUS; } else { filter = NDIS_PACKET_TYPE_DIRECTED; if (ifp->if_flags & IFF_BROADCAST) filter |= NDIS_PACKET_TYPE_BROADCAST; /* TODO: support multicast list */ if ((ifp->if_flags & IFF_ALLMULTI) || !TAILQ_EMPTY(&ifp->if_multiaddrs)) filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; } return (hn_set_rxfilter(sc, filter)); } static void hn_set_txagg(struct hn_softc *sc) { uint32_t size, pkts; int i; /* * Setup aggregation size. */ if (sc->hn_agg_size < 0) size = UINT32_MAX; else size = sc->hn_agg_size; if (sc->hn_rndis_agg_size < size) size = sc->hn_rndis_agg_size; /* NOTE: We only aggregate packets using chimney sending buffers. */ if (size > (uint32_t)sc->hn_chim_szmax) size = sc->hn_chim_szmax; if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { /* Disable */ size = 0; pkts = 0; goto done; } /* NOTE: Type of the per TX ring setting is 'int'. */ if (size > INT_MAX) size = INT_MAX; /* * Setup aggregation packet count. 
*/ if (sc->hn_agg_pkts < 0) pkts = UINT32_MAX; else pkts = sc->hn_agg_pkts; if (sc->hn_rndis_agg_pkts < pkts) pkts = sc->hn_rndis_agg_pkts; if (pkts <= 1) { /* Disable */ size = 0; pkts = 0; goto done; } /* NOTE: Type of the per TX ring setting is 'short'. */ if (pkts > SHRT_MAX) pkts = SHRT_MAX; done: /* NOTE: Type of the per TX ring setting is 'short'. */ if (sc->hn_rndis_agg_align > SHRT_MAX) { /* Disable */ size = 0; pkts = 0; } if (bootverbose) { if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", size, pkts, sc->hn_rndis_agg_align); } for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_agg_szmax = size; txr->hn_agg_pktmax = pkts; txr->hn_agg_align = sc->hn_rndis_agg_align; mtx_unlock(&txr->hn_tx_lock); } } static int hn_get_txswq_depth(const struct hn_tx_ring *txr) { KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); if (hn_tx_swq_depth < txr->hn_txdesc_cnt) return txr->hn_txdesc_cnt; return hn_tx_swq_depth; } static int hn_rss_reconfig(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return (ENXIO); /* * Disable RSS first. * * NOTE: * Direct reconfiguration by setting the UNCHG flags does * _not_ work properly. */ if (bootverbose) if_printf(sc->hn_ifp, "disable RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); if (error) { if_printf(sc->hn_ifp, "RSS disable failed\n"); return (error); } /* * Reenable the RSS w/ the updated RSS key or indirect * table. 
*/ if (bootverbose) if_printf(sc->hn_ifp, "reconfig RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) { if_printf(sc->hn_ifp, "RSS reconfig failed\n"); return (error); } return (0); } static void hn_rss_ind_fixup(struct hn_softc *sc) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int i, nchan; nchan = sc->hn_rx_ring_inuse; KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); /* * Check indirect table to make sure that all channels in it * can be used. */ for (i = 0; i < NDIS_HASH_INDCNT; ++i) { if (rss->rss_ind[i] >= nchan) { if_printf(sc->hn_ifp, "RSS indirect table %d fixup: %u -> %d\n", i, rss->rss_ind[i], nchan - 1); rss->rss_ind[i] = nchan - 1; } } } static int hn_ifmedia_upd(struct ifnet *ifp __unused) { return EOPNOTSUPP; } static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct hn_softc *sc = ifp->if_softc; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { ifmr->ifm_active |= IFM_NONE; return; } ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_10G_T | IFM_FDX; } static void hn_rxvf_set_task(void *xarg, int pending __unused) { struct hn_rxvf_setarg *arg = xarg; arg->rxr->hn_rxvf_ifp = arg->vf_ifp; } static void hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) { struct hn_rx_ring *rxr; struct hn_rxvf_setarg arg; struct task task; int i; HN_LOCK_ASSERT(sc); TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; if (i < sc->hn_rx_ring_inuse) { arg.rxr = rxr; arg.vf_ifp = vf_ifp; vmbus_chan_run_task(rxr->hn_chan, &task); } else { rxr->hn_rxvf_ifp = vf_ifp; } } } static bool hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) { const struct ifnet *hn_ifp; hn_ifp = sc->hn_ifp; if (ifp == hn_ifp) return (false); if (ifp->if_alloctype != IFT_ETHER) return (false); /* Ignore lagg/vlan interfaces */ if (strcmp(ifp->if_dname, "lagg") == 0 || strcmp(ifp->if_dname, "vlan") == 
0) return (false); if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) return (false); return (true); } static void hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) { struct ifnet *hn_ifp; HN_LOCK(sc); if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) goto out; if (!hn_ismyvf(sc, ifp)) goto out; hn_ifp = sc->hn_ifp; if (rxvf) { if (sc->hn_flags & HN_FLAG_RXVF) goto out; sc->hn_flags |= HN_FLAG_RXVF; hn_rxfilter_config(sc); } else { if (!(sc->hn_flags & HN_FLAG_RXVF)) goto out; sc->hn_flags &= ~HN_FLAG_RXVF; if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) hn_rxfilter_config(sc); else hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); } hn_nvs_set_datapath(sc, rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); hn_rxvf_set(sc, rxvf ? ifp : NULL); if (rxvf) { hn_vf_rss_fixup(sc, true); hn_suspend_mgmt(sc); sc->hn_link_flags &= ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); if_link_state_change(hn_ifp, LINK_STATE_DOWN); } else { hn_vf_rss_restore(sc); hn_resume_mgmt(sc); } devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, rxvf ? "VF_UP" : "VF_DOWN", NULL); if (bootverbose) { if_printf(hn_ifp, "datapath is switched %s %s\n", rxvf ? "to" : "from", ifp->if_xname); } out: HN_UNLOCK(sc); } static void hn_ifnet_event(void *arg, struct ifnet *ifp, int event) { if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) return; hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); } static void hn_ifaddr_event(void *arg, struct ifnet *ifp) { hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); } static int hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) { struct ifnet *ifp, *vf_ifp; uint64_t tmp; int error; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Fix up requested capabilities w/ supported capabilities, * since the supported capabilities could have been changed. */ ifr->ifr_reqcap &= ifp->if_capabilities; /* Pass SIOCSIFCAP to VF. 
*/ error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); /* * NOTE: * The error will be propagated to the callers, however, it * is _not_ useful here. */ /* * Merge VF's enabled capabilities. */ ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities; tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc); if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc); if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; tmp = vf_ifp->if_hwassist & CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; return (error); } static int hn_xpnt_vf_iocsetflags(struct hn_softc *sc) { struct ifnet *vf_ifp; struct ifreq ifr; HN_LOCK_ASSERT(sc); vf_ifp = sc->hn_vf_ifp; memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_flags = vf_ifp->if_flags & 0xffff; ifr.ifr_flagshigh = vf_ifp->if_flags >> 16; return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr)); } static void hn_xpnt_vf_saveifflags(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; int allmulti = 0; HN_LOCK_ASSERT(sc); /* XXX vlan(4) style mcast addr maintenance */ if (!TAILQ_EMPTY(&ifp->if_multiaddrs)) allmulti = IFF_ALLMULTI; /* Always set the VF's if_flags */ sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti; } static void hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m) { struct rm_priotracker pt; struct ifnet *hn_ifp = NULL; struct mbuf *mn; /* * XXX racy, if hn(4) ever detached. */ rm_rlock(&hn_vfmap_lock, &pt); if (vf_ifp->if_index < hn_vfmap_size) hn_ifp = hn_vfmap[vf_ifp->if_index]; rm_runlock(&hn_vfmap_lock, &pt); if (hn_ifp != NULL) { for (mn = m; mn != NULL; mn = mn->m_nextpkt) { /* * Allow tapping on the VF. 
*/ ETHER_BPF_MTAP(vf_ifp, mn); /* * Update VF stats. */ if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) { if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, mn->m_pkthdr.len); } /* * XXX IFCOUNTER_IMCAST * This stat updating is kinda invasive, since it * requires two checks on the mbuf: the length check * and the ethernet header check. As of this write, * all multicast packets go directly to hn(4), which * makes imcast stat updating in the VF a try in vian. */ /* * Fix up rcvif and increase hn(4)'s ipackets. */ mn->m_pkthdr.rcvif = hn_ifp; if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); } /* * Go through hn(4)'s if_input. */ hn_ifp->if_input(hn_ifp, m); } else { /* * In the middle of the transition; free this * mbuf chain. */ while (m != NULL) { mn = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = mn; } } } static void hn_mtu_change_fixup(struct hn_softc *sc) { struct ifnet *ifp; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); #endif } static uint32_t hn_rss_type_fromndis(uint32_t rss_hash) { uint32_t types = 0; if (rss_hash & NDIS_HASH_IPV4) types |= RSS_TYPE_IPV4; if (rss_hash & NDIS_HASH_TCP_IPV4) types |= RSS_TYPE_TCP_IPV4; if (rss_hash & NDIS_HASH_IPV6) types |= RSS_TYPE_IPV6; if (rss_hash & NDIS_HASH_IPV6_EX) types |= RSS_TYPE_IPV6_EX; if (rss_hash & NDIS_HASH_TCP_IPV6) types |= RSS_TYPE_TCP_IPV6; if (rss_hash & NDIS_HASH_TCP_IPV6_EX) types |= RSS_TYPE_TCP_IPV6_EX; return (types); } static uint32_t hn_rss_type_tondis(uint32_t types) { uint32_t rss_hash = 0; KASSERT((types & (RSS_TYPE_UDP_IPV4 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, ("UDP4, UDP6 and UDP6EX are not supported")); if (types & RSS_TYPE_IPV4) rss_hash |= NDIS_HASH_IPV4; if (types & RSS_TYPE_TCP_IPV4) rss_hash |= NDIS_HASH_TCP_IPV4; if (types & RSS_TYPE_IPV6) rss_hash |= NDIS_HASH_IPV6; if (types & 
RSS_TYPE_IPV6_EX) rss_hash |= NDIS_HASH_IPV6_EX; if (types & RSS_TYPE_TCP_IPV6) rss_hash |= NDIS_HASH_TCP_IPV6; if (types & RSS_TYPE_TCP_IPV6_EX) rss_hash |= NDIS_HASH_TCP_IPV6_EX; return (rss_hash); } static void hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) { int i; HN_LOCK_ASSERT(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; } static void hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) { struct ifnet *ifp, *vf_ifp; struct ifrsshash ifrh; struct ifrsskey ifrk; int error; uint32_t my_types, diff_types, mbuf_types = 0; HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); if (sc->hn_rx_ring_inuse == 1) { /* No RSS on synthetic parts; done. */ return; } if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { /* Synthetic parts do not support Toeplitz; done. */ return; } ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Extract VF's RSS key. Only 40 bytes key for Toeplitz is * supported. */ memset(&ifrk, 0, sizeof(ifrk)); strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name)); error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk); if (error) { if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n", vf_ifp->if_xname, error); goto done; } if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { if_printf(ifp, "%s RSS function %u is not Toeplitz\n", vf_ifp->if_xname, ifrk.ifrk_func); goto done; } if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", vf_ifp->if_xname, ifrk.ifrk_keylen); goto done; } /* * Extract VF's RSS hash. Only Toeplitz is supported. 
*/ memset(&ifrh, 0, sizeof(ifrh)); strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name)); error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh); if (error) { if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n", vf_ifp->if_xname, error); goto done; } if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) { if_printf(ifp, "%s RSS function %u is not Toeplitz\n", vf_ifp->if_xname, ifrh.ifrh_func); goto done; } my_types = hn_rss_type_fromndis(sc->hn_rss_hcap); if ((ifrh.ifrh_types & my_types) == 0) { /* This disables RSS; ignore it then */ if_printf(ifp, "%s intersection of RSS types failed. " "VF %#x, mine %#x\n", vf_ifp->if_xname, ifrh.ifrh_types, my_types); goto done; } diff_types = my_types ^ ifrh.ifrh_types; my_types &= ifrh.ifrh_types; mbuf_types = my_types; /* * Detect RSS hash value/type confliction. * * NOTE: * We don't disable the hash type, but stop delivery the hash * value/type through mbufs on RX path. */ if ((my_types & RSS_TYPE_IPV4) && (diff_types & ifrh.ifrh_types & (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) { /* Conflict; disable IPV4 hash type/value delivery. */ if_printf(ifp, "disable IPV4 mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_IPV4; } if ((my_types & RSS_TYPE_IPV6) && (diff_types & ifrh.ifrh_types & (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | RSS_TYPE_IPV6_EX))) { /* Conflict; disable IPV6 hash type/value delivery. */ if_printf(ifp, "disable IPV6 mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_IPV6; } if ((my_types & RSS_TYPE_IPV6_EX) && (diff_types & ifrh.ifrh_types & (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | RSS_TYPE_IPV6))) { /* Conflict; disable IPV6_EX hash type/value delivery. */ if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_IPV6_EX; } if ((my_types & RSS_TYPE_TCP_IPV6) && (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) { /* Conflict; disable TCP_IPV6 hash type/value delivery. 
*/ if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_TCP_IPV6; } if ((my_types & RSS_TYPE_TCP_IPV6_EX) && (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; } if ((my_types & RSS_TYPE_UDP_IPV6) && (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { /* Conflict; disable UDP_IPV6 hash type/value delivery. */ if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_UDP_IPV6; } if ((my_types & RSS_TYPE_UDP_IPV6_EX) && (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; } /* * Indirect table does not matter. */ sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | hn_rss_type_tondis(my_types); memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); sc->hn_flags |= HN_FLAG_HAS_RSSKEY; if (reconf) { error = hn_rss_reconfig(sc); if (error) { /* XXX roll-back? */ if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); /* XXX keep going. */ } } done: /* Hash deliverability for mbufs. */ hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); } static void hn_vf_rss_restore(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); if (sc->hn_rx_ring_inuse == 1) goto done; /* * Restore hash types. Key does _not_ matter. */ if (sc->hn_rss_hash != sc->hn_rss_hcap) { int error; sc->hn_rss_hash = sc->hn_rss_hcap; error = hn_rss_reconfig(sc); if (error) { if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", error); /* XXX keep going. */ } } done: /* Hash deliverability for mbufs. 
*/ hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); } static void hn_xpnt_vf_setready(struct hn_softc *sc) { struct ifnet *ifp, *vf_ifp; struct ifreq ifr; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Mark the VF ready. */ sc->hn_vf_rdytick = 0; /* * Save information for restoration. */ sc->hn_saved_caps = ifp->if_capabilities; sc->hn_saved_tsomax = ifp->if_hw_tsomax; sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; /* * Intersect supported/enabled capabilities. * * NOTE: * if_hwassist is not changed here. */ ifp->if_capabilities &= vf_ifp->if_capabilities; ifp->if_capenable &= ifp->if_capabilities; /* * Fix TSO settings. */ if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; /* * Change VF's enabled capabilities. */ memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_reqcap = ifp->if_capenable; hn_xpnt_vf_iocsetcaps(sc, &ifr); if (ifp->if_mtu != ETHERMTU) { int error; /* * Change VF's MTU. */ memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_mtu = ifp->if_mtu; error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); if (error) { if_printf(ifp, "%s SIOCSIFMTU %u failed\n", vf_ifp->if_xname, ifp->if_mtu); if (ifp->if_mtu > ETHERMTU) { if_printf(ifp, "change MTU to %d\n", ETHERMTU); /* * XXX * No need to adjust the synthetic parts' MTU; * failure of the adjustment will cause us * infinite headache. 
*/ ifp->if_mtu = ETHERMTU; hn_mtu_change_fixup(sc); } } } } static bool hn_xpnt_vf_isready(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) return (false); if (sc->hn_vf_rdytick == 0) return (true); if (sc->hn_vf_rdytick > ticks) return (false); /* Mark VF as ready. */ hn_xpnt_vf_setready(sc); return (true); } static void hn_xpnt_vf_setenable(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; rm_wunlock(&sc->hn_vf_lock); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; } static void hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) { int i; HN_LOCK_ASSERT(sc); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; if (clear_vf) sc->hn_vf_ifp = NULL; rm_wunlock(&sc->hn_vf_lock); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; } static void hn_xpnt_vf_init(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); if (bootverbose) { if_printf(sc->hn_ifp, "try bringing up %s\n", sc->hn_vf_ifp->if_xname); } /* * Bring the VF up. */ hn_xpnt_vf_saveifflags(sc); sc->hn_vf_ifp->if_flags |= IFF_UP; error = hn_xpnt_vf_iocsetflags(sc); if (error) { if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", sc->hn_vf_ifp->if_xname, error); return; } /* * NOTE: * Datapath setting must happen _after_ bringing the VF up. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); /* * NOTE: * Fixup RSS related bits _after_ the VF is brought up, since * many VFs generate RSS key during it's initialization. */ hn_vf_rss_fixup(sc, true); /* Mark transparent mode VF as enabled. 
*/ hn_xpnt_vf_setenable(sc); } static void hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) goto done; if (sc->hn_vf_ifp == NULL) goto done; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) goto done; if (sc->hn_vf_rdytick != 0) { /* Mark VF as ready. */ hn_xpnt_vf_setready(sc); } if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { /* * Delayed VF initialization. */ if (bootverbose) { if_printf(sc->hn_ifp, "delayed initialize %s\n", sc->hn_vf_ifp->if_xname); } hn_xpnt_vf_init(sc); } done: HN_UNLOCK(sc); } static void hn_ifnet_attevent(void *xsc, struct ifnet *ifp) { struct hn_softc *sc = xsc; HN_LOCK(sc); if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) goto done; if (!hn_ismyvf(sc, ifp)) goto done; if (sc->hn_vf_ifp != NULL) { if_printf(sc->hn_ifp, "%s was attached as VF\n", sc->hn_vf_ifp->if_xname); goto done; } if (hn_xpnt_vf && ifp->if_start != NULL) { /* * ifnet.if_start is _not_ supported by transparent * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 
*/ if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " "in transparent VF mode.\n", ifp->if_xname); goto done; } rm_wlock(&hn_vfmap_lock); if (ifp->if_index >= hn_vfmap_size) { struct ifnet **newmap; int newsize; newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, M_WAITOK | M_ZERO); memcpy(newmap, hn_vfmap, sizeof(struct ifnet *) * hn_vfmap_size); free(hn_vfmap, M_DEVBUF); hn_vfmap = newmap; hn_vfmap_size = newsize; } KASSERT(hn_vfmap[ifp->if_index] == NULL, ("%s: ifindex %d was mapped to %s", ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); hn_vfmap[ifp->if_index] = sc->hn_ifp; rm_wunlock(&hn_vfmap_lock); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); sc->hn_vf_ifp = ifp; rm_wunlock(&sc->hn_vf_lock); if (hn_xpnt_vf) { int wait_ticks; /* * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. * Save vf_ifp's current if_input for later restoration. */ sc->hn_vf_input = ifp->if_input; ifp->if_input = hn_xpnt_vf_input; /* * Stop link status management; use the VF's. */ hn_suspend_mgmt(sc); /* * Give VF sometime to complete its attach routing. */ wait_ticks = hn_xpnt_vf_attwait * hz; sc->hn_vf_rdytick = ticks + wait_ticks; taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, wait_ticks); } done: HN_UNLOCK(sc); } static void hn_ifnet_detevent(void *xsc, struct ifnet *ifp) { struct hn_softc *sc = xsc; HN_LOCK(sc); if (sc->hn_vf_ifp == NULL) goto done; if (!hn_ismyvf(sc, ifp)) goto done; if (hn_xpnt_vf) { /* * Make sure that the delayed initialization is not running. * * NOTE: * - This lock _must_ be released, since the hn_vf_init task * will try holding this lock. * - It is safe to release this lock here, since the * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. * * XXX racy, if hn(4) ever detached. 
*/ HN_UNLOCK(sc); taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); HN_LOCK(sc); KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", sc->hn_ifp->if_xname)); ifp->if_input = sc->hn_vf_input; sc->hn_vf_input = NULL; if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); if (sc->hn_vf_rdytick == 0) { /* * The VF was ready; restore some settings. */ sc->hn_ifp->if_capabilities = sc->hn_saved_caps; /* * NOTE: * There is _no_ need to fixup if_capenable and * if_hwassist, since the if_capabilities before * restoration was an intersection of the VF's * if_capabilites and the synthetic device's * if_capabilites. */ sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; sc->hn_ifp->if_hw_tsomaxsegcount = sc->hn_saved_tsosegcnt; sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; } if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { /* * Restore RSS settings. */ hn_vf_rss_restore(sc); /* * Resume link status management, which was suspended * by hn_ifnet_attevent(). */ hn_resume_mgmt(sc); } } /* Mark transparent mode VF as disabled. 
*/ hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); rm_wlock(&hn_vfmap_lock); KASSERT(ifp->if_index < hn_vfmap_size, ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); if (hn_vfmap[ifp->if_index] != NULL) { KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, ("%s: ifindex %d was mapped to %s", ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); hn_vfmap[ifp->if_index] = NULL; } rm_wunlock(&hn_vfmap_lock); done: HN_UNLOCK(sc); } static void hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) { struct hn_softc *sc = xsc; if (sc->hn_vf_ifp == ifp) if_link_state_change(sc->hn_ifp, link_state); } static int hn_probe(device_t dev) { if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { device_set_desc(dev, "Hyper-V Network Interface"); return BUS_PROBE_DEFAULT; } return ENXIO; } static int hn_attach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; uint8_t eaddr[ETHER_ADDR_LEN]; struct ifnet *ifp = NULL; int error, ring_cnt, tx_ring_cnt; uint32_t mtu; sc->hn_dev = dev; sc->hn_prichan = vmbus_get_channel(dev); HN_LOCK_INIT(sc); rm_init(&sc->hn_vf_lock, "hnvf"); if (hn_xpnt_vf && hn_xpnt_vf_accbpf) sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; /* * Initialize these tunables once. */ sc->hn_agg_size = hn_tx_agg_size; sc->hn_agg_pkts = hn_tx_agg_pkts; /* * Setup taskqueue for transmission. */ if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { int i; sc->hn_tx_taskqs = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), M_DEVBUF, M_WAITOK); for (i = 0; i < hn_tx_taskq_cnt; ++i) { sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_tx_taskqs[i]); taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, "%s tx%d", device_get_nameunit(dev), i); } } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { sc->hn_tx_taskqs = hn_tx_taskque; } /* * Setup taskqueue for mangement tasks, e.g. link status. 
*/ sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", device_get_nameunit(dev)); TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, hn_netchg_status_taskfunc, sc); if (hn_xpnt_vf) { /* * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. */ sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_vf_taskq); taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", device_get_nameunit(dev)); TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, hn_xpnt_vf_init_taskfunc, sc); } /* * Allocate ifnet and setup its name earlier, so that if_printf * can be used by functions, which will be called after * ether_ifattach(). */ ifp = sc->hn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); /* * Initialize ifmedia earlier so that it can be unconditionally * destroyed, if error happened later on. */ ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); /* * Figure out the # of RX rings (ring_cnt) and the # of TX rings * to use (tx_ring_cnt). * * NOTE: * The # of RX rings to use is same as the # of channels to use. */ ring_cnt = hn_chan_cnt; if (ring_cnt <= 0) { /* Default */ ring_cnt = mp_ncpus; if (ring_cnt > HN_RING_CNT_DEF_MAX) ring_cnt = HN_RING_CNT_DEF_MAX; } else if (ring_cnt > mp_ncpus) { ring_cnt = mp_ncpus; } #ifdef RSS if (ring_cnt > rss_getnumbuckets()) ring_cnt = rss_getnumbuckets(); #endif tx_ring_cnt = hn_tx_ring_cnt; if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) tx_ring_cnt = ring_cnt; #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { /* ifnet.if_start only needs one TX ring. */ tx_ring_cnt = 1; } #endif /* * Set the leader CPU for channels. 
*/ sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; /* * Create enough TX/RX rings, even if only limited number of * channels can be allocated. */ error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; error = hn_create_rx_data(sc, ring_cnt); if (error) goto failed; /* * Create transaction context for NVS and RNDIS transactions. */ sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); if (sc->hn_xact == NULL) { error = ENXIO; goto failed; } /* * Install orphan handler for the revocation of this device's * primary channel. * * NOTE: * The processing order is critical here: * Install the orphan handler, _before_ testing whether this * device's primary channel has been revoked or not. */ vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); if (vmbus_chan_is_revoked(sc->hn_prichan)) { error = ENXIO; goto failed; } /* * Attach the synthetic parts, i.e. NVS and RNDIS. */ error = hn_synth_attach(sc, ETHERMTU); if (error) goto failed; error = hn_rndis_get_eaddr(sc, eaddr); if (error) goto failed; error = hn_rndis_get_mtu(sc, &mtu); if (error) mtu = ETHERMTU; else if (bootverbose) device_printf(dev, "RNDIS mtu %u\n", mtu); #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring_inuse > 1) { /* * Reduce TCP segment aggregation limit for multiple * RX rings to increase ACK timeliness. */ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); } #endif /* - * Fixup TX stuffs after synthetic parts are attached. + * Fixup TX/RX stuffs after synthetic parts are attached. 
*/ hn_fixup_tx_data(sc); + hn_fixup_rx_data(sc); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, &sc->hn_nvs_ver, 0, "NVS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_ndis_version_sysctl, "A", "NDIS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_caps_sysctl, "A", "capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_hwassist_sysctl, "A", "hwassist"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, "max # of TSO segments"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, "max size of TSO segment"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxfilter_sysctl, "A", "rxfilter"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_hash_sysctl, "A", "RSS hash"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); #ifndef RSS /* * Don't allow RSS key/indirect table changes, if RSS is defined. 
*/ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_key_sysctl, "IU", "RSS key"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_ind_sysctl, "IU", "RSS indirect table"); #endif SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, "RNDIS offered packet transmission aggregation size limit"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, "RNDIS offered packet transmission aggregation count limit"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, "RNDIS packet transmission aggregation alignment"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_txagg_size_sysctl, "I", "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_txagg_pkts_sysctl, "I", "Packet transmission aggregation packets, " "0 -- disable, -1 -- auto"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_polling_sysctl, "I", "Polling frequency: [100,1000000], 0 disable polling"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_vf_sysctl, "A", "Virtual Function's name"); if (!hn_xpnt_vf) { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxvf_sysctl, "A", "activated Virtual Function's name"); } else { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_xpnt_vf_enabled_sysctl, "I", "Transparent VF enabled"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_xpnt_vf_accbpf_sysctl, "I", "Accurate BPF for transparent 
VF"); } /* * Setup the ifmedia, which has been initialized earlier. */ ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); /* XXX ifmedia_set really should do this for us */ sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; /* * Setup the ifnet for this interface. */ ifp->if_baudrate = IF_Gbps(10); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = hn_ioctl; ifp->if_init = hn_init; #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); ifp->if_start = hn_start; IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); ifp->if_snd.ifq_drv_maxlen = qdepth - 1; IFQ_SET_READY(&ifp->if_snd); } else #endif { ifp->if_transmit = hn_transmit; ifp->if_qflush = hn_xmit_qflush; } ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; #endif if (sc->hn_caps & HN_CAP_VLAN) { /* XXX not sure about VLAN_MTU. */ ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; } ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; if (ifp->if_hwassist & HN_CSUM_IP_MASK) ifp->if_capabilities |= IFCAP_TXCSUM; if (ifp->if_hwassist & HN_CSUM_IP6_MASK) ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; if (sc->hn_caps & HN_CAP_TSO4) { ifp->if_capabilities |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_IP_TSO; } if (sc->hn_caps & HN_CAP_TSO6) { ifp->if_capabilities |= IFCAP_TSO6; ifp->if_hwassist |= CSUM_IP6_TSO; } /* Enable all available capabilities by default. */ ifp->if_capenable = ifp->if_capabilities; /* * Disable IPv6 TSO and TXCSUM by default, they still can * be enabled through SIOCSIFCAP. */ ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { /* * Lock hn_set_tso_maxsize() to simplify its * internal logic. 
*/ HN_LOCK(sc); hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); HN_UNLOCK(sc); ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; ifp->if_hw_tsomaxsegsize = PAGE_SIZE; } ether_ifattach(ifp, eaddr); if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { if_printf(ifp, "TSO segcnt %u segsz %u\n", ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } if (mtu < ETHERMTU) { if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); ifp->if_mtu = mtu; } /* Inform the upper layer about the long frame support. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); /* * Kick off link status check. */ sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; hn_update_link_status(sc); if (!hn_xpnt_vf) { sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); } else { sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); } /* * NOTE: * Subscribe ether_ifattach event, instead of ifnet_arrival event, * since interface's LLADDR is needed; interface LLADDR is not * available when ifnet_arrival event is triggered. */ sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); return (0); failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) hn_synth_detach(sc); hn_detach(dev); return (error); } static int hn_detach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct ifnet *ifp = sc->hn_ifp, *vf_ifp; if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { /* * In case that the vmbus missed the orphan handler * installation. 
*/ vmbus_xact_ctx_orphan(sc->hn_xact); } if (sc->hn_ifaddr_evthand != NULL) EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); if (sc->hn_ifnet_evthand != NULL) EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); if (sc->hn_ifnet_atthand != NULL) { EVENTHANDLER_DEREGISTER(ether_ifattach_event, sc->hn_ifnet_atthand); } if (sc->hn_ifnet_dethand != NULL) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, sc->hn_ifnet_dethand); } if (sc->hn_ifnet_lnkhand != NULL) EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); vf_ifp = sc->hn_vf_ifp; __compiler_membar(); if (vf_ifp != NULL) hn_ifnet_detevent(sc, vf_ifp); if (device_is_attached(dev)) { HN_LOCK(sc); if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_stop(sc, true); /* * NOTE: * hn_stop() only suspends data, so managment * stuffs have to be suspended manually here. */ hn_suspend_mgmt(sc); hn_synth_detach(sc); } HN_UNLOCK(sc); ether_ifdetach(ifp); } ifmedia_removeall(&sc->hn_media); hn_destroy_rx_data(sc); hn_destroy_tx_data(sc); if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { int i; for (i = 0; i < hn_tx_taskq_cnt; ++i) taskqueue_free(sc->hn_tx_taskqs[i]); free(sc->hn_tx_taskqs, M_DEVBUF); } taskqueue_free(sc->hn_mgmt_taskq0); if (sc->hn_vf_taskq != NULL) taskqueue_free(sc->hn_vf_taskq); if (sc->hn_xact != NULL) { /* * Uninstall the orphan handler _before_ the xact is * destructed. */ vmbus_chan_unset_orphan(sc->hn_prichan); vmbus_xact_ctx_destroy(sc->hn_xact); } if_free(ifp); HN_LOCK_DESTROY(sc); rm_destroy(&sc->hn_vf_lock); return (0); } static int hn_shutdown(device_t dev) { return (0); } static void hn_link_status(struct hn_softc *sc) { uint32_t link_status; int error; error = hn_rndis_get_linkstatus(sc, &link_status); if (error) { /* XXX what to do? 
*/ return; } if (link_status == NDIS_MEDIA_STATE_CONNECTED) sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; else sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? LINK_STATE_UP : LINK_STATE_DOWN); } static void hn_link_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) return; hn_link_status(sc); } static void hn_netchg_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Prevent any link status checks from running. */ sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; /* * Fake up a [link down --> link up] state change; 5 seconds * delay is used, which closely simulates miibus reaction * upon link down event. */ sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 5 * hz); } static void hn_netchg_status_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Re-allow link status checks. 
*/ sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; hn_link_status(sc); } static void hn_update_link_status(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); } static void hn_change_network(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); } static __inline int hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) { struct mbuf *m = *m_head; int error; KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m_new; m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); if (m_new == NULL) return ENOBUFS; else *m_head = m = m_new; txr->hn_tx_collapsed++; error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); } if (!error) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_PREWRITE); txd->flags |= HN_TXD_FLAG_DMAMAP; } return error; } static __inline int hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, ("put an onlist txd %#x", txd->flags)); KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("put an onagg txd %#x", txd->flags)); KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); if (atomic_fetchadd_int(&txd->refs, -1) != 1) return 0; if (!STAILQ_EMPTY(&txd->agg_list)) { struct hn_txdesc *tmp_txd; while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { int freed; KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), ("resursive aggregation on aggregated txdesc")); KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), ("not aggregated txdesc")); KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("aggregated txdesc uses dmamap")); KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("aggregated txdesc consumes " 
"chimney sending buffer")); KASSERT(tmp_txd->chim_size == 0, ("aggregated txdesc has non-zero " "chimney sending size")); STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; freed = hn_txdesc_put(txr, tmp_txd); KASSERT(freed, ("failed to free aggregated txdesc")); } } if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("chim txd uses dmamap")); hn_chim_free(txr->hn_sc, txd->chim_index); txd->chim_index = HN_NVS_CHIM_IDX_INVALID; txd->chim_size = 0; } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->hn_tx_data_dtag, txd->data_dmap); txd->flags &= ~HN_TXD_FLAG_DMAMAP; } if (txd->m != NULL) { m_freem(txd->m); txd->m = NULL; } txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); KASSERT(txr->hn_txdesc_avail >= 0 && txr->hn_txdesc_avail < txr->hn_txdesc_cnt, ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail++; SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); mtx_unlock_spin(&txr->hn_txlist_spin); #else /* HN_USE_TXDESC_BUFRING */ #ifdef HN_DEBUG atomic_add_int(&txr->hn_txdesc_avail, 1); #endif buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif /* !HN_USE_TXDESC_BUFRING */ return 1; } static __inline struct hn_txdesc * hn_txdesc_get(struct hn_tx_ring *txr) { struct hn_txdesc *txd; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); txd = SLIST_FIRST(&txr->hn_txlist); if (txd != NULL) { KASSERT(txr->hn_txdesc_avail > 0, ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail--; SLIST_REMOVE_HEAD(&txr->hn_txlist, link); } mtx_unlock_spin(&txr->hn_txlist_spin); #else txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); #endif if (txd != NULL) { #ifdef HN_USE_TXDESC_BUFRING #ifdef HN_DEBUG atomic_subtract_int(&txr->hn_txdesc_avail, 1); #endif #endif /* HN_USE_TXDESC_BUFRING */ KASSERT(txd->m == 
NULL && txd->refs == 0 && STAILQ_EMPTY(&txd->agg_list) && txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0 && (txd->flags & HN_TXD_FLAG_ONLIST) && (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; txd->refs = 1; } return txd; } static __inline void hn_txdesc_hold(struct hn_txdesc *txd) { /* 0->1 transition will never work */ KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); atomic_add_int(&txd->refs, 1); } static __inline void hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) { KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("recursive aggregation on aggregating txdesc")); KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("already aggregated")); KASSERT(STAILQ_EMPTY(&txd->agg_list), ("recursive aggregation on to-be-aggregated txdesc")); txd->flags |= HN_TXD_FLAG_ONAGG; STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); } static bool hn_tx_ring_pending(struct hn_tx_ring *txr) { bool pending = false; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) pending = true; mtx_unlock_spin(&txr->hn_txlist_spin); #else if (!buf_ring_full(txr->hn_txdesc_br)) pending = true; #endif return (pending); } static __inline void hn_txeof(struct hn_tx_ring *txr) { txr->hn_has_txeof = 0; txr->hn_txeof(txr); } static void hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, struct vmbus_channel *chan, const void *data __unused, int dlen __unused) { struct hn_txdesc *txd = sndc->hn_cbarg; struct hn_tx_ring *txr; txr = txd->txr; KASSERT(txr->hn_chan == chan, ("channel mismatch, on chan%u, should be chan%u", vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); ++txr->hn_txdone_cnt; if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { txr->hn_txdone_cnt = 0; if (txr->hn_oactive) hn_txeof(txr); } } static void hn_chan_rollup(struct hn_rx_ring 
*rxr, struct hn_tx_ring *txr) { #if defined(INET) || defined(INET6) tcp_lro_flush_all(&rxr->hn_lro); #endif /* * NOTE: * 'txr' could be NULL, if multiple channels and * ifnet.if_start method are enabled. */ if (txr == NULL || !txr->hn_has_txeof) return; txr->hn_txdone_cnt = 0; hn_txeof(txr); } static __inline uint32_t hn_rndis_pktmsg_offset(uint32_t ofs) { KASSERT(ofs >= sizeof(struct rndis_packet_msg), ("invalid RNDIS packet msg offset %u", ofs)); return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); } static __inline void * hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, size_t pi_dlen, uint32_t pi_type) { const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); struct rndis_pktinfo *pi; KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); /* * Per-packet-info does not move; it only grows. * * NOTE: * rm_pktinfooffset in this phase counts from the beginning * of rndis_packet_msg. */ KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, ("%u pktinfo overflows RNDIS packet msg", pi_type)); pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + pkt->rm_pktinfolen); pkt->rm_pktinfolen += pi_size; pi->rm_size = pi_size; pi->rm_type = pi_type; pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; return (pi->rm_data); } static __inline int hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) { struct hn_txdesc *txd; struct mbuf *m; int error, pkts; txd = txr->hn_agg_txd; KASSERT(txd != NULL, ("no aggregate txdesc")); /* * Since hn_txpkt() will reset this temporary stat, save * it now, so that oerrors can be updated properly, if * hn_txpkt() ever fails. */ pkts = txr->hn_stat_pkts; /* * Since txd's mbuf will _not_ be freed upon hn_txpkt() * failure, save it for later freeing, if hn_txpkt() ever * fails. */ m = txd->m; error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m is not. 
*/ m_freem(m); txr->hn_flush_failed++; if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); } /* Reset all aggregation states. */ txr->hn_agg_txd = NULL; txr->hn_agg_szleft = 0; txr->hn_agg_pktleft = 0; txr->hn_agg_prevpkt = NULL; return (error); } static void * hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, int pktsize) { void *chim; if (txr->hn_agg_txd != NULL) { if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { struct hn_txdesc *agg_txd = txr->hn_agg_txd; struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; int olen; /* * Update the previous RNDIS packet's total length, * it can be increased due to the mandatory alignment * padding for this RNDIS packet. And update the * aggregating txdesc's chimney sending buffer size * accordingly. * * XXX * Zero-out the padding, as required by the RNDIS spec. */ olen = pkt->rm_len; pkt->rm_len = roundup2(olen, txr->hn_agg_align); agg_txd->chim_size += pkt->rm_len - olen; /* Link this txdesc to the parent. */ hn_txdesc_agg(agg_txd, txd); chim = (uint8_t *)pkt + pkt->rm_len; /* Save the current packet for later fixup. */ txr->hn_agg_prevpkt = chim; txr->hn_agg_pktleft--; txr->hn_agg_szleft -= pktsize; if (txr->hn_agg_szleft <= HN_PKTSIZE_MIN(txr->hn_agg_align)) { /* * Probably can't aggregate more packets, * flush this aggregating txdesc proactively. */ txr->hn_agg_pktleft = 0; } /* Done! 
*/ return (chim); } hn_flush_txagg(ifp, txr); } KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); txr->hn_tx_chimney_tried++; txd->chim_index = hn_chim_alloc(txr->hn_sc); if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) return (NULL); txr->hn_tx_chimney++; chim = txr->hn_sc->hn_chim + (txd->chim_index * txr->hn_sc->hn_chim_szmax); if (txr->hn_agg_pktmax > 1 && txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { txr->hn_agg_txd = txd; txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; txr->hn_agg_prevpkt = chim; } return (chim); } /* * NOTE: * If this function fails, then both txd and m_head0 will be freed. */ static int hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) { bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; int error, nsegs, i; struct mbuf *m_head = *m_head0; struct rndis_packet_msg *pkt; uint32_t *pi_data; void *chim = NULL; int pkt_hlen, pkt_size; pkt = txd->rndis_pkt; pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); if (pkt_size < txr->hn_chim_size) { chim = hn_try_txagg(ifp, txr, txd, pkt_size); if (chim != NULL) pkt = chim; } else { if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); } pkt->rm_type = REMOTE_NDIS_PACKET_MSG; pkt->rm_len = m_head->m_pkthdr.len; pkt->rm_dataoffset = 0; pkt->rm_datalen = m_head->m_pkthdr.len; pkt->rm_oobdataoffset = 0; pkt->rm_oobdatalen = 0; pkt->rm_oobdataelements = 0; pkt->rm_pktinfooffset = sizeof(*pkt); pkt->rm_pktinfolen = 0; pkt->rm_vchandle = 0; pkt->rm_reserved = 0; if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { /* * Set the hash value for this packet, so that the host could * dispatch the TX done event for this packet back to this TX * ring's channel. 
*/ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); *pi_data = txr->hn_tx_idx; } if (m_head->m_flags & M_VLANTAG) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); *pi_data = NDIS_VLAN_INFO_MAKE( EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { #if defined(INET6) || defined(INET) pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { *pi_data = NDIS_LSO2_INFO_MAKEIPV4( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, m_head->m_pkthdr.tso_segsz); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { *pi_data = NDIS_LSO2_INFO_MAKEIPV6( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, m_head->m_pkthdr.tso_segsz); } #endif #endif /* INET6 || INET */ } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); if (m_head->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP)) { *pi_data = NDIS_TXCSUM_INFO_IPV6; } else { *pi_data = NDIS_TXCSUM_INFO_IPV4; if (m_head->m_pkthdr.csum_flags & CSUM_IP) *pi_data |= NDIS_TXCSUM_INFO_IPCS; } if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) { *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); } else if (m_head->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP6_UDP)) { *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); } } pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; /* Fixup RNDIS packet message total length */ pkt->rm_len += pkt_hlen; /* Convert RNDIS packet message offsets */ pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); pkt->rm_pktinfooffset = 
hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); /* * Fast path: Chimney sending. */ if (chim != NULL) { struct hn_txdesc *tgt_txd = txd; if (txr->hn_agg_txd != NULL) { tgt_txd = txr->hn_agg_txd; #ifdef INVARIANTS *m_head0 = NULL; #endif } KASSERT(pkt == chim, ("RNDIS pkt not in chimney sending buffer")); KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, ("chimney sending buffer is not used")); tgt_txd->chim_size += pkt->rm_len; m_copydata(m_head, 0, m_head->m_pkthdr.len, ((uint8_t *)chim) + pkt_hlen); txr->hn_gpa_cnt = 0; txr->hn_sendpkt = hn_txpkt_chim; goto done; } KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("chimney buffer is used")); KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); if (__predict_false(error)) { int freed; /* * This mbuf is not linked w/ the txd yet, so free it now. */ m_freem(m_head); *m_head0 = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon txdma error")); txr->hn_txdma_failed++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return error; } *m_head0 = m_head; /* +1 RNDIS packet message */ txr->hn_gpa_cnt = nsegs + 1; /* send packet with page buffer */ txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; txr->hn_gpa[0].gpa_len = pkt_hlen; /* * Fill the page buffers with mbuf info after the page * buffer for RNDIS packet message. */ for (i = 0; i < nsegs; ++i) { struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; gpa->gpa_page = atop(segs[i].ds_addr); gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; gpa->gpa_len = segs[i].ds_len; } txd->chim_index = HN_NVS_CHIM_IDX_INVALID; txd->chim_size = 0; txr->hn_sendpkt = hn_txpkt_sglist; done: txd->m = m_head; /* Set the completion routine */ hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); /* Update temporary stats for later use. 
*/ txr->hn_stat_pkts++; txr->hn_stat_size += m_head->m_pkthdr.len; if (m_head->m_flags & M_MCAST) txr->hn_stat_mcasts++; return 0; } /* * NOTE: * If this function fails, then txd will be freed, but the mbuf * associated w/ the txd will _not_ be freed. */ static int hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) { int error, send_failed = 0, has_bpf; again: has_bpf = bpf_peers_present(ifp->if_bpf); if (has_bpf) { /* * Make sure that this txd and any aggregated txds are not * freed before ETHER_BPF_MTAP. */ hn_txdesc_hold(txd); } error = txr->hn_sendpkt(txr, txd); if (!error) { if (has_bpf) { const struct hn_txdesc *tmp_txd; ETHER_BPF_MTAP(ifp, txd->m); STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) ETHER_BPF_MTAP(ifp, tmp_txd->m); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { if_inc_counter(ifp, IFCOUNTER_OBYTES, txr->hn_stat_size); if (txr->hn_stat_mcasts != 0) { if_inc_counter(ifp, IFCOUNTER_OMCASTS, txr->hn_stat_mcasts); } } txr->hn_pkts += txr->hn_stat_pkts; txr->hn_sends++; } if (has_bpf) hn_txdesc_put(txr, txd); if (__predict_false(error)) { int freed; /* * This should "really rarely" happen. * * XXX Too many RX to be acked or too many sideband * commands to run? Ask netvsc_channel_rollup() * to kick start later. */ txr->hn_has_txeof = 1; if (!send_failed) { txr->hn_send_failed++; send_failed = 1; /* * Try sending again after set hn_has_txeof; * in case that we missed the last * netvsc_channel_rollup(). */ goto again; } if_printf(ifp, "send failed\n"); /* * Caller will perform further processing on the * associated mbuf, so don't free it in hn_txdesc_put(); * only unload it from the DMA map in hn_txdesc_put(), * if it was loaded. */ txd->m = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon send error")); txr->hn_send_failed++; } /* Reset temporary stats, after this sending is done. 
*/ txr->hn_stat_size = 0; txr->hn_stat_pkts = 0; txr->hn_stat_mcasts = 0; return (error); } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. * There should be an equivalent in the kernel mbuf code, * but there does not appear to be one yet. * * Differs from m_append() in that additional mbufs are * allocated with cluster size MJUMPAGESIZE, and filled * accordingly. * * Return 1 if able to complete the job; otherwise 0. */ static int hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space; remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. 
*/ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); if (n == NULL) break; n->m_len = min(MJUMPAGESIZE, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len; remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } #if defined(INET) || defined(INET6) static __inline int hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) { #if __FreeBSD_version >= 1100095 if (hn_lro_mbufq_depth) { tcp_lro_queue_mbuf(lc, m); return 0; } #endif return tcp_lro_rx(lc, m, 0); } #endif static int hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, const struct hn_rxinfo *info) { struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; struct mbuf *m_new; int size, do_lro = 0, do_csum = 1, is_vf = 0; int hash_type = M_HASHTYPE_NONE; + int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; ifp = hn_ifp; if (rxr->hn_rxvf_ifp != NULL) { /* * Non-transparent mode VF; pretend this packet is from * the VF. */ ifp = rxr->hn_rxvf_ifp; is_vf = 1; } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { /* Transparent mode VF. */ is_vf = 1; } if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { /* * NOTE: * See the NOTE of hn_rndis_init_fixat(). This * function can be reached, immediately after the * RNDIS is initialized but before the ifnet is * setup on the hn_attach() path; drop the unexpected * packets. */ return (0); } if (__predict_false(dlen < ETHER_HDR_LEN)) { if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); return (0); } if (dlen <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); if (m_new == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } memcpy(mtod(m_new, void *), data, dlen); m_new->m_pkthdr.len = m_new->m_len = dlen; rxr->hn_small_pkts++; } else { /* * Get an mbuf with a cluster. For packets 2K or less, * get a standard 2K cluster. For anything larger, get a * 4K cluster. Any buffers larger than 4K can cause problems * if looped around to the Hyper-V TX channel, so avoid them. 
*/ size = MCLBYTES; if (dlen > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } hv_m_append(m_new, dlen, data); } m_new->m_pkthdr.rcvif = ifp; if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) do_csum = 0; /* receive side checksum offload */ if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { /* IP csum offload */ if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); rxr->hn_csum_ip++; } /* TCP/UDP csum offload */ if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) rxr->hn_csum_tcp++; else rxr->hn_csum_udp++; } /* * XXX * As of this write (Oct 28th, 2016), host side will turn * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so * the do_lro setting here is actually _not_ accurate. We * depend on the RSS hash type check to reset do_lro. */ if ((info->csum_info & (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) do_lro = 1; } else { - const struct ether_header *eh; - uint16_t etype; - int hoff; - - hoff = sizeof(*eh); - /* Checked at the beginning of this function. 
*/ - KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); - - eh = mtod(m_new, struct ether_header *); - etype = ntohs(eh->ether_type); - if (etype == ETHERTYPE_VLAN) { - const struct ether_vlan_header *evl; - - hoff = sizeof(*evl); - if (m_new->m_len < hoff) - goto skip; - evl = mtod(m_new, struct ether_vlan_header *); - etype = ntohs(evl->evl_proto); - } - - if (etype == ETHERTYPE_IP) { - int pr; - - pr = hn_check_iplen(m_new, hoff); - if (pr == IPPROTO_TCP) { + hn_rxpkt_proto(m_new, &l3proto, &l4proto); + if (l3proto == ETHERTYPE_IP) { + if (l4proto == IPPROTO_TCP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_TCP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } do_lro = 1; - } else if (pr == IPPROTO_UDP) { + } else if (l4proto == IPPROTO_UDP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_UDP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } - } else if (pr != IPPROTO_DONE && do_csum && + } else if (l4proto != IPPROTO_DONE && do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); } } } -skip: + if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( NDIS_VLAN_INFO_ID(info->vlan_info), NDIS_VLAN_INFO_PRI(info->vlan_info), NDIS_VLAN_INFO_CFI(info->vlan_info)); m_new->m_flags |= M_VLANTAG; } /* * If VF is activated (tranparent/non-transparent mode does not * matter here). * * - Disable LRO * * hn(4) will only receive broadcast packets, multicast packets, * TCP SYN and SYN|ACK (in Azure), LRO is useless for these * packet types. * * For non-transparent, we definitely _cannot_ enable LRO at * all, since the LRO flush will use hn(4) as the receiving * interface; i.e. 
hn_ifp->if_input(hn_ifp, m). */ if (is_vf) do_lro = 0; /* * If VF is activated (tranparent/non-transparent mode does not * matter here), do _not_ mess with unsupported hash types or * functions. */ if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { rxr->hn_rss_pkts++; m_new->m_pkthdr.flowid = info->hash_value; if (!is_vf) hash_type = M_HASHTYPE_OPAQUE_HASH; if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == NDIS_HASH_FUNCTION_TOEPLITZ) { uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK & rxr->hn_mbuf_hash); /* * NOTE: * do_lro is resetted, if the hash types are not TCP * related. See the comment in the above csum_flags * setup section. */ switch (type) { case NDIS_HASH_IPV4: hash_type = M_HASHTYPE_RSS_IPV4; do_lro = 0; break; case NDIS_HASH_TCP_IPV4: hash_type = M_HASHTYPE_RSS_TCP_IPV4; + if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { + int def_htype = M_HASHTYPE_OPAQUE_HASH; + + if (is_vf) + def_htype = M_HASHTYPE_NONE; + + /* + * UDP 4-tuple hash is delivered as + * TCP 4-tuple hash. + */ + if (l3proto == ETHERTYPE_MAX) { + hn_rxpkt_proto(m_new, + &l3proto, &l4proto); + } + if (l3proto == ETHERTYPE_IP) { + if (l4proto == IPPROTO_UDP) { + hash_type = + M_HASHTYPE_RSS_UDP_IPV4; + do_lro = 0; + } else if (l4proto != + IPPROTO_TCP) { + hash_type = def_htype; + do_lro = 0; + } + } else { + hash_type = def_htype; + do_lro = 0; + } + } break; case NDIS_HASH_IPV6: hash_type = M_HASHTYPE_RSS_IPV6; do_lro = 0; break; case NDIS_HASH_IPV6_EX: hash_type = M_HASHTYPE_RSS_IPV6_EX; do_lro = 0; break; case NDIS_HASH_TCP_IPV6: hash_type = M_HASHTYPE_RSS_TCP_IPV6; break; case NDIS_HASH_TCP_IPV6_EX: hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; break; } } } else if (!is_vf) { m_new->m_pkthdr.flowid = rxr->hn_rx_idx; hash_type = M_HASHTYPE_OPAQUE; } M_HASHTYPE_SET(m_new, hash_type); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if (hn_ifp != ifp) { const struct ether_header *eh; /* * Non-transparent mode VF is activated. */ /* * Allow tapping on hn(4). 
*/ ETHER_BPF_MTAP(hn_ifp, m_new); /* * Update hn(4)'s stats. */ if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); /* Checked at the beginning of this function. */ KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); eh = mtod(m_new, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); } rxr->hn_pkts++; if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxr->hn_lro; if (lro->lro_cnt) { rxr->hn_lro_tried++; if (hn_lro_rx(lro, m_new) == 0) { /* DONE! */ return 0; } } #endif } ifp->if_input(ifp, m_new); return (0); } static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct hn_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data, ifr_vf; struct ifnet *vf_ifp; int mask, error = 0; struct ifrsskey *ifrk; struct ifrsshash *ifrh; uint32_t mtu; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu > HN_MTU_MAX) { error = EINVAL; break; } HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if ((sc->hn_caps & HN_CAP_MTU) == 0) { /* Can't change MTU */ HN_UNLOCK(sc); error = EOPNOTSUPP; break; } if (ifp->if_mtu == ifr->ifr_mtu) { HN_UNLOCK(sc); break; } if (hn_xpnt_vf_isready(sc)) { vf_ifp = sc->hn_vf_ifp; ifr_vf = *ifr; strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, sizeof(ifr_vf.ifr_name)); error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr_vf); if (error) { HN_UNLOCK(sc); if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", vf_ifp->if_xname, ifr->ifr_mtu, error); break; } } /* * Suspend this interface before the synthetic parts * are ripped. */ hn_suspend(sc); /* * Detach the synthetics parts, i.e. NVS and RNDIS. */ hn_synth_detach(sc); /* * Reattach the synthetic parts, i.e. NVS and RNDIS, * with the new MTU setting. 
*/ error = hn_synth_attach(sc, ifr->ifr_mtu); if (error) { HN_UNLOCK(sc); break; } error = hn_rndis_get_mtu(sc, &mtu); if (error) mtu = ifr->ifr_mtu; else if (bootverbose) if_printf(ifp, "RNDIS mtu %u\n", mtu); /* * Commit the requested MTU, after the synthetic parts * have been successfully attached. */ if (mtu >= ifr->ifr_mtu) { mtu = ifr->ifr_mtu; } else { if_printf(ifp, "fixup mtu %d -> %u\n", ifr->ifr_mtu, mtu); } ifp->if_mtu = mtu; /* * Synthetic parts' reattach may change the chimney * sending size; update it. */ if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) hn_set_chim_size(sc, sc->hn_chim_szmax); /* * Make sure that various parameters based on MTU are * still valid, after the MTU change. */ hn_mtu_change_fixup(sc); /* * All done! Resume the interface now. */ hn_resume(sc); if ((sc->hn_flags & HN_FLAG_RXVF) || (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { /* * Since we have reattached the NVS part, * change the datapath to VF again; in case * that it is lost, after the NVS was detached. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); } HN_UNLOCK(sc); break; case SIOCSIFFLAGS: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (hn_xpnt_vf_isready(sc)) hn_xpnt_vf_saveifflags(sc); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { /* * Caller meight hold mutex, e.g. * bpf; use busy-wait for the RNDIS * reply. 
*/ HN_NO_SLEEPING(sc); hn_rxfilter_config(sc); HN_SLEEPING_OK(sc); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) error = hn_xpnt_vf_iocsetflags(sc); } else { hn_init_locked(sc); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_stop(sc, false); } sc->hn_if_flags = ifp->if_flags; HN_UNLOCK(sc); break; case SIOCSIFCAP: HN_LOCK(sc); if (hn_xpnt_vf_isready(sc)) { ifr_vf = *ifr; strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, sizeof(ifr_vf.ifr_name)); error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); HN_UNLOCK(sc); break; } /* * Fix up requested capabilities w/ supported capabilities, * since the supported capabilities could have been changed. */ mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); else ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); else ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); } /* TODO: flip RNDIS offload parameters for RXCSUM. */ if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; #endif if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; else ifp->if_hwassist &= ~CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; else ifp->if_hwassist &= ~CSUM_IP6_TSO; } HN_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (ifp->if_drv_flags & IFF_DRV_RUNNING) { /* * Multicast uses mutex; use busy-wait for * the RNDIS reply. */ HN_NO_SLEEPING(sc); hn_rxfilter_config(sc); HN_SLEEPING_OK(sc); } /* XXX vlan(4) style mcast addr maintenance */ if (hn_xpnt_vf_isready(sc)) { int old_if_flags; old_if_flags = sc->hn_vf_ifp->if_flags; hn_xpnt_vf_saveifflags(sc); if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & IFF_ALLMULTI)) error = hn_xpnt_vf_iocsetflags(sc); } HN_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: HN_LOCK(sc); if (hn_xpnt_vf_isready(sc)) { /* * SIOCGIFMEDIA expects ifmediareq, so don't * create and pass ifr_vf to the VF here; just * replace the ifr_name. */ vf_ifp = sc->hn_vf_ifp; strlcpy(ifr->ifr_name, vf_ifp->if_xname, sizeof(ifr->ifr_name)); error = vf_ifp->if_ioctl(vf_ifp, cmd, data); /* Restore the ifr_name. 
*/ strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); HN_UNLOCK(sc); break; } HN_UNLOCK(sc); error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; case SIOCGIFRSSHASH: ifrh = (struct ifrsshash *)data; HN_LOCK(sc); if (sc->hn_rx_ring_inuse == 1) { HN_UNLOCK(sc); ifrh->ifrh_func = RSS_FUNC_NONE; ifrh->ifrh_types = 0; break; } if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; else ifrh->ifrh_func = RSS_FUNC_PRIVATE; ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); HN_UNLOCK(sc); break; case SIOCGIFRSSKEY: ifrk = (struct ifrsskey *)data; HN_LOCK(sc); if (sc->hn_rx_ring_inuse == 1) { HN_UNLOCK(sc); ifrk->ifrk_func = RSS_FUNC_NONE; ifrk->ifrk_keylen = 0; break; } if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; else ifrk->ifrk_func = RSS_FUNC_PRIVATE; ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, NDIS_HASH_KEYSIZE_TOEPLITZ); HN_UNLOCK(sc); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void hn_stop(struct hn_softc *sc, bool detaching) { struct ifnet *ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Clear RUNNING bit ASAP. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); /* Disable polling. */ hn_polling(sc, 0); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { KASSERT(sc->hn_vf_ifp != NULL, ("%s: VF is not attached", ifp->if_xname)); /* Mark transparent mode VF as disabled. */ hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); /* * NOTE: * Datapath setting must happen _before_ bringing * the VF down. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); /* * Bring the VF down. */ hn_xpnt_vf_saveifflags(sc); sc->hn_vf_ifp->if_flags &= ~IFF_UP; hn_xpnt_vf_iocsetflags(sc); } /* Suspend data transfers. */ hn_suspend_data(sc); /* Clear OACTIVE bit. 
*/ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* * If the non-transparent mode VF is active, make sure * that the RX filter still allows packet reception. */ if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) hn_rxfilter_config(sc); } static void hn_init_locked(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; /* Configure RX filter */ hn_rxfilter_config(sc); /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* Clear TX 'suspended' bit. */ hn_resume_tx(sc, sc->hn_tx_ring_inuse); if (hn_xpnt_vf_isready(sc)) { /* Initialize transparent VF. */ hn_xpnt_vf_init(sc); } /* Everything is ready; unleash! */ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); /* Re-enable polling if requested. */ if (sc->hn_pollhz > 0) hn_polling(sc, sc->hn_pollhz); } static void hn_init(void *xsc) { struct hn_softc *sc = xsc; HN_LOCK(sc); hn_init_locked(sc); HN_UNLOCK(sc); } #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int lenlim; int error; lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; error = sysctl_handle_int(oidp, &lenlim, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || lenlim > TCP_LRO_LENGTH_MAX) { HN_UNLOCK(sc); return EINVAL; } hn_set_lro_lenlim(sc, lenlim); HN_UNLOCK(sc); return 0; } static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ackcnt, error, i; /* * lro_ackcnt_lim is append count limit, * +1 to turn it into aggregation limit. 
*/ ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; error = sysctl_handle_int(oidp, &ackcnt, 0, req); if (error || req->newptr == NULL) return error; if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) return EINVAL; /* * Convert aggregation limit back to append * count limit. */ --ackcnt; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; HN_UNLOCK(sc); return 0; } #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int hcsum = arg2; int on, error, i; on = 0; if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) on = 1; error = sysctl_handle_int(oidp, &on, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (on) rxr->hn_trust_hcsum |= hcsum; else rxr->hn_trust_hcsum &= ~hcsum; } HN_UNLOCK(sc); return 0; } static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int chim_size, error; chim_size = sc->hn_tx_ring[0].hn_chim_size; error = sysctl_handle_int(oidp, &chim_size, 0, req); if (error || req->newptr == NULL) return error; if (chim_size > sc->hn_chim_szmax || chim_size <= 0) return EINVAL; HN_LOCK(sc); hn_set_chim_size(sc, chim_size); HN_UNLOCK(sc); return 0; } #if __FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((int *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. 
*/ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((int *)((uint8_t *)rxr + ofs)) = 0; } return 0; } #else static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((uint64_t *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; } return 0; } #endif static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; u_long stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((u_long *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((u_long *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_tx_ring *txr; u_long stat; stat = 0; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; stat += *((u_long *)((uint8_t *)txr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. 
*/ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((u_long *)((uint8_t *)txr + ofs)) = 0; } return 0; } static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error, conf; struct hn_tx_ring *txr; txr = &sc->hn_tx_ring[0]; conf = *((int *)((uint8_t *)txr + ofs)); error = sysctl_handle_int(oidp, &conf, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } HN_UNLOCK(sc); return 0; } static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, size; size = sc->hn_agg_size; error = sysctl_handle_int(oidp, &size, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); sc->hn_agg_size = size; hn_set_txagg(sc); HN_UNLOCK(sc); return (0); } static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, pkts; pkts = sc->hn_agg_pkts; error = sysctl_handle_int(oidp, &pkts, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); sc->hn_agg_pkts = pkts; hn_set_txagg(sc); HN_UNLOCK(sc); return (0); } static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int pkts; pkts = sc->hn_tx_ring[0].hn_agg_pktmax; return (sysctl_handle_int(oidp, &pkts, 0, req)); } static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int align; align = sc->hn_tx_ring[0].hn_agg_align; return (sysctl_handle_int(oidp, &align, 0, req)); } static void hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) { if (pollhz == 0) vmbus_chan_poll_disable(chan); else vmbus_chan_poll_enable(chan, pollhz); } static void hn_polling(struct hn_softc *sc, u_int pollhz) { int nsubch = sc->hn_rx_ring_inuse - 1; HN_LOCK_ASSERT(sc); if (nsubch > 0) { struct vmbus_channel **subch; int i; subch = vmbus_subchan_get(sc->hn_prichan, nsubch); for (i = 0; i < nsubch; ++i) 
hn_chan_polling(subch[i], pollhz); vmbus_subchan_rel(subch, nsubch); } hn_chan_polling(sc->hn_prichan, pollhz); } static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int pollhz, error; pollhz = sc->hn_pollhz; error = sysctl_handle_int(oidp, &pollhz, 0, req); if (error || req->newptr == NULL) return (error); if (pollhz != 0 && (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) return (EINVAL); HN_LOCK(sc); if (sc->hn_pollhz != pollhz) { sc->hn_pollhz = pollhz; if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) hn_polling(sc, sc->hn_pollhz); } HN_UNLOCK(sc); return (0); } static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char caps_str[128]; uint32_t caps; HN_LOCK(sc); caps = sc->hn_caps; HN_UNLOCK(sc); snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); } static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char assist_str[128]; uint32_t hwassist; HN_LOCK(sc); hwassist = sc->hn_ifp->if_hwassist; HN_UNLOCK(sc); snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); } static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char filter_str[128]; uint32_t filter; HN_LOCK(sc); filter = sc->hn_rx_filter; HN_UNLOCK(sc); snprintf(filter_str, sizeof(filter_str), "%b", filter, NDIS_PACKET_TYPES); return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); } #ifndef RSS static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int 
error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error || req->newptr == NULL) goto back; if ((sc->hn_flags & HN_FLAG_RXVF) || (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { /* * RSS key is synchronized w/ VF's, don't allow users * to change it. */ error = EBUSY; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSKEY; if (sc->hn_rx_ring_inuse > 1) { error = hn_rss_reconfig(sc); } else { /* Not RSS capable, at least for now; just save the RSS key. */ error = 0; } back: HN_UNLOCK(sc); return (error); } static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error || req->newptr == NULL) goto back; /* * Don't allow RSS indirect table change, if this interface is not * RSS capable currently. */ if (sc->hn_rx_ring_inuse == 1) { error = EOPNOTSUPP; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSIND; hn_rss_ind_fixup(sc); error = hn_rss_reconfig(sc); back: HN_UNLOCK(sc); return (error); } #endif /* !RSS */ static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rss_hash; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rss_hcap; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash 
= sc->hn_rx_ring[0].hn_mbuf_hash; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char vf_name[IFNAMSIZ + 1]; struct ifnet *vf_ifp; HN_LOCK(sc); vf_name[0] = '\0'; vf_ifp = sc->hn_vf_ifp; if (vf_ifp != NULL) snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); HN_UNLOCK(sc); return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); } static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char vf_name[IFNAMSIZ + 1]; struct ifnet *vf_ifp; HN_LOCK(sc); vf_name[0] = '\0'; vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; if (vf_ifp != NULL) snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); HN_UNLOCK(sc); return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); } static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) { struct rm_priotracker pt; struct sbuf *sb; int error, i; bool first; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); rm_rlock(&hn_vfmap_lock, &pt); first = true; for (i = 0; i < hn_vfmap_size; ++i) { struct ifnet *ifp; if (hn_vfmap[i] == NULL) continue; ifp = ifnet_byindex(i); if (ifp != NULL) { if (first) sbuf_printf(sb, "%s", ifp->if_xname); else sbuf_printf(sb, " %s", ifp->if_xname); first = false; } } rm_runlock(&hn_vfmap_lock, &pt); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) { struct rm_priotracker pt; struct sbuf *sb; int error, i; bool first; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); rm_rlock(&hn_vfmap_lock, &pt); first = true; for (i = 0; i < hn_vfmap_size; ++i) { struct ifnet *ifp, *hn_ifp; hn_ifp = hn_vfmap[i]; if (hn_ifp == NULL) continue; ifp = 
ifnet_byindex(i); if (ifp != NULL) { if (first) { sbuf_printf(sb, "%s:%s", ifp->if_xname, hn_ifp->if_xname); } else { sbuf_printf(sb, " %s:%s", ifp->if_xname, hn_ifp->if_xname); } first = false; } } rm_runlock(&hn_vfmap_lock, &pt); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, onoff = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) onoff = 1; error = sysctl_handle_int(oidp, &onoff, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); /* NOTE: hn_vf_lock for hn_transmit() */ rm_wlock(&sc->hn_vf_lock); if (onoff) sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; else sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; rm_wunlock(&sc->hn_vf_lock); HN_UNLOCK(sc); return (0); } static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int enabled = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) enabled = 1; return (sysctl_handle_int(oidp, &enabled, 0, req)); } static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; int len, iphlen, iplen; const struct tcphdr *th; int thoff; /* TCP data offset */ len = hoff + sizeof(struct ip); /* The packet must be at least the size of an IP header. */ if (m->m_pkthdr.len < len) return IPPROTO_DONE; /* The fixed IP header must reside completely in the first mbuf. */ if (m->m_len < len) return IPPROTO_DONE; ip = mtodo(m, hoff); /* Bound check the packet's stated IP header length. */ iphlen = ip->ip_hl << 2; if (iphlen < sizeof(struct ip)) /* minimum header length */ return IPPROTO_DONE; /* The full IP header must reside completely in the one mbuf. */ if (m->m_len < hoff + iphlen) return IPPROTO_DONE; iplen = ntohs(ip->ip_len); /* * Check that the amount of data in the buffers is as * at least much as the IP header would have us expect. */ if (m->m_pkthdr.len < hoff + iplen) return IPPROTO_DONE; /* * Ignore IP fragments. 
*/ if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) return IPPROTO_DONE; /* * The TCP/IP or UDP/IP header must be entirely contained within * the first fragment of a packet. */ switch (ip->ip_p) { case IPPROTO_TCP: if (iplen < iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); thoff = th->th_off << 2; if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + thoff) return IPPROTO_DONE; break; case IPPROTO_UDP: if (iplen < iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; break; default: if (iplen < iphlen) return IPPROTO_DONE; break; } return ip->ip_p; } +static void +hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) +{ + const struct ether_header *eh; + uint16_t etype; + int hoff; + + hoff = sizeof(*eh); + /* Checked at the beginning of this function. */ + KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); + + eh = mtod(m_new, const struct ether_header *); + etype = ntohs(eh->ether_type); + if (etype == ETHERTYPE_VLAN) { + const struct ether_vlan_header *evl; + + hoff = sizeof(*evl); + if (m_new->m_len < hoff) + return; + evl = mtod(m_new, const struct ether_vlan_header *); + etype = ntohs(evl->evl_proto); + } + *l3proto = etype; + + if (etype == ETHERTYPE_IP) + *l4proto = hn_check_iplen(m_new, hoff); + else + *l4proto = IPPROTO_DONE; +} + static int hn_create_rx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev = sc->hn_dev; #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 int lroent_cnt; #endif #endif int i; /* * Create RXBUF for reception. * * NOTE: * - It is shared by all channels. * - A large enough buffer is allocated, certain version of NVSes * may further limit the usable space. 
*/ sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->hn_rxbuf == NULL) { device_printf(sc->hn_dev, "allocate rxbuf failed\n"); return (ENOMEM); } sc->hn_rx_ring_cnt = ring_cnt; sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, M_DEVBUF, M_WAITOK | M_ZERO); #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 lroent_cnt = hn_lro_entry_count; if (lroent_cnt < TCP_LRO_ENTRIES) lroent_cnt = TCP_LRO_ENTRIES; if (bootverbose) device_printf(dev, "LRO: entry count %d\n", lroent_cnt); #endif #endif /* INET || INET6 */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); /* Create dev.hn.UNIT.rx sysctl tree */ sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, &rxr->hn_br_dma, BUS_DMA_WAITOK); if (rxr->hn_br == NULL) { device_printf(dev, "allocate bufring failed\n"); return (ENOMEM); } if (hn_trust_hosttcp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; if (hn_trust_hostudp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; if (hn_trust_hostip) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; rxr->hn_mbuf_hash = NDIS_HASH_ALL; rxr->hn_ifp = sc->hn_ifp; if (i < sc->hn_tx_ring_cnt) rxr->hn_txr = &sc->hn_tx_ring[i]; rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); rxr->hn_rx_idx = i; rxr->hn_rxbuf = sc->hn_rxbuf; /* * Initialize LRO. 
*/ #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, hn_lro_mbufq_depth); #else tcp_lro_init(&rxr->hn_lro); rxr->hn_lro.ifp = sc->hn_ifp; #endif #if __FreeBSD_version >= 1100099 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; #endif #endif /* INET || INET6 */ if (sc->hn_rx_sysctl_tree != NULL) { char name[16]; /* * Create per RX ring sysctl tree: * dev.hn.UNIT.rx.RINGID */ snprintf(name, sizeof(name), "%d", i); rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (rxr->hn_rx_sysctl_tree != NULL) { SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "packets", CTLFLAG_RW, &rxr->hn_pkts, "# of packets received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rss_pkts", CTLFLAG_RW, &rxr->hn_rss_pkts, "# of packets w/ RSS info received"); SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "pktbuf_len", CTLFLAG_RD, &rxr->hn_pktbuf_len, 0, "Temporary channel packet buffer length"); } } } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_queued), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, #else hn_rx_stat_u64_sysctl, #endif "LU", "LRO queued"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, #else hn_rx_stat_u64_sysctl, #endif "LU", "LRO flushed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro_tried), hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); #if __FreeBSD_version >= 1100099 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", CTLTYPE_UINT | 
CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_lenlim_sysctl, "IU", "Max # of data bytes to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_ackcnt_sysctl, "I", "Max # of ACKs to be aggregated by LRO"); #endif SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I", "Trust tcp segement verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, hn_trust_hcsum_sysctl, "I", "Trust udp datagram verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, hn_trust_hcsum_sysctl, "I", "Trust ip packet verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_ip), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_tcp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_udp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_trusted), hn_rx_stat_ulong_sysctl, "LU", "# of packets that we trust host's csum verification"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_small_pkts), hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_rx_ring, hn_ack_failed),
	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");

	return (0);
}

/*
 * Release the RX side data of the softc.
 *
 * The shared RXBUF is freed unless HN_FLAG_RXBUF_REF is set, in which
 * case it is only reported and the pointer is dropped.  Then, for every
 * RX ring that has a bufring, the bufring is freed (unless
 * HN_RX_FLAG_BR_REF is set), followed by the ring's LRO state (INET or
 * INET6 kernels only) and its packet buffer.  Finally the ring array is
 * freed and both ring counters are reset to 0.
 *
 * NOTE: rings whose hn_br is NULL are skipped entirely, i.e. their LRO
 * state and hn_pktbuf are not released by this function.
 */
static void
hn_destroy_rx_data(struct hn_softc *sc)
{
	int i;

	if (sc->hn_rxbuf != NULL) {
		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
		else
			device_printf(sc->hn_dev, "RXBUF is referenced\n");
		sc->hn_rxbuf = NULL;
	}

	/* No RX rings were created; nothing more to release. */
	if (sc->hn_rx_ring_cnt == 0)
		return;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

		if (rxr->hn_br == NULL)
			continue;
		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
		} else {
			/* NOTE(review): this message has no trailing newline. */
			device_printf(sc->hn_dev,
			    "%dth channel bufring is referenced", i);
		}
		rxr->hn_br = NULL;

#if defined(INET) || defined(INET6)
		tcp_lro_free(&rxr->hn_lro);
#endif
		free(rxr->hn_pktbuf, M_DEVBUF);
	}
	free(sc->hn_rx_ring, M_DEVBUF);
	sc->hn_rx_ring = NULL;

	sc->hn_rx_ring_cnt = 0;
	sc->hn_rx_ring_inuse = 0;
}

/*
 * Set up the TX ring with index 'id' in sc->hn_tx_ring[].
 *
 * This initializes the ring's locks, allocates the TX descriptor array
 * and its free list (or buf_ring), selects the taskqueue, installs the
 * txeof handler and tasks for either the if_start or the if_transmit
 * flavor, creates the two busdma tags (RNDIS packet message and TX
 * data), gives every descriptor its RNDIS packet message memory and a
 * TX data DMA map, and finally creates the per-ring sysctl nodes when
 * the parent "tx" sysctl tree exists.
 *
 * Returns 0 on success, or the busdma error that stopped the setup.
 *
 * NOTE: on failure only the resources of the descriptor currently being
 * set up are rolled back here; everything created earlier is left in
 * place (presumably for the caller's teardown path -- TODO confirm).
 */
static int
hn_tx_ring_create(struct hn_softc *sc, int id)
{
	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
	device_t dev = sc->hn_dev;
	bus_dma_tag_t parent_dtag;
	int error, i;

	txr->hn_sc = sc;
	txr->hn_tx_idx = id;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
#endif
	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);

	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_INIT(&txr->hn_txlist);
#else
	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
	    M_WAITOK, &txr->hn_tx_lock);
#endif

	/*
	 * Pick the taskqueue: either the vmbus event taskqueue of the CPU
	 * this ring index maps to, or one of the driver's own taskqueues
	 * chosen by ring index modulo hn_tx_taskq_cnt.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
	} else {
		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
	}

#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		txr->hn_txeof = hn_start_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
	} else
#endif
	{
		int br_depth;

		/* if_transmit flavor: needs a per-ring mbuf buf_ring. */
		txr->hn_txeof = hn_xmit_txeof;
		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);

		br_depth = hn_get_txswq_depth(txr);
		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
		    M_WAITOK, &txr->hn_tx_lock);
	}

	txr->hn_direct_tx_size = hn_direct_tx_size;

	/*
	 * Always schedule transmission instead of trying to do direct
	 * transmission.  This one gives the best performance so far.
	 */
	txr->hn_sched_tx = 1;

	parent_dtag = bus_get_dma_tag(dev);

	/* DMA tag for RNDIS packet messages. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    HN_RNDIS_PKT_ALIGN,		/* alignment */
	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_RNDIS_PKT_LEN,		/* maxsize */
	    1,				/* nsegments */
	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_rndis_dtag);
	if (error) {
		device_printf(dev, "failed to create rndis dmatag\n");
		return error;
	}

	/* DMA tag for data. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    1,				/* alignment */
	    HN_TX_DATA_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_TX_DATA_MAXSIZE,		/* maxsize */
	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &txr->hn_tx_data_dtag);
	if (error) {
		device_printf(dev, "failed to create data dmatag\n");
		return error;
	}

	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
		struct hn_txdesc *txd = &txr->hn_txdesc[i];

		txd->txr = txr;
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		STAILQ_INIT(&txd->agg_list);

		/*
		 * Allocate and load RNDIS packet message.
		 */
		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
		    (void **)&txd->rndis_pkt,
		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
		    &txd->rndis_pkt_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate rndis_packet_msg, %d\n", i);
			return error;
		}

		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
		    txd->rndis_pkt_dmap,
		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
		    BUS_DMA_NOWAIT);
		if (error) {
			device_printf(dev,
			    "failed to load rndis_packet_msg, %d\n", i);
			/* Undo this descriptor's allocation only. */
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* DMA map for TX data. */
		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
		    &txd->data_dmap);
		if (error) {
			device_printf(dev,
			    "failed to allocate tx data dmamap\n");
			/* Undo this descriptor's load and allocation. */
			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt_dmap);
			bus_dmamem_free(txr->hn_tx_rndis_dtag,
			    txd->rndis_pkt, txd->rndis_pkt_dmap);
			return error;
		}

		/* All set, put it to list */
		txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
#else
		buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif
	}
	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;

	if (sc->hn_tx_sysctl_tree != NULL) {
		struct sysctl_oid_list *child;
		struct sysctl_ctx_list *ctx;
		char name[16];

		/*
		 * Create per TX ring sysctl tree:
		 * dev.hn.UNIT.tx.RINGID
		 */
		ctx = device_get_sysctl_ctx(dev);
		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);

		snprintf(name, sizeof(name), "%d", id);
		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

		if (txr->hn_tx_sysctl_tree != NULL) {
			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);

#ifdef HN_DEBUG
			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
			    "# of available TX descs");
#endif
			/* "oactive" is only exported for the if_transmit flavor. */
#ifdef HN_IFSTART_SUPPORT
			if (!hn_use_if_start)
#endif
			{
				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
				    CTLFLAG_RD, &txr->hn_oactive, 0,
				    "over active");
			}
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
			    CTLFLAG_RW, &txr->hn_pkts,
			    "# of packets transmitted");
			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
		}
	}

	return 0;
}

/*
 * Release the busdma resources of one TX descriptor: unload and free
 * the RNDIS packet message memory, then destroy the TX data DMA map.
 * The descriptor must have no mbuf installed and must not be DMA
 * mapped (both are asserted).
 */
static void
hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
{
	struct hn_tx_ring *txr = txd->txr;

	KASSERT(txd->m == NULL, ("still has mbuf installed"));
	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));

	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
	    txd->rndis_pkt_dmap);
	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
}

static void
hn_txdesc_gc(struct
hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->refs == 0 || txd->refs == 1,
	    ("invalid txd refs %d", txd->refs));

	/* Aggregated txds will be freed by their aggregating txd. */
	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
		int freed;

		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed, ("can't free txdesc"));
	}
}

/*
 * Tear down one TX ring.
 *
 * Does nothing if the ring has no descriptor array.  Otherwise pending
 * descriptors are garbage collected, the per-descriptor busdma
 * resources and the two DMA tags are released, the descriptor array
 * and the buf_rings are freed and the ring's mutexes are destroyed.
 */
static void
hn_tx_ring_destroy(struct hn_tx_ring *txr)
{
	int i;

	if (txr->hn_txdesc == NULL)
		return;

	/*
	 * NOTE:
	 * Because the freeing of aggregated txds will be deferred
	 * to the aggregating txd, two passes are used here:
	 * - The first pass GCes any pending txds.  This GC is necessary,
	 *   since if the channels are revoked, hypervisor will not
	 *   deliver send-done for all pending txds.
	 * - The second pass frees the busdma stuffs, i.e. after all txds
	 *   were freed.
	 */
	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);

	if (txr->hn_tx_data_dtag != NULL)
		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
	if (txr->hn_tx_rndis_dtag != NULL)
		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);

#ifdef HN_USE_TXDESC_BUFRING
	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
#endif

	free(txr->hn_txdesc, M_DEVBUF);
	txr->hn_txdesc = NULL;

	/* The mbuf buf_ring only exists for the if_transmit flavor. */
	if (txr->hn_mbuf_br != NULL)
		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);

#ifndef HN_USE_TXDESC_BUFRING
	mtx_destroy(&txr->hn_txlist_spin);
#endif
	mtx_destroy(&txr->hn_tx_lock);
}

/*
 * Create the TX side data of the softc.
 *
 * Allocates the chimney sending buffer, allocates 'ring_cnt' TX rings
 * (all of them initially marked in use), creates the "tx" sysctl node,
 * sets up every ring through hn_tx_ring_create() and registers the
 * device level TX statistics/configuration sysctls.
 *
 * Returns 0 on success, ENOMEM if the chimney buffer allocation fails,
 * or the error of the first hn_tx_ring_create() call that failed.
 * Nothing is released here on failure.
 */
static int
hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
{
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	int i;

	/*
	 * Create TXBUF for chimney sending.
	 *
	 * NOTE: It is shared by all channels.
	 */
	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
	if (sc->hn_chim == NULL) {
		device_printf(sc->hn_dev, "allocate txbuf failed\n");
		return (ENOMEM);
	}

	sc->hn_tx_ring_cnt = ring_cnt;
	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;

	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
	    M_DEVBUF, M_WAITOK | M_ZERO);

	ctx = device_get_sysctl_ctx(sc->hn_dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));

	/* Create dev.hn.UNIT.tx sysctl tree */
	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		int error;

		error = hn_tx_ring_create(sc, i);
		if (error)
			return error;
	}

	/*
	 * Device level sysctls.  The handlers below receive the softc and,
	 * where given, the offset of the per-ring field they operate on.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_send_failed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_flush_failed),
	    hn_tx_stat_ulong_sysctl, "LU",
	    "# of packet transmission aggregation flush failure");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring,
	    hn_tx_chimney),
	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
	/* Read from ring 0 only. */
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
	    "# of total TX descs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
	    "Chimney send packet size upper boundary");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
	    hn_tx_conf_int_sysctl, "I",
	    "Size of the packet for direct transmission");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
	    __offsetof(struct hn_tx_ring, hn_sched_tx),
	    hn_tx_conf_int_sysctl, "I",
	    "Always schedule transmission "
	    "instead of doing direct transmission");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
	/* Read from ring 0 only. */
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
	    "Applied packet transmission aggregation size");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pktmax_sysctl, "I",
	    "Applied packet transmission aggregation packets");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_align_sysctl, "I",
	    "Applied packet transmission aggregation alignment");

	return 0;
}

static void
hn_set_chim_size(struct hn_softc *sc, int chim_size)
{
int i;

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
		sc->hn_tx_ring[i].hn_chim_size = chim_size;
}

/*
 * Compute and install ifp->if_hw_tsomax.
 *
 * Does nothing if the interface has neither IFCAP_TSO4 nor IFCAP_TSO6.
 * 'tso_maxlen' is clamped to [hn_ndis_tso_sgmin * mtu, IP_MAXPACKET]
 * and then to hn_ndis_tso_szmax; the Ethernet + VLAN header length is
 * subtracted, and the result is further limited by the VF interface's
 * if_hw_tsomax when hn_xpnt_vf_isready() reports true.
 *
 * The caller must hold the softc lock (HN_LOCK_ASSERT).
 */
static void
hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
{
	struct ifnet *ifp = sc->hn_ifp;
	u_int hw_tsomax;
	int tso_minlen;

	HN_LOCK_ASSERT(sc);

	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
		return;

	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;

	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));

	if (tso_maxlen < tso_minlen)
		tso_maxlen = tso_minlen;
	else if (tso_maxlen > IP_MAXPACKET)
		tso_maxlen = IP_MAXPACKET;
	if (tso_maxlen > sc->hn_ndis_tso_szmax)
		tso_maxlen = sc->hn_ndis_tso_szmax;
	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);

	if (hn_xpnt_vf_isready(sc)) {
		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
	}
	ifp->if_hw_tsomax = hw_tsomax;
	if (bootverbose)
		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
}

/*
 * Apply capability dependent settings to all TX rings: the chimney
 * size (the hn_tx_chimney_size tunable wins when it is positive and
 * below hn_chim_szmax), the checksum assist mask derived from
 * sc->hn_caps, and the HASHVAL TX flag when HN_CAP_HASHVAL is set.
 */
static void
hn_fixup_tx_data(struct hn_softc *sc)
{
	uint64_t csum_assist;
	int i;

	hn_set_chim_size(sc, sc->hn_chim_szmax);
	if (hn_tx_chimney_size > 0 &&
	    hn_tx_chimney_size < sc->hn_chim_szmax)
		hn_set_chim_size(sc, hn_tx_chimney_size);

	csum_assist = 0;
	if (sc->hn_caps & HN_CAP_IPCS)
		csum_assist |= CSUM_IP;
	if (sc->hn_caps & HN_CAP_TCP4CS)
		csum_assist |= CSUM_IP_TCP;
	/* UDP checksum offload additionally depends on the enable knobs. */
	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
		csum_assist |= CSUM_IP_UDP;
	if (sc->hn_caps & HN_CAP_TCP6CS)
		csum_assist |= CSUM_IP6_TCP;
	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
		csum_assist |= CSUM_IP6_UDP;
	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;

	if (sc->hn_caps & HN_CAP_HASHVAL) {
		/*
		 * Support HASHVAL pktinfo on TX path.
		 */
		if (bootverbose)
			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
+	}
+}
+
/*
 * Set HN_RX_FLAG_UDP_HASH on every RX ring when the device reports
 * HN_CAP_UDPHASH.
 */
+static void
+hn_fixup_rx_data(struct hn_softc *sc)
+{
+
+	if (sc->hn_caps & HN_CAP_UDPHASH) {
+		int i;
+
+		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
	}
}

/*
 * Release the TX side data of the softc: the chimney sending buffer
 * (unless HN_FLAG_CHIM_REF is set, in which case it is only reported),
 * every TX ring, and the ring array; both ring counters are reset.
 */
static void
hn_destroy_tx_data(struct hn_softc *sc)
{
	int i;

	if (sc->hn_chim != NULL) {
		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
		} else {
			/* NOTE(review): this message has no trailing newline. */
			device_printf(sc->hn_dev,
			    "chimney sending buffer is referenced");
		}
		sc->hn_chim = NULL;
	}

	if (sc->hn_tx_ring_cnt == 0)
		return;

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);

	free(sc->hn_tx_ring, M_DEVBUF);
	sc->hn_tx_ring = NULL;

	sc->hn_tx_ring_cnt = 0;
	sc->hn_tx_ring_inuse = 0;
}

#ifdef HN_IFSTART_SUPPORT

/*
 * Taskqueue handler for the if_start flavor: runs hn_start_locked()
 * without a packet length limit while holding the ring's TX lock.
 */
static void
hn_start_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	hn_start_locked(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}

/*
 * Drain ifp->if_snd onto the first TX ring; the ring's TX lock must be
 * held.  When 'len' is positive, a packet longer than 'len' is put back
 * and 1 is returned so that the caller dispatches the remaining work to
 * the TX taskqueue; otherwise 0 is returned.
 */
static int
hn_start_locked(struct hn_tx_ring *txr, int len)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;
	int sched = 0;

	KASSERT(hn_use_if_start,
	    ("hn_start_locked is called, when if_start is disabled"));
	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	if (__predict_false(txr->hn_suspended))
		return (0);

	/* Only proceed when RUNNING is set and OACTIVE is clear. */
	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING)
		return (0);

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		struct hn_txdesc *txd;
		struct mbuf *m_head;
		int error;

		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
		if (m_head == NULL)
			break;

		if (len > 0 && m_head->m_pkthdr.len > len) {
			/*
			 * This sending could be time consuming; let callers
			 * dispatch this packet sending (and sending of any
			 * following up packets) to tx
taskqueue. */
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			sched = 1;
			break;
		}

#if defined(INET6) || defined(INET)
		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
			m_head = hn_tso_fixup(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		} else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
			m_head = hn_set_hlen(m_head);
			if (__predict_false(m_head == NULL)) {
				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
				continue;
			}
		}
#endif

		txd = hn_txdesc_get(txr);
		if (txd == NULL) {
			/* Out of descriptors: requeue and mark OACTIVE. */
			txr->hn_no_txdescs++;
			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
			break;
		}

		error = hn_encap(ifp, txr, txd, &m_head);
		if (error) {
			/* Both txd and m_head are freed */
			KASSERT(txr->hn_agg_txd == NULL,
			    ("encap failed w/ pending aggregating txdesc"));
			continue;
		}

		if (txr->hn_agg_pktleft == 0) {
			if (txr->hn_agg_txd != NULL) {
				KASSERT(m_head == NULL,
				    ("pending mbuf for aggregating txdesc"));
				error = hn_flush_txagg(ifp, txr);
				if (__predict_false(error)) {
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			} else {
				KASSERT(m_head != NULL, ("mbuf was freed"));
				error = hn_txpkt(ifp, txr, txd);
				if (__predict_false(error)) {
					/* txd is freed, but m_head is not */
					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
					atomic_set_int(&ifp->if_drv_flags,
					    IFF_DRV_OACTIVE);
					break;
				}
			}
		}
#ifdef INVARIANTS
		else {
			KASSERT(txr->hn_agg_txd != NULL,
			    ("no aggregating txdesc"));
			KASSERT(m_head == NULL,
			    ("pending mbuf for aggregating txdesc"));
		}
#endif
	}

	/* Flush pending aggregated transmission. */
	if (txr->hn_agg_txd != NULL)
		hn_flush_txagg(ifp, txr);
	return (sched);
}

/*
 * if_start entry point.  Always works on the first TX ring: when the
 * ring is configured to always schedule (hn_sched_tx), or the TX lock
 * cannot be taken without blocking, or hn_start_locked() asks for it,
 * the work is handed to the ring's TX task; otherwise the packets are
 * sent directly from the caller's context.
 */
static void
hn_start(struct ifnet *ifp)
{
	struct hn_softc *sc = ifp->if_softc;
	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];

	if (txr->hn_sched_tx)
		goto do_sched;

	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (!sched)
			return;
	}
do_sched:
	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
}

/*
 * Taskqueue handler run after TX completion for the if_start flavor:
 * clears IFF_DRV_OACTIVE and restarts transmission, both under the
 * ring's TX lock.
 */
static void
hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
	hn_start_locked(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}

/*
 * TX completion hook for the if_start flavor (installed as
 * txr->hn_txeof by hn_tx_ring_create()).
 */
static void
hn_start_txeof(struct hn_tx_ring *txr)
{
	struct hn_softc *sc = txr->hn_sc;
	struct ifnet *ifp = sc->hn_ifp;

	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));

	if (txr->hn_sched_tx)
		goto do_sched;

	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (sched) {
			taskqueue_enqueue(txr->hn_tx_taskq,
			    &txr->hn_tx_task);
		}
	} else {
do_sched:
		/*
		 * Release the OACTIVE earlier, with the hope, that
		 * others could catch up.  The task will clear the
		 * flag again with the hn_tx_lock to avoid possible
		 * races.
*/ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } #endif /* HN_IFSTART_SUPPORT */ static int hn_xmit(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; struct mbuf *m_head; int sched = 0; mtx_assert(&txr->hn_tx_lock, MA_OWNED); #ifdef HN_IFSTART_SUPPORT KASSERT(hn_use_if_start == 0, ("hn_xmit is called, when if_start is enabled")); #endif KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); if (__predict_false(txr->hn_suspended)) return (0); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) return (0); while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { struct hn_txdesc *txd; int error; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); sched = 1; break; } txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } error = hn_encap(ifp, txr, txd, &m_head); if (error) { /* Both txd and m_head are freed; discard */ KASSERT(txr->hn_agg_txd == NULL, ("encap failed w/ pending aggregating txdesc")); drbr_advance(ifp, txr->hn_mbuf_br); continue; } if (txr->hn_agg_pktleft == 0) { if (txr->hn_agg_txd != NULL) { KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); error = hn_flush_txagg(ifp, txr); if (__predict_false(error)) { txr->hn_oactive = 1; break; } } else { KASSERT(m_head != NULL, ("mbuf was freed")); error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } } } #ifdef INVARIANTS else { KASSERT(txr->hn_agg_txd != NULL, ("no aggregating txdesc")); KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); } 
#endif /* Sent */ drbr_advance(ifp, txr->hn_mbuf_br); } /* Flush pending aggerated transmission. */ if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); return (sched); } static int hn_transmit(struct ifnet *ifp, struct mbuf *m) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr; int error, idx = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { struct rm_priotracker pt; rm_rlock(&sc->hn_vf_lock, &pt); if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { struct mbuf *m_bpf = NULL; int obytes, omcast; obytes = m->m_pkthdr.len; if (m->m_flags & M_MCAST) omcast = 1; if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { if (bpf_peers_present(ifp->if_bpf)) { m_bpf = m_copypacket(m, M_NOWAIT); if (m_bpf == NULL) { /* * Failed to grab a shallow * copy; tap now. */ ETHER_BPF_MTAP(ifp, m); } } } else { ETHER_BPF_MTAP(ifp, m); } error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); rm_runlock(&sc->hn_vf_lock, &pt); if (m_bpf != NULL) { if (!error) ETHER_BPF_MTAP(ifp, m_bpf); m_freem(m_bpf); } if (error == ENOBUFS) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); } else if (error) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); if (omcast) { if_inc_counter(ifp, IFCOUNTER_OMCASTS, omcast); } } return (error); } rm_runlock(&sc->hn_vf_lock, &pt); } #if defined(INET6) || defined(INET) /* * Perform TSO packet header fixup or get l2/l3 header length now, * since packet headers should be cache-hot. 
*/ if (m->m_pkthdr.csum_flags & CSUM_TSO) { m = hn_tso_fixup(m); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return EIO; } } else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { m = hn_set_hlen(m); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return EIO; } } #endif /* * Select the TX ring based on flowid */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { #ifdef RSS uint32_t bid; if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), &bid) == 0) idx = bid % sc->hn_tx_ring_inuse; else #endif { #if defined(INET6) || defined(INET) int tcpsyn = 0; if (m->m_pkthdr.len < 128 && (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) && (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { m = hn_check_tcpsyn(m, &tcpsyn); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (EIO); } } #else const int tcpsyn = 0; #endif if (tcpsyn) idx = 0; else idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; } } txr = &sc->hn_tx_ring[idx]; error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); if (error) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); return error; } if (txr->hn_oactive) return 0; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return 0; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); return 0; } static void hn_tx_ring_qflush(struct hn_tx_ring *txr) { struct mbuf *m; mtx_lock(&txr->hn_tx_lock); while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) m_freem(m); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_qflush(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; struct rm_priotracker pt; int i; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); if_qflush(ifp); rm_rlock(&sc->hn_vf_lock, &pt); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 
sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
	rm_runlock(&sc->hn_vf_lock, &pt);
}

/*
 * TX completion hook for the if_transmit flavor (installed as
 * txr->hn_txeof by hn_tx_ring_create()).  Clears the ring's oactive
 * mark and restarts transmission, either directly under the TX lock or
 * through the txeof task.
 */
static void
hn_xmit_txeof(struct hn_tx_ring *txr)
{

	if (txr->hn_sched_tx)
		goto do_sched;

	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		txr->hn_oactive = 0;
		sched = hn_xmit(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (sched) {
			taskqueue_enqueue(txr->hn_tx_taskq,
			    &txr->hn_tx_task);
		}
	} else {
do_sched:
		/*
		 * Release the oactive earlier, with the hope, that
		 * others could catch up.  The task will clear the
		 * oactive again with the hn_tx_lock to avoid possible
		 * races.
		 */
		txr->hn_oactive = 0;
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

/*
 * Taskqueue handler for the if_transmit flavor: runs hn_xmit() without
 * a packet length limit while holding the ring's TX lock.
 */
static void
hn_xmit_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	hn_xmit(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}

/*
 * Taskqueue handler run after TX completion for the if_transmit
 * flavor: clears the ring's oactive mark and restarts transmission,
 * both under the ring's TX lock.
 */
static void
hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	txr->hn_oactive = 0;
	hn_xmit(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}

/*
 * Link 'chan' to the RX ring (and, if one is in use at that index, the
 * TX ring) selected by the channel's sub-index, bind the channel to a
 * CPU and open it on the RX ring's bufring with hn_chan_callback as the
 * channel callback.
 *
 * Returns 0 on success or the vmbus_chan_open_br() error.  On EISCONN
 * the RX ring is additionally marked HN_RX_FLAG_BR_REF.  The ATTACHED
 * flags set here are not cleared on failure.
 */
static int
hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct vmbus_chan_br cbr;
	struct hn_rx_ring *rxr;
	struct hn_tx_ring *txr = NULL;
	int idx, error;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Link this channel to RX/TX ring.
	 */
	/* NOTE(review): the message says "> 0" but index 0 is accepted. */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should > 0 && < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
	    ("RX ring %d already attached", idx));
	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
	rxr->hn_chan = chan;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
		    idx, vmbus_chan_id(chan));
	}

	/* There may be fewer TX rings in use than RX rings. */
	if (idx < sc->hn_tx_ring_inuse) {
		txr = &sc->hn_tx_ring[idx];

		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
		    ("TX ring %d already attached", idx));
		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;

		txr->hn_chan = chan;
		if (bootverbose) {
			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
			    idx, vmbus_chan_id(chan));
		}
	}

	/* Bind this channel to a proper CPU. */
	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));

	/*
	 * Open this channel
	 */
	cbr.cbr = rxr->hn_br;
	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
	cbr.cbr_txsz = HN_TXBR_SIZE;
	cbr.cbr_rxsz = HN_RXBR_SIZE;
	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
	if (error) {
		if (error == EISCONN) {
			if_printf(sc->hn_ifp, "bufring is connected after "
			    "chan%u open failure\n", vmbus_chan_id(chan));
			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
		} else {
			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
			    vmbus_chan_id(chan), error);
		}
	}
	return (error);
}

static void
hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct hn_rx_ring *rxr;
	int idx, error;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Link this channel to RX/TX ring.
*/
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should > 0 && < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
	    ("RX ring %d is not attached", idx));
	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;

	if (idx < sc->hn_tx_ring_inuse) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];

		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
		    ("TX ring %d is not attached attached", idx));
		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
	}

	/*
	 * Close this channel.
	 *
	 * NOTE:
	 * Channel closing does _not_ destroy the target channel.
	 */
	error = vmbus_chan_close_direct(chan);
	if (error == EISCONN) {
		/* Remember that the bufring is still referenced. */
		if_printf(sc->hn_ifp, "chan%u bufring is connected "
		    "after being closed\n", vmbus_chan_id(chan));
		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
	} else if (error) {
		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
		    vmbus_chan_id(chan), error);
	}
}

/*
 * Attach all sub-channels (hn_rx_ring_inuse - 1 of them) of the primary
 * channel.  Every sub-channel is tried even after a failure.
 *
 * Returns 0 if all attachments succeeded, otherwise the error of the
 * last sub-channel that failed to attach.
 */
static int
hn_attach_subchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i, error = 0;

	KASSERT(subchan_cnt > 0, ("no sub-channels"));

	/* Attach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i) {
		int error1;

		error1 = hn_chan_attach(sc, subchans[i]);
		if (error1) {
			error = error1;
			/* Move on; all channels will be detached later. */
		}
	}
	vmbus_subchan_rel(subchans, subchan_cnt);

	if (error) {
		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
	} else {
		if (bootverbose) {
			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
			    subchan_cnt);
		}
	}
	return (error);
}

/*
 * Detach every channel in use: the sub-channels first (if any), then
 * the primary channel, and finally wait for the sub-channels to be
 * destroyed.  INVARIANTS kernels then assert that no RX/TX ring is
 * still marked attached.
 */
static void
hn_detach_allchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i;

	if (subchan_cnt == 0)
		goto back;

	/* Detach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i)
		hn_chan_detach(sc, subchans[i]);
	vmbus_subchan_rel(subchans, subchan_cnt);

back:
	/*
	 * Detach the primary channel, _after_ all sub-channels
	 * are detached.
	 */
	hn_chan_detach(sc, sc->hn_prichan);

	/* Wait for sub-channels to be destroyed, if any. */
	vmbus_subchan_drain(sc->hn_prichan);

#ifdef INVARIANTS
	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
		    HN_RX_FLAG_ATTACHED) == 0,
		    ("%dth RX ring is still attached", i));
	}
	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
		    HN_TX_FLAG_ATTACHED) == 0,
		    ("%dth TX ring is still attached", i));
	}
#endif
}

static int
hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
{
	struct vmbus_channel **subchans;
	int nchan, rxr_cnt, error;

	nchan = *nsubch + 1;
	if (nchan == 1) {
		/*
		 * Multiple RX/TX rings are not requested.
		 */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
	 * table entries.
	 */
	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
	if (error) {
		/* No RSS; this is benign. */
		*nsubch = 0;
		return (0);
	}
	if (bootverbose) {
		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
		    rxr_cnt, nchan);
	}

	if (nchan > rxr_cnt)
		nchan = rxr_cnt;
	if (nchan == 1) {
		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
		*nsubch = 0;
		return (0);
	}

	/*
	 * Allocate sub-channels from NVS.
	 */
	*nsubch = nchan - 1;
	error = hn_nvs_alloc_subchans(sc, nsubch);
	if (error || *nsubch == 0) {
		/* Failed to allocate sub-channels. */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Wait for all sub-channels to become ready before moving on.
*/ subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); vmbus_subchan_rel(subchans, *nsubch); return (0); } static bool hn_synth_attachable(const struct hn_softc *sc) { int i; if (sc->hn_flags & HN_FLAG_ERRORS) return (false); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) return (false); } return (true); } /* * Make sure that the RX filter is zero after the successful * RNDIS initialization. * * NOTE: * Under certain conditions on certain versions of Hyper-V, * the RNDIS rxfilter is _not_ zero on the hypervisor side * after the successful RNDIS initialization, which breaks * the assumption of any following code (well, it breaks the * RNDIS API contract actually). Clear the RNDIS rxfilter * explicitly, drain packets sneaking through, and drain the * interrupt taskqueues scheduled due to the stealth packets. */ static void hn_rndis_init_fixat(struct hn_softc *sc, int nchan) { hn_disable_rx(sc); hn_drain_rxtx(sc, nchan); } static int hn_synth_attach(struct hn_softc *sc, int mtu) { #define ATTACHED_NVS 0x0002 #define ATTACHED_RNDIS 0x0004 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int error, nsubch, nchan = 1, i, rndis_inited; uint32_t old_caps, attached = 0; KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, ("synthetic parts were attached")); if (!hn_synth_attachable(sc)) return (ENXIO); /* Save capabilities for later verification. */ old_caps = sc->hn_caps; sc->hn_caps = 0; /* Clear RSS stuffs. */ sc->hn_rss_ind_size = 0; sc->hn_rss_hash = 0; sc->hn_rss_hcap = 0; /* * Attach the primary channel _before_ attaching NVS and RNDIS. */ error = hn_chan_attach(sc, sc->hn_prichan); if (error) goto failed; /* * Attach NVS. */ error = hn_nvs_attach(sc, mtu); if (error) goto failed; attached |= ATTACHED_NVS; /* * Attach RNDIS _after_ NVS is attached. 
*/ error = hn_rndis_attach(sc, mtu, &rndis_inited); if (rndis_inited) attached |= ATTACHED_RNDIS; if (error) goto failed; /* * Make sure capabilities are not changed. */ if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", old_caps, sc->hn_caps); error = ENXIO; goto failed; } /* * Allocate sub-channels for multi-TX/RX rings. * * NOTE: * The # of RX rings that can be used is equivalent to the # of * channels to be requested. */ nsubch = sc->hn_rx_ring_cnt - 1; error = hn_synth_alloc_subchans(sc, &nsubch); if (error) goto failed; /* NOTE: _Full_ synthetic parts detach is required now. */ sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; /* * Set the # of TX/RX rings that could be used according to * the # of channels that NVS offered. */ nchan = nsubch + 1; hn_set_ring_inuse(sc, nchan); if (nchan == 1) { /* Only the primary channel can be used; done */ goto back; } /* * Attach the sub-channels. * * NOTE: hn_set_ring_inuse() _must_ have been called. */ error = hn_attach_subchans(sc); if (error) goto failed; /* * Configure RSS key and indirect table _after_ all sub-channels * are attached. */ if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { /* * RSS key is not set yet; set it to the default RSS key. */ if (bootverbose) if_printf(sc->hn_ifp, "setup default RSS key\n"); #ifdef RSS rss_getkey(rss->rss_key); #else memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); #endif sc->hn_flags |= HN_FLAG_HAS_RSSKEY; } if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { /* * RSS indirect table is not set yet; set it up in round- * robin fashion. 
*/ if (bootverbose) { if_printf(sc->hn_ifp, "setup default RSS indirect " "table\n"); } for (i = 0; i < NDIS_HASH_INDCNT; ++i) { uint32_t subidx; #ifdef RSS subidx = rss_get_indirection_to_bucket(i); #else subidx = i; #endif rss->rss_ind[i] = subidx % nchan; } sc->hn_flags |= HN_FLAG_HAS_RSSIND; } else { /* * # of usable channels may be changed, so we have to * make sure that all entries in RSS indirect table * are valid. * * NOTE: hn_set_ring_inuse() _must_ have been called. */ hn_rss_ind_fixup(sc); } sc->hn_rss_hash = sc->hn_rss_hcap; if ((sc->hn_flags & HN_FLAG_RXVF) || (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { /* NOTE: Don't reconfigure RSS; will do immediately. */ hn_vf_rss_fixup(sc, false); } error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) goto failed; back: /* * Fixup transmission aggregation setup. */ hn_set_txagg(sc); hn_rndis_init_fixat(sc, nchan); return (0); /* * Error unwind: once HN_FLAG_SYNTH_ATTACHED is set the full detach is * used; otherwise only the parts recorded in 'attached' are detached * and the saved capabilities are restored. */ failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { hn_rndis_init_fixat(sc, nchan); hn_synth_detach(sc); } else { if (attached & ATTACHED_RNDIS) { hn_rndis_init_fixat(sc, nchan); hn_rndis_detach(sc); } if (attached & ATTACHED_NVS) hn_nvs_detach(sc); hn_chan_detach(sc, sc->hn_prichan); /* Restore old capabilities. */ sc->hn_caps = old_caps; } return (error); #undef ATTACHED_RNDIS #undef ATTACHED_NVS } /* * Detach the synthetic parts: RNDIS first, then NVS, then all of * the channels. * * NOTE: * The interface must have been suspended through hn_suspend() before * this function gets called. */ static void hn_synth_detach(struct hn_softc *sc) { KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Detach the RNDIS first. */ hn_rndis_detach(sc); /* Detach NVS. */ hn_nvs_detach(sc); /* Detach all of the channels.
*/ hn_detach_allchans(sc); sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; } static void hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) { KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, ("invalid ring count %d", ring_cnt)); if (sc->hn_tx_ring_cnt > ring_cnt) sc->hn_tx_ring_inuse = ring_cnt; else sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_rx_ring_inuse = ring_cnt; #ifdef RSS if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, rss_getnumbuckets()); } #endif if (bootverbose) { if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); } } static void hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) { /* * NOTE: * The TX bufring will not be drained by the hypervisor, * if the primary channel is revoked. */ while (!vmbus_chan_rx_empty(chan) || (!vmbus_chan_is_revoked(sc->hn_prichan) && !vmbus_chan_tx_empty(chan))) pause("waitch", 1); vmbus_chan_intr_drain(chan); } static void hn_disable_rx(struct hn_softc *sc) { /* * Disable RX by clearing RX filter forcefully. */ sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ /* * Give RNDIS enough time to flush all pending data packets. */ pause("waitrx", (200 * hz) / 1000); } /* * NOTE: * RX/TX _must_ have been suspended/disabled, before this function * is called. */ static void hn_drain_rxtx(struct hn_softc *sc, int nchan) { struct vmbus_channel **subch = NULL; int nsubch; /* * Drain RX/TX bufrings and interrupts. */ nsubch = nchan - 1; if (nsubch > 0) subch = vmbus_subchan_get(sc->hn_prichan, nsubch); if (subch != NULL) { int i; for (i = 0; i < nsubch; ++i) hn_chan_drain(sc, subch[i]); } hn_chan_drain(sc, sc->hn_prichan); if (subch != NULL) vmbus_subchan_rel(subch, nsubch); } static void hn_suspend_data(struct hn_softc *sc) { struct hn_tx_ring *txr; int i; HN_LOCK_ASSERT(sc); /* * Suspend TX. 
*/ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 1; mtx_unlock(&txr->hn_tx_lock); /* No one is able to send more packets now. */ /* * Wait for all pending sends to finish. * * NOTE: * We will _not_ receive all pending send-done, if the * primary channel is revoked. */ while (hn_tx_ring_pending(txr) && !vmbus_chan_is_revoked(sc->hn_prichan)) pause("hnwtx", 1 /* 1 tick */); } /* * Disable RX. */ hn_disable_rx(sc); /* * Drain RX/TX. */ hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); /* * Drain any pending TX tasks. * * NOTE: * The above hn_drain_rxtx() can dispatch TX tasks, so the TX * tasks will have to be drained _after_ the above hn_drain_rxtx(). */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); } } /* * Task body used by hn_suspend_mgmt(): clear hn_mgmt_taskq of the * softc passed in 'xsc'. */ static void hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) { ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; } /* * Suspend the management path: clear hn_mgmt_taskq through a task run * on the primary channel, then drain the management tasks queued on * hn_mgmt_taskq0. */ static void hn_suspend_mgmt(struct hn_softc *sc) { struct task task; HN_LOCK_ASSERT(sc); /* * Make sure that hn_mgmt_taskq0 can no longer be accessed * through hn_mgmt_taskq. */ TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); vmbus_chan_run_task(sc->hn_prichan, &task); /* * Make sure that all pending management tasks are completed. */ taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); taskqueue_drain_all(sc->hn_mgmt_taskq0); } /* * Suspend the interface: stop polling, suspend the data path if it is * active, then suspend the management path. */ static void hn_suspend(struct hn_softc *sc) { /* Disable polling. */ hn_polling(sc, 0); /* * If the non-transparent mode VF is activated, the synthetic * device is receiving packets, so the data path of the * synthetic device must be suspended.
*/ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || (sc->hn_flags & HN_FLAG_RXVF)) hn_suspend_data(sc); hn_suspend_mgmt(sc); } /* * Clear the suspended flag, under the TX lock, on the first * 'tx_ring_cnt' TX rings. */ static void hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) { int i; KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, ("invalid TX ring count %d", tx_ring_cnt)); for (i = 0; i < tx_ring_cnt; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 0; mtx_unlock(&txr->hn_tx_lock); } } /* * Resume the data path: reconfigure the RX filter, unsuspend all TX * rings, flush the queues of the TX rings that are not in use (skipped * when hn_use_if_start is set) and schedule the txeof task on the rings * that are in use. */ static void hn_resume_data(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* * Re-enable RX. */ hn_rxfilter_config(sc); /* * Make sure to clear suspend status on "all" TX rings, * since hn_tx_ring_inuse can be changed after * hn_suspend_data(). */ hn_resume_tx(sc, sc->hn_tx_ring_cnt); #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { /* * Flush unused drbrs, since hn_tx_ring_inuse may be * reduced. */ for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); } /* * Kick start TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; /* * Use txeof task, so that any pending oactive can be * cleared properly. */ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } /* * Resume the management path: restore hn_mgmt_taskq from * hn_mgmt_taskq0, then either restart network change detection or * update the link status. */ static void hn_resume_mgmt(struct hn_softc *sc) { sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; /* * Kick off network change detection, if it was pending. * If no network change was pending, start link status * checks, which is more lightweight than network change * detection. */ if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) hn_change_network(sc); else hn_update_link_status(sc); } /* * Resume the interface: the data path, the management path and * polling, each only when its condition below holds. */ static void hn_resume(struct hn_softc *sc) { /* * If the non-transparent mode VF is activated, the synthetic * device has to receive packets, so the data path of the * synthetic device must be resumed. */ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || (sc->hn_flags & HN_FLAG_RXVF)) hn_resume_data(sc); /* * Don't resume link status change if VF is attached/activated.
* - In the non-transparent VF mode, the synthetic device marks * link down until the VF is deactivated; i.e. VF is down. * - In transparent VF mode, VF's media status is used until * the VF is detached. */ if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) hn_resume_mgmt(sc); /* * Re-enable polling if this interface is running and * the polling is requested. */ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) hn_polling(sc, sc->hn_pollhz); } static void hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) { const struct rndis_status_msg *msg; int ofs; if (dlen < sizeof(*msg)) { if_printf(sc->hn_ifp, "invalid RNDIS status\n"); return; } msg = data; switch (msg->rm_status) { case RNDIS_STATUS_MEDIA_CONNECT: case RNDIS_STATUS_MEDIA_DISCONNECT: hn_update_link_status(sc); break; case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: case RNDIS_STATUS_LINK_SPEED_CHANGE: /* Not really useful; ignore. */ break; case RNDIS_STATUS_NETWORK_CHANGE: ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); if (dlen < ofs + msg->rm_stbuflen || msg->rm_stbuflen < sizeof(uint32_t)) { if_printf(sc->hn_ifp, "network changed\n"); } else { uint32_t change; memcpy(&change, ((const uint8_t *)msg) + ofs, sizeof(change)); if_printf(sc->hn_ifp, "network changed, change %u\n", change); } hn_change_network(sc); break; default: if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", msg->rm_status); break; } } static int hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) { const struct rndis_pktinfo *pi = info_data; uint32_t mask = 0; while (info_dlen != 0) { const void *data; uint32_t dlen; if (__predict_false(info_dlen < sizeof(*pi))) return (EINVAL); if (__predict_false(info_dlen < pi->rm_size)) return (EINVAL); info_dlen -= pi->rm_size; if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) return (EINVAL); if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) return (EINVAL); dlen = pi->rm_size - 
pi->rm_pktinfooffset; data = pi->rm_data; switch (pi->rm_type) { case NDIS_PKTINFO_TYPE_VLAN: if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) return (EINVAL); info->vlan_info = *((const uint32_t *)data); mask |= HN_RXINFO_VLAN; break; case NDIS_PKTINFO_TYPE_CSUM: if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) return (EINVAL); info->csum_info = *((const uint32_t *)data); mask |= HN_RXINFO_CSUM; break; case HN_NDIS_PKTINFO_TYPE_HASHVAL: if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) return (EINVAL); info->hash_value = *((const uint32_t *)data); mask |= HN_RXINFO_HASHVAL; break; case HN_NDIS_PKTINFO_TYPE_HASHINF: if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) return (EINVAL); info->hash_info = *((const uint32_t *)data); mask |= HN_RXINFO_HASHINF; break; default: goto next; } if (mask == HN_RXINFO_ALL) { /* All found; done */ break; } next: pi = (const struct rndis_pktinfo *) ((const uint8_t *)pi + pi->rm_size); } /* * Final fixup. * - If there is no hash value, invalidate the hash info. */ if ((mask & HN_RXINFO_HASHVAL) == 0) info->hash_info = HN_NDIS_HASH_INFO_INVALID; return (0); } static __inline bool hn_rndis_check_overlap(int off, int len, int check_off, int check_len) { if (off < check_off) { if (__predict_true(off + len <= check_off)) return (false); } else if (off > check_off) { if (__predict_true(check_off + check_len <= off)) return (false); } return (true); } static void hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) { const struct rndis_packet_msg *pkt; struct hn_rxinfo info; int data_off, pktinfo_off, data_len, pktinfo_len; /* * Check length. 
*/ if (__predict_false(dlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); return; } pkt = data; if (__predict_false(dlen < pkt->rm_len)) { if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " "dlen %d, msglen %u\n", dlen, pkt->rm_len); return; } /* NOTE(review): the three lengths are added before the comparison; if they are 32-bit fields (they are printed with %u below) the sum could wrap -- confirm. */ if (__predict_false(pkt->rm_len < pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " "msglen %u, data %u, oob %u, pktinfo %u\n", pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, pkt->rm_pktinfolen); return; } if (__predict_false(pkt->rm_datalen == 0)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); return; } /* * Check offsets. */ #define IS_OFFSET_INVALID(ofs) \ ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) /* XXX Hyper-V does not meet data offset alignment requirement */ if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "data offset %u\n", pkt->rm_dataoffset); return; } if (__predict_false(pkt->rm_oobdataoffset > 0 && IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob offset %u\n", pkt->rm_oobdataoffset); return; } if (__predict_true(pkt->rm_pktinfooffset > 0) && __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo offset %u\n", pkt->rm_pktinfooffset); return; } #undef IS_OFFSET_INVALID data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); data_len = pkt->rm_datalen; pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); pktinfo_len = pkt->rm_pktinfolen; /* * Check OOB coverage.
*/ if (__predict_false(pkt->rm_oobdatalen != 0)) { int oob_off, oob_len; if_printf(rxr->hn_ifp, "got oobdata\n"); oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); oob_len = pkt->rm_oobdatalen; if (__predict_false(oob_off + oob_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overflow, msglen %u, oob abs %d len %d\n", pkt->rm_len, oob_off, oob_len); return; } /* * Check against data. */ if (hn_rndis_check_overlap(oob_off, oob_len, data_off, data_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overlaps data, oob abs %d len %d, " "data abs %d len %d\n", oob_off, oob_len, data_off, data_len); return; } /* * Check against pktinfo. */ if (pktinfo_len != 0 && hn_rndis_check_overlap(oob_off, oob_len, pktinfo_off, pktinfo_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overlaps pktinfo, oob abs %d len %d, " "pktinfo abs %d len %d\n", oob_off, oob_len, pktinfo_off, pktinfo_len); return; } } /* * Check per-packet-info coverage and find useful per-packet-info. */ info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; info.hash_info = HN_NDIS_HASH_INFO_INVALID; if (__predict_true(pktinfo_len != 0)) { bool overlap; int error; if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo overflow, msglen %u, " "pktinfo abs %d len %d\n", pkt->rm_len, pktinfo_off, pktinfo_len); return; } /* * Check packet info coverage. */ overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, data_off, data_len); if (__predict_false(overlap)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo overlap data, pktinfo abs %d len %d, " "data abs %d len %d\n", pktinfo_off, pktinfo_len, data_off, data_len); return; } /* * Find useful per-packet-info. 
*/ error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, pktinfo_len, &info); if (__predict_false(error)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " "pktinfo\n"); return; } } if (__predict_false(data_off + data_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "data overflow, msglen %u, data abs %d len %d\n", pkt->rm_len, data_off, data_len); return; } hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); } static __inline void hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) { const struct rndis_msghdr *hdr; if (__predict_false(dlen < sizeof(*hdr))) { if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); return; } hdr = data; if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { /* Hot data path. */ hn_rndis_rx_data(rxr, data, dlen); /* Done! */ return; } if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); else hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); } static void hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) { const struct hn_nvs_hdr *hdr; if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { if_printf(sc->hn_ifp, "invalid nvs notify\n"); return; } hdr = VMBUS_CHANPKT_CONST_DATA(pkt); if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { /* Useless; ignore */ return; } if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); } static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt) { struct hn_nvs_sendctx *sndc; sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), VMBUS_CHANPKT_DATALEN(pkt)); /* * NOTE: * 'sndc' CAN NOT be accessed anymore, since it can be freed by * its callback. 
*/ } static void hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr) { const struct vmbus_chanpkt_rxbuf *pkt; const struct hn_nvs_hdr *nvs_hdr; int count, i, hlen; if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); return; } nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); /* Make sure that this is a RNDIS message. */ if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", nvs_hdr->nvs_type); return; } hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); if (__predict_false(hlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); return; } pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", pkt->cp_rxbuf_id); return; } count = pkt->cp_rxbuf_cnt; if (__predict_false(hlen < __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); return; } /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ for (i = 0; i < count; ++i) { int ofs, len; ofs = pkt->cp_rxbuf[i].rb_ofs; len = pkt->cp_rxbuf[i].rb_len; if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " "ofs %d, len %d\n", i, ofs, len); continue; } hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); } /* * Ack the consumed RXBUF associated w/ this channel packet, * so that this RXBUF can be recycled by the hypervisor. 
*/ hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); } static void hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, uint64_t tid) { struct hn_nvs_rndis_ack ack; int retries, error; ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; ack.nvs_status = HN_NVS_STATUS_OK; retries = 0; again: error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); if (__predict_false(error == EAGAIN)) { /* * NOTE: * This should _not_ happen in real world, since the * consumption of the TX bufring from the TX path is * controlled. */ if (rxr->hn_ack_failed == 0) if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); rxr->hn_ack_failed++; retries++; if (retries < 10) { DELAY(100); goto again; } /* RXBUF leaks! */ if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); } } static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr) { struct hn_rx_ring *rxr = xrxr; struct hn_softc *sc = rxr->hn_ifp->if_softc; for (;;) { struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; int error, pktlen; pktlen = rxr->hn_pktbuf_len; error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); if (__predict_false(error == ENOBUFS)) { void *nbuf; int nlen; /* * Expand channel packet buffer. * * XXX * Use M_WAITOK here, since allocation failure * is fatal. */ nlen = rxr->hn_pktbuf_len * 2; while (nlen < pktlen) nlen *= 2; nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", rxr->hn_pktbuf_len, nlen); free(rxr->hn_pktbuf, M_DEVBUF); rxr->hn_pktbuf = nbuf; rxr->hn_pktbuf_len = nlen; /* Retry! */ continue; } else if (__predict_false(error == EAGAIN)) { /* No more channel packets; done! 
*/ break; } KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); switch (pkt->cph_type) { case VMBUS_CHANPKT_TYPE_COMP: hn_nvs_handle_comp(sc, chan, pkt); break; case VMBUS_CHANPKT_TYPE_RXBUF: hn_nvs_handle_rxbuf(rxr, chan, pkt); break; case VMBUS_CHANPKT_TYPE_INBAND: hn_nvs_handle_notify(sc, pkt); break; default: if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", pkt->cph_type); break; } } hn_chan_rollup(rxr, rxr->hn_txr); } static void hn_sysinit(void *arg __unused) { int i; hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); #ifdef HN_IFSTART_SUPPORT /* * Don't use ifnet.if_start if transparent VF mode is requested; * mainly due to the IFF_DRV_OACTIVE flag. */ if (hn_xpnt_vf && hn_use_if_start) { hn_use_if_start = 0; printf("hn: tranparent VF mode, if_transmit will be used, " "instead of if_start\n"); } #endif if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { printf("hn: invalid transparent VF attach routing " "wait timeout %d, reset to %d\n", hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; } /* * Initialize VF map. */ rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); hn_vfmap_size = HN_VFMAP_SIZE_DEF; hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, M_WAITOK | M_ZERO); /* * Fix the # of TX taskqueues. */ if (hn_tx_taskq_cnt <= 0) hn_tx_taskq_cnt = 1; else if (hn_tx_taskq_cnt > mp_ncpus) hn_tx_taskq_cnt = mp_ncpus; /* * Fix the TX taskqueue mode. 
*/ switch (hn_tx_taskq_mode) { case HN_TX_TASKQ_M_INDEP: case HN_TX_TASKQ_M_GLOBAL: case HN_TX_TASKQ_M_EVTTQ: break; default: hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; break; } if (vm_guest != VM_GUEST_HV) return; if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) return; hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), M_DEVBUF, M_WAITOK); for (i = 0; i < hn_tx_taskq_cnt; ++i) { hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &hn_tx_taskque[i]); taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, "hn tx%d", i); } } SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); static void hn_sysuninit(void *arg __unused) { if (hn_tx_taskque != NULL) { int i; for (i = 0; i < hn_tx_taskq_cnt; ++i) taskqueue_free(hn_tx_taskque[i]); free(hn_tx_taskque, M_DEVBUF); } if (hn_vfmap != NULL) free(hn_vfmap, M_DEVBUF); rm_destroy(&hn_vfmap_lock); counter_u64_free(hn_udpcs_fixup); } SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); Index: projects/runtime-coverage/sys/dev/hyperv/netvsc/if_hnvar.h =================================================================== --- projects/runtime-coverage/sys/dev/hyperv/netvsc/if_hnvar.h (revision 324497) +++ projects/runtime-coverage/sys/dev/hyperv/netvsc/if_hnvar.h (revision 324498) @@ -1,316 +1,318 @@ /*- * Copyright (c) 2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _IF_HNVAR_H_ #define _IF_HNVAR_H_ #define HN_USE_TXDESC_BUFRING #define HN_CHIM_SIZE (15 * 1024 * 1024) #define HN_RXBUF_SIZE (16 * 1024 * 1024) #define HN_RXBUF_SIZE_COMPAT (15 * 1024 * 1024) #define HN_MTU_MAX (65535 - ETHER_ADDR_LEN) #define HN_TXBR_SIZE (128 * PAGE_SIZE) #define HN_RXBR_SIZE (128 * PAGE_SIZE) #define HN_XACT_REQ_PGCNT 2 #define HN_XACT_RESP_PGCNT 2 #define HN_XACT_REQ_SIZE (HN_XACT_REQ_PGCNT * PAGE_SIZE) #define HN_XACT_RESP_SIZE (HN_XACT_RESP_PGCNT * PAGE_SIZE) #define HN_GPACNT_MAX 32 struct hn_txdesc; #ifndef HN_USE_TXDESC_BUFRING SLIST_HEAD(hn_txdesc_list, hn_txdesc); #else struct buf_ring; #endif struct hn_tx_ring; struct hn_rx_ring { struct ifnet *hn_ifp; struct ifnet *hn_rxvf_ifp; /* SR-IOV VF for RX */ struct hn_tx_ring *hn_txr; void *hn_pktbuf; int hn_pktbuf_len; int hn_rx_flags; /* HN_RX_FLAG_ */ uint32_t hn_mbuf_hash; /* NDIS_HASH_ */ uint8_t *hn_rxbuf; /* shadow sc->hn_rxbuf */ int hn_rx_idx; /* Trust csum verification on host side */ int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */ struct lro_ctrl hn_lro; u_long hn_csum_ip; u_long hn_csum_tcp; u_long hn_csum_udp; u_long hn_csum_trusted; u_long hn_lro_tried; u_long hn_small_pkts; u_long hn_pkts; u_long hn_rss_pkts; u_long hn_ack_failed; /* Rarely used 
stuffs */ struct sysctl_oid *hn_rx_sysctl_tree; void *hn_br; /* TX/RX bufring */ struct hyperv_dma hn_br_dma; struct vmbus_channel *hn_chan; } __aligned(CACHE_LINE_SIZE); #define HN_TRUST_HCSUM_IP 0x0001 #define HN_TRUST_HCSUM_TCP 0x0002 #define HN_TRUST_HCSUM_UDP 0x0004 #define HN_RX_FLAG_ATTACHED 0x0001 #define HN_RX_FLAG_BR_REF 0x0002 #define HN_RX_FLAG_XPNT_VF 0x0004 +#define HN_RX_FLAG_UDP_HASH 0x0008 struct hn_tx_ring { #ifndef HN_USE_TXDESC_BUFRING struct mtx hn_txlist_spin; struct hn_txdesc_list hn_txlist; #else struct buf_ring *hn_txdesc_br; #endif int hn_txdesc_cnt; int hn_txdesc_avail; u_short hn_has_txeof; u_short hn_txdone_cnt; int hn_sched_tx; void (*hn_txeof)(struct hn_tx_ring *); struct taskqueue *hn_tx_taskq; struct task hn_tx_task; struct task hn_txeof_task; struct buf_ring *hn_mbuf_br; int hn_oactive; int hn_tx_idx; int hn_tx_flags; struct mtx hn_tx_lock; struct hn_softc *hn_sc; struct vmbus_channel *hn_chan; int hn_direct_tx_size; int hn_chim_size; bus_dma_tag_t hn_tx_data_dtag; uint64_t hn_csum_assist; /* Applied packet transmission aggregation limits. */ int hn_agg_szmax; short hn_agg_pktmax; short hn_agg_align; /* Packet transmission aggregation states. */ struct hn_txdesc *hn_agg_txd; int hn_agg_szleft; short hn_agg_pktleft; struct rndis_packet_msg *hn_agg_prevpkt; /* Temporary stats for each sends. 
*/ int hn_stat_size; short hn_stat_pkts; short hn_stat_mcasts; int (*hn_sendpkt)(struct hn_tx_ring *, struct hn_txdesc *); int hn_suspended; int hn_gpa_cnt; struct vmbus_gpa hn_gpa[HN_GPACNT_MAX]; u_long hn_no_txdescs; u_long hn_send_failed; u_long hn_txdma_failed; u_long hn_tx_collapsed; u_long hn_tx_chimney_tried; u_long hn_tx_chimney; u_long hn_pkts; u_long hn_sends; u_long hn_flush_failed; /* Rarely used stuffs */ struct hn_txdesc *hn_txdesc; bus_dma_tag_t hn_tx_rndis_dtag; struct sysctl_oid *hn_tx_sysctl_tree; } __aligned(CACHE_LINE_SIZE); #define HN_TX_FLAG_ATTACHED 0x0001 #define HN_TX_FLAG_HASHVAL 0x0002 /* support HASHVAL pktinfo */ /* * Device-specific softc structure */ struct hn_softc { struct ifnet *hn_ifp; struct ifmedia hn_media; device_t hn_dev; int hn_if_flags; struct sx hn_lock; struct vmbus_channel *hn_prichan; int hn_rx_ring_cnt; int hn_rx_ring_inuse; struct hn_rx_ring *hn_rx_ring; struct rmlock hn_vf_lock; struct ifnet *hn_vf_ifp; /* SR-IOV VF */ uint32_t hn_xvf_flags; /* transparent VF flags */ int hn_tx_ring_cnt; int hn_tx_ring_inuse; struct hn_tx_ring *hn_tx_ring; uint8_t *hn_chim; u_long *hn_chim_bmap; int hn_chim_bmap_cnt; int hn_chim_cnt; int hn_chim_szmax; int hn_cpu; struct taskqueue **hn_tx_taskqs; struct sysctl_oid *hn_tx_sysctl_tree; struct sysctl_oid *hn_rx_sysctl_tree; struct vmbus_xact_ctx *hn_xact; uint32_t hn_nvs_ver; uint32_t hn_rx_filter; /* Packet transmission aggregation user settings. 
*/ int hn_agg_size; int hn_agg_pkts; struct taskqueue *hn_mgmt_taskq; struct taskqueue *hn_mgmt_taskq0; struct task hn_link_task; struct task hn_netchg_init; struct timeout_task hn_netchg_status; uint32_t hn_link_flags; /* HN_LINK_FLAG_ */ uint32_t hn_caps; /* HN_CAP_ */ uint32_t hn_flags; /* HN_FLAG_ */ u_int hn_pollhz; void *hn_rxbuf; uint32_t hn_rxbuf_gpadl; struct hyperv_dma hn_rxbuf_dma; uint32_t hn_chim_gpadl; struct hyperv_dma hn_chim_dma; uint32_t hn_rndis_rid; uint32_t hn_ndis_ver; int hn_ndis_tso_szmax; int hn_ndis_tso_sgmin; uint32_t hn_rndis_agg_size; uint32_t hn_rndis_agg_pkts; uint32_t hn_rndis_agg_align; int hn_rss_ind_size; uint32_t hn_rss_hash; /* setting, NDIS_HASH_ */ uint32_t hn_rss_hcap; /* caps, NDIS_HASH_ */ struct ndis_rssprm_toeplitz hn_rss; eventhandler_tag hn_ifaddr_evthand; eventhandler_tag hn_ifnet_evthand; eventhandler_tag hn_ifnet_atthand; eventhandler_tag hn_ifnet_dethand; eventhandler_tag hn_ifnet_lnkhand; /* * Transparent VF delayed initialization. */ int hn_vf_rdytick; /* ticks, 0 == ready */ struct taskqueue *hn_vf_taskq; struct timeout_task hn_vf_init; /* * Saved information for VF under transparent mode. 
*/ void (*hn_vf_input) (struct ifnet *, struct mbuf *); int hn_saved_caps; u_int hn_saved_tsomax; u_int hn_saved_tsosegcnt; u_int hn_saved_tsosegsz; }; #define HN_FLAG_RXBUF_CONNECTED 0x0001 #define HN_FLAG_CHIM_CONNECTED 0x0002 #define HN_FLAG_HAS_RSSKEY 0x0004 #define HN_FLAG_HAS_RSSIND 0x0008 #define HN_FLAG_SYNTH_ATTACHED 0x0010 #define HN_FLAG_NO_SLEEPING 0x0020 #define HN_FLAG_RXBUF_REF 0x0040 #define HN_FLAG_CHIM_REF 0x0080 #define HN_FLAG_RXVF 0x0100 #define HN_FLAG_ERRORS (HN_FLAG_RXBUF_REF | HN_FLAG_CHIM_REF) #define HN_XVFFLAG_ENABLED 0x0001 #define HN_XVFFLAG_ACCBPF 0x0002 #define HN_NO_SLEEPING(sc) \ do { \ (sc)->hn_flags |= HN_FLAG_NO_SLEEPING; \ } while (0) #define HN_SLEEPING_OK(sc) \ do { \ (sc)->hn_flags &= ~HN_FLAG_NO_SLEEPING; \ } while (0) #define HN_CAN_SLEEP(sc) \ (((sc)->hn_flags & HN_FLAG_NO_SLEEPING) == 0) #define HN_CAP_VLAN 0x0001 #define HN_CAP_MTU 0x0002 #define HN_CAP_IPCS 0x0004 #define HN_CAP_TCP4CS 0x0008 #define HN_CAP_TCP6CS 0x0010 #define HN_CAP_UDP4CS 0x0020 #define HN_CAP_UDP6CS 0x0040 #define HN_CAP_TSO4 0x0080 #define HN_CAP_TSO6 0x0100 #define HN_CAP_HASHVAL 0x0200 +#define HN_CAP_UDPHASH 0x0400 /* Capability description for use with printf(9) %b identifier. */ #define HN_CAP_BITS \ "\020\1VLAN\2MTU\3IPCS\4TCP4CS\5TCP6CS" \ - "\6UDP4CS\7UDP6CS\10TSO4\11TSO6\12HASHVAL" + "\6UDP4CS\7UDP6CS\10TSO4\11TSO6\12HASHVAL\13UDPHASH" #define HN_LINK_FLAG_LINKUP 0x0001 #define HN_LINK_FLAG_NETCHG 0x0002 #endif /* !_IF_HNVAR_H_ */ Index: projects/runtime-coverage/sys/dev/hyperv/vmbus/hyperv.c =================================================================== --- projects/runtime-coverage/sys/dev/hyperv/vmbus/hyperv.c (revision 324497) +++ projects/runtime-coverage/sys/dev/hyperv/vmbus/hyperv.c (revision 324498) @@ -1,334 +1,337 @@ /*- * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /** * Implements low-level interactions with Hypver-V/Azure */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define HYPERV_FREEBSD_BUILD 0ULL #define HYPERV_FREEBSD_VERSION ((uint64_t)__FreeBSD_version) #define HYPERV_FREEBSD_OSID 0ULL #define MSR_HV_GUESTID_BUILD_FREEBSD \ (HYPERV_FREEBSD_BUILD & MSR_HV_GUESTID_BUILD_MASK) #define MSR_HV_GUESTID_VERSION_FREEBSD \ ((HYPERV_FREEBSD_VERSION << MSR_HV_GUESTID_VERSION_SHIFT) & \ MSR_HV_GUESTID_VERSION_MASK) #define MSR_HV_GUESTID_OSID_FREEBSD \ ((HYPERV_FREEBSD_OSID << MSR_HV_GUESTID_OSID_SHIFT) & \ MSR_HV_GUESTID_OSID_MASK) #define MSR_HV_GUESTID_FREEBSD \ (MSR_HV_GUESTID_BUILD_FREEBSD | \ MSR_HV_GUESTID_VERSION_FREEBSD | \ MSR_HV_GUESTID_OSID_FREEBSD | \ MSR_HV_GUESTID_OSTYPE_FREEBSD) struct hypercall_ctx { void *hc_addr; vm_paddr_t hc_paddr; }; static u_int hyperv_get_timecount(struct timecounter *); static bool hyperv_identify(void); static void hypercall_memfree(void); +u_int hyperv_ver_major; + u_int hyperv_features; u_int hyperv_recommends; static u_int hyperv_pm_features; static u_int hyperv_features3; hyperv_tc64_t hyperv_tc64; static struct timecounter hyperv_timecounter = { .tc_get_timecount = hyperv_get_timecount, .tc_poll_pps = NULL, .tc_counter_mask = 0xffffffff, .tc_frequency = HYPERV_TIMER_FREQ, .tc_name = "Hyper-V", .tc_quality = 2000, .tc_flags = 0, .tc_priv = NULL }; static struct hypercall_ctx hypercall_context; static u_int hyperv_get_timecount(struct timecounter *tc __unused) { return rdmsr(MSR_HV_TIME_REF_COUNT); } static uint64_t hyperv_tc64_rdmsr(void) { return (rdmsr(MSR_HV_TIME_REF_COUNT)); } uint64_t hypercall_post_message(bus_addr_t msg_paddr) { return hypercall_md(hypercall_context.hc_addr, HYPERCALL_POST_MESSAGE, msg_paddr, 0); } uint64_t hypercall_signal_event(bus_addr_t monprm_paddr) { return hypercall_md(hypercall_context.hc_addr, HYPERCALL_SIGNAL_EVENT, monprm_paddr, 0); 
} int hyperv_guid2str(const struct hyperv_guid *guid, char *buf, size_t sz) { const uint8_t *d = guid->hv_guid; return snprintf(buf, sz, "%02x%02x%02x%02x-" "%02x%02x-%02x%02x-%02x%02x-" "%02x%02x%02x%02x%02x%02x", d[3], d[2], d[1], d[0], d[5], d[4], d[7], d[6], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]); } static bool hyperv_identify(void) { u_int regs[4]; unsigned int maxleaf; if (vm_guest != VM_GUEST_HV) return (false); do_cpuid(CPUID_LEAF_HV_MAXLEAF, regs); maxleaf = regs[0]; if (maxleaf < CPUID_LEAF_HV_LIMITS) return (false); do_cpuid(CPUID_LEAF_HV_INTERFACE, regs); if (regs[0] != CPUID_HV_IFACE_HYPERV) return (false); do_cpuid(CPUID_LEAF_HV_FEATURES, regs); if ((regs[0] & CPUID_HV_MSR_HYPERCALL) == 0) { /* * Hyper-V w/o Hypercall is impossible; someone * is faking Hyper-V. */ return (false); } hyperv_features = regs[0]; hyperv_pm_features = regs[2]; hyperv_features3 = regs[3]; do_cpuid(CPUID_LEAF_HV_IDENTITY, regs); + hyperv_ver_major = regs[1] >> 16; printf("Hyper-V Version: %d.%d.%d [SP%d]\n", - regs[1] >> 16, regs[1] & 0xffff, regs[0], regs[2]); + hyperv_ver_major, regs[1] & 0xffff, regs[0], regs[2]); printf(" Features=0x%b\n", hyperv_features, "\020" "\001VPRUNTIME" /* MSR_HV_VP_RUNTIME */ "\002TMREFCNT" /* MSR_HV_TIME_REF_COUNT */ "\003SYNIC" /* MSRs for SynIC */ "\004SYNTM" /* MSRs for SynTimer */ "\005APIC" /* MSR_HV_{EOI,ICR,TPR} */ "\006HYPERCALL" /* MSR_HV_{GUEST_OS_ID,HYPERCALL} */ "\007VPINDEX" /* MSR_HV_VP_INDEX */ "\010RESET" /* MSR_HV_RESET */ "\011STATS" /* MSR_HV_STATS_ */ "\012REFTSC" /* MSR_HV_REFERENCE_TSC */ "\013IDLE" /* MSR_HV_GUEST_IDLE */ "\014TMFREQ" /* MSR_HV_{TSC,APIC}_FREQUENCY */ "\015DEBUG"); /* MSR_HV_SYNTH_DEBUG_ */ printf(" PM Features=0x%b [C%u]\n", (hyperv_pm_features & ~CPUPM_HV_CSTATE_MASK), "\020" "\005C3HPET", /* HPET is required for C3 state */ CPUPM_HV_CSTATE(hyperv_pm_features)); printf(" Features3=0x%b\n", hyperv_features3, "\020" "\001MWAIT" /* MWAIT */ "\002DEBUG" /* guest debug support */ "\003PERFMON" 
/* performance monitor */ "\004PCPUDPE" /* physical CPU dynamic partition event */ "\005XMMHC" /* hypercall input through XMM regs */ "\006IDLE" /* guest idle support */ "\007SLEEP" /* hypervisor sleep support */ "\010NUMA" /* NUMA distance query support */ "\011TMFREQ" /* timer frequency query (TSC, LAPIC) */ "\012SYNCMC" /* inject synthetic machine checks */ "\013CRASH" /* MSRs for guest crash */ "\014DEBUGMSR" /* MSRs for guest debug */ "\015NPIEP" /* NPIEP */ "\016HVDIS"); /* disabling hypervisor */ do_cpuid(CPUID_LEAF_HV_RECOMMENDS, regs); hyperv_recommends = regs[0]; if (bootverbose) printf(" Recommends: %08x %08x\n", regs[0], regs[1]); do_cpuid(CPUID_LEAF_HV_LIMITS, regs); if (bootverbose) { printf(" Limits: Vcpu:%d Lcpu:%d Int:%d\n", regs[0], regs[1], regs[2]); } if (maxleaf >= CPUID_LEAF_HV_HWFEATURES) { do_cpuid(CPUID_LEAF_HV_HWFEATURES, regs); if (bootverbose) { printf(" HW Features: %08x, AMD: %08x\n", regs[0], regs[3]); } } return (true); } static void hyperv_init(void *dummy __unused) { if (!hyperv_identify()) { /* Not Hyper-V; reset guest id to the generic one. */ if (vm_guest == VM_GUEST_HV) vm_guest = VM_GUEST_VM; return; } /* Set guest id */ wrmsr(MSR_HV_GUEST_OS_ID, MSR_HV_GUESTID_FREEBSD); if (hyperv_features & CPUID_HV_MSR_TIME_REFCNT) { /* Register Hyper-V timecounter */ tc_init(&hyperv_timecounter); /* * Install 64 bits timecounter method for other modules * to use. */ hyperv_tc64 = hyperv_tc64_rdmsr; } } SYSINIT(hyperv_initialize, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, hyperv_init, NULL); static void hypercall_memfree(void) { kmem_free(kernel_arena, (vm_offset_t)hypercall_context.hc_addr, PAGE_SIZE); hypercall_context.hc_addr = NULL; } static void hypercall_create(void *arg __unused) { uint64_t hc, hc_orig; if (vm_guest != VM_GUEST_HV) return; /* * NOTE: * - busdma(9), i.e. hyperv_dmamem APIs, can _not_ be used due to * the NX bit. * - Assume kmem_malloc() returns properly aligned memory. 
*/ hypercall_context.hc_addr = (void *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK); hypercall_context.hc_paddr = vtophys(hypercall_context.hc_addr); /* Get the 'reserved' bits, which requires preservation. */ hc_orig = rdmsr(MSR_HV_HYPERCALL); /* * Setup the Hypercall page. * * NOTE: 'reserved' bits MUST be preserved. */ hc = ((hypercall_context.hc_paddr >> PAGE_SHIFT) << MSR_HV_HYPERCALL_PGSHIFT) | (hc_orig & MSR_HV_HYPERCALL_RSVD_MASK) | MSR_HV_HYPERCALL_ENABLE; wrmsr(MSR_HV_HYPERCALL, hc); /* * Confirm that Hypercall page did get setup. */ hc = rdmsr(MSR_HV_HYPERCALL); if ((hc & MSR_HV_HYPERCALL_ENABLE) == 0) { printf("hyperv: Hypercall setup failed\n"); hypercall_memfree(); /* Can't perform any Hyper-V specific actions */ vm_guest = VM_GUEST_VM; return; } if (bootverbose) printf("hyperv: Hypercall created\n"); } SYSINIT(hypercall_ctor, SI_SUB_DRIVERS, SI_ORDER_FIRST, hypercall_create, NULL); static void hypercall_destroy(void *arg __unused) { uint64_t hc; if (hypercall_context.hc_addr == NULL) return; /* Disable Hypercall */ hc = rdmsr(MSR_HV_HYPERCALL); wrmsr(MSR_HV_HYPERCALL, (hc & MSR_HV_HYPERCALL_RSVD_MASK)); hypercall_memfree(); if (bootverbose) printf("hyperv: Hypercall destroyed\n"); } SYSUNINIT(hypercall_dtor, SI_SUB_DRIVERS, SI_ORDER_FIRST, hypercall_destroy, NULL); Index: projects/runtime-coverage/sys/dev/hyperv/vmbus/vmbus.c =================================================================== --- projects/runtime-coverage/sys/dev/hyperv/vmbus/vmbus.c (revision 324497) +++ projects/runtime-coverage/sys/dev/hyperv/vmbus/vmbus.c (revision 324498) @@ -1,1538 +1,1551 @@ /*- * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* * VM Bus Driver Implementation */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi_if.h" #include "pcib_if.h" #include "vmbus_if.h" #define VMBUS_GPADL_START 0xe1e10 struct vmbus_msghc { struct vmbus_xact *mh_xact; struct hypercall_postmsg_in mh_inprm_save; }; static void vmbus_identify(driver_t *, device_t); static int vmbus_probe(device_t); static int vmbus_attach(device_t); static int vmbus_detach(device_t); static int vmbus_read_ivar(device_t, device_t, int, uintptr_t *); static int vmbus_child_pnpinfo_str(device_t, device_t, char *, size_t); static struct resource *vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags); static int vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs); static int vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs); static int vmbus_alloc_msix(device_t bus, device_t dev, int *irq); static int vmbus_release_msix(device_t bus, device_t dev, int irq); static int vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr, uint32_t *data); static uint32_t vmbus_get_version_method(device_t, device_t); static int vmbus_probe_guid_method(device_t, device_t, const struct hyperv_guid *); static uint32_t vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu); static struct taskqueue *vmbus_get_eventtq_method(device_t, device_t, int); #ifdef EARLY_AP_STARTUP static void vmbus_intrhook(void *); #endif static int vmbus_init(struct vmbus_softc *); static int vmbus_connect(struct vmbus_softc *, uint32_t); static int vmbus_req_channels(struct vmbus_softc *sc); static void vmbus_disconnect(struct vmbus_softc *); static int vmbus_scan(struct vmbus_softc *); static void vmbus_scan_teardown(struct 
vmbus_softc *); static void vmbus_scan_done(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_chanmsg_handle(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_msg_task(void *, int); static void vmbus_synic_setup(void *); static void vmbus_synic_teardown(void *); static int vmbus_sysctl_version(SYSCTL_HANDLER_ARGS); static int vmbus_dma_alloc(struct vmbus_softc *); static void vmbus_dma_free(struct vmbus_softc *); static int vmbus_intr_setup(struct vmbus_softc *); static void vmbus_intr_teardown(struct vmbus_softc *); static int vmbus_doattach(struct vmbus_softc *); static void vmbus_event_proc_dummy(struct vmbus_softc *, int); static struct vmbus_softc *vmbus_sc; +SYSCTL_NODE(_hw, OID_AUTO, vmbus, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, + "Hyper-V vmbus"); + +static int vmbus_pin_evttask = 1; +SYSCTL_INT(_hw_vmbus, OID_AUTO, pin_evttask, CTLFLAG_RDTUN, + &vmbus_pin_evttask, 0, "Pin event tasks to their respective CPU"); + extern inthand_t IDTVEC(vmbus_isr); static const uint32_t vmbus_version[] = { VMBUS_VERSION_WIN8_1, VMBUS_VERSION_WIN8, VMBUS_VERSION_WIN7, VMBUS_VERSION_WS2008 }; static const vmbus_chanmsg_proc_t vmbus_chanmsg_handlers[VMBUS_CHANMSG_TYPE_MAX] = { VMBUS_CHANMSG_PROC(CHOFFER_DONE, vmbus_scan_done), VMBUS_CHANMSG_PROC_WAKEUP(CONNECT_RESP) }; static device_method_t vmbus_methods[] = { /* Device interface */ DEVMETHOD(device_identify, vmbus_identify), DEVMETHOD(device_probe, vmbus_probe), DEVMETHOD(device_attach, vmbus_attach), DEVMETHOD(device_detach, vmbus_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), /* Bus interface */ DEVMETHOD(bus_add_child, bus_generic_add_child), DEVMETHOD(bus_print_child, bus_generic_print_child), DEVMETHOD(bus_read_ivar, vmbus_read_ivar), DEVMETHOD(bus_child_pnpinfo_str, vmbus_child_pnpinfo_str), DEVMETHOD(bus_alloc_resource, vmbus_alloc_resource), DEVMETHOD(bus_release_resource, 
bus_generic_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), #if __FreeBSD_version >= 1100000 DEVMETHOD(bus_get_cpus, bus_generic_get_cpus), #endif /* pcib interface */ DEVMETHOD(pcib_alloc_msi, vmbus_alloc_msi), DEVMETHOD(pcib_release_msi, vmbus_release_msi), DEVMETHOD(pcib_alloc_msix, vmbus_alloc_msix), DEVMETHOD(pcib_release_msix, vmbus_release_msix), DEVMETHOD(pcib_map_msi, vmbus_map_msi), /* Vmbus interface */ DEVMETHOD(vmbus_get_version, vmbus_get_version_method), DEVMETHOD(vmbus_probe_guid, vmbus_probe_guid_method), DEVMETHOD(vmbus_get_vcpu_id, vmbus_get_vcpu_id_method), DEVMETHOD(vmbus_get_event_taskq, vmbus_get_eventtq_method), DEVMETHOD_END }; static driver_t vmbus_driver = { "vmbus", vmbus_methods, sizeof(struct vmbus_softc) }; static devclass_t vmbus_devclass; DRIVER_MODULE(vmbus, pcib, vmbus_driver, vmbus_devclass, NULL, NULL); DRIVER_MODULE(vmbus, acpi_syscontainer, vmbus_driver, vmbus_devclass, NULL, NULL); MODULE_DEPEND(vmbus, acpi, 1, 1, 1); MODULE_DEPEND(vmbus, pci, 1, 1, 1); MODULE_VERSION(vmbus, 1); static __inline struct vmbus_softc * vmbus_get_softc(void) { return vmbus_sc; } void vmbus_msghc_reset(struct vmbus_msghc *mh, size_t dsize) { struct hypercall_postmsg_in *inprm; if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) panic("invalid data size %zu", dsize); inprm = vmbus_xact_req_data(mh->mh_xact); memset(inprm, 0, HYPERCALL_POSTMSGIN_SIZE); inprm->hc_connid = VMBUS_CONNID_MESSAGE; inprm->hc_msgtype = HYPERV_MSGTYPE_CHANNEL; inprm->hc_dsize = dsize; } struct vmbus_msghc * vmbus_msghc_get(struct vmbus_softc *sc, size_t dsize) { struct vmbus_msghc *mh; struct vmbus_xact *xact; if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) panic("invalid data size %zu", dsize); xact = vmbus_xact_get(sc->vmbus_xc, dsize + __offsetof(struct hypercall_postmsg_in, 
hc_data[0])); if (xact == NULL) return (NULL); mh = vmbus_xact_priv(xact, sizeof(*mh)); mh->mh_xact = xact; vmbus_msghc_reset(mh, dsize); return (mh); } void vmbus_msghc_put(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { vmbus_xact_put(mh->mh_xact); } void * vmbus_msghc_dataptr(struct vmbus_msghc *mh) { struct hypercall_postmsg_in *inprm; inprm = vmbus_xact_req_data(mh->mh_xact); return (inprm->hc_data); } int vmbus_msghc_exec_noresult(struct vmbus_msghc *mh) { sbintime_t time = SBT_1MS; struct hypercall_postmsg_in *inprm; bus_addr_t inprm_paddr; int i; inprm = vmbus_xact_req_data(mh->mh_xact); inprm_paddr = vmbus_xact_req_paddr(mh->mh_xact); /* * Save the input parameter so that we could restore the input * parameter if the Hypercall failed. * * XXX * Is this really necessary?! i.e. Will the Hypercall ever * overwrite the input parameter? */ memcpy(&mh->mh_inprm_save, inprm, HYPERCALL_POSTMSGIN_SIZE); /* * In order to cope with transient failures, e.g. insufficient * resources on host side, we retry the post message Hypercall * several times. 20 retries seem sufficient. 
*/ #define HC_RETRY_MAX 20 for (i = 0; i < HC_RETRY_MAX; ++i) { uint64_t status; status = hypercall_post_message(inprm_paddr); if (status == HYPERCALL_STATUS_SUCCESS) return 0; pause_sbt("hcpmsg", time, 0, C_HARDCLOCK); if (time < SBT_1S * 2) time *= 2; /* Restore input parameter and try again */ memcpy(inprm, &mh->mh_inprm_save, HYPERCALL_POSTMSGIN_SIZE); } #undef HC_RETRY_MAX return EIO; } int vmbus_msghc_exec(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { int error; vmbus_xact_activate(mh->mh_xact); error = vmbus_msghc_exec_noresult(mh); if (error) vmbus_xact_deactivate(mh->mh_xact); return error; } void vmbus_msghc_exec_cancel(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { vmbus_xact_deactivate(mh->mh_xact); } const struct vmbus_message * vmbus_msghc_wait_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { size_t resp_len; return (vmbus_xact_wait(mh->mh_xact, &resp_len)); } const struct vmbus_message * vmbus_msghc_poll_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { size_t resp_len; return (vmbus_xact_poll(mh->mh_xact, &resp_len)); } void vmbus_msghc_wakeup(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_xact_ctx_wakeup(sc->vmbus_xc, msg, sizeof(*msg)); } uint32_t vmbus_gpadl_alloc(struct vmbus_softc *sc) { uint32_t gpadl; again: gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1); if (gpadl == 0) goto again; return (gpadl); } static int vmbus_connect(struct vmbus_softc *sc, uint32_t version) { struct vmbus_chanmsg_connect *req; const struct vmbus_message *msg; struct vmbus_msghc *mh; int error, done = 0; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) return ENXIO; req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CONNECT; req->chm_ver = version; req->chm_evtflags = sc->vmbus_evtflags_dma.hv_paddr; req->chm_mnf1 = sc->vmbus_mnf1_dma.hv_paddr; req->chm_mnf2 = sc->vmbus_mnf2_dma.hv_paddr; error = vmbus_msghc_exec(sc, mh); if (error) { vmbus_msghc_put(sc, mh); 
return error; } msg = vmbus_msghc_wait_result(sc, mh); done = ((const struct vmbus_chanmsg_connect_resp *) msg->msg_data)->chm_done; vmbus_msghc_put(sc, mh); return (done ? 0 : EOPNOTSUPP); } static int vmbus_init(struct vmbus_softc *sc) { int i; for (i = 0; i < nitems(vmbus_version); ++i) { int error; error = vmbus_connect(sc, vmbus_version[i]); if (!error) { sc->vmbus_version = vmbus_version[i]; device_printf(sc->vmbus_dev, "version %u.%u\n", VMBUS_VERSION_MAJOR(sc->vmbus_version), VMBUS_VERSION_MINOR(sc->vmbus_version)); return 0; } } return ENXIO; } static void vmbus_disconnect(struct vmbus_softc *sc) { struct vmbus_chanmsg_disconnect *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { device_printf(sc->vmbus_dev, "can not get msg hypercall for disconnect\n"); return; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_DISCONNECT; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { device_printf(sc->vmbus_dev, "disconnect msg hypercall failed\n"); } } static int vmbus_req_channels(struct vmbus_softc *sc) { struct vmbus_chanmsg_chrequest *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) return ENXIO; req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHREQUEST; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); return error; } static void vmbus_scan_done_task(void *xsc, int pending __unused) { struct vmbus_softc *sc = xsc; mtx_lock(&Giant); sc->vmbus_scandone = true; mtx_unlock(&Giant); wakeup(&sc->vmbus_scandone); } static void vmbus_scan_done(struct vmbus_softc *sc, const struct vmbus_message *msg __unused) { taskqueue_enqueue(sc->vmbus_devtq, &sc->vmbus_scandone_task); } static int vmbus_scan(struct vmbus_softc *sc) { int error; /* * Identify, probe and attach for non-channel devices. 
*/ bus_generic_probe(sc->vmbus_dev); bus_generic_attach(sc->vmbus_dev); /* * This taskqueue serializes vmbus devices' attach and detach * for channel offer and rescind messages. */ sc->vmbus_devtq = taskqueue_create("vmbus dev", M_WAITOK, taskqueue_thread_enqueue, &sc->vmbus_devtq); taskqueue_start_threads(&sc->vmbus_devtq, 1, PI_NET, "vmbusdev"); TASK_INIT(&sc->vmbus_scandone_task, 0, vmbus_scan_done_task, sc); /* * This taskqueue handles sub-channel detach, so that vmbus * device's detach running in vmbus_devtq can drain its sub- * channels. */ sc->vmbus_subchtq = taskqueue_create("vmbus subch", M_WAITOK, taskqueue_thread_enqueue, &sc->vmbus_subchtq); taskqueue_start_threads(&sc->vmbus_subchtq, 1, PI_NET, "vmbussch"); /* * Start vmbus scanning. */ error = vmbus_req_channels(sc); if (error) { device_printf(sc->vmbus_dev, "channel request failed: %d\n", error); return (error); } /* * Wait for all vmbus devices from the initial channel offers to be * attached. */ GIANT_REQUIRED; while (!sc->vmbus_scandone) mtx_sleep(&sc->vmbus_scandone, &Giant, 0, "vmbusdev", 0); if (bootverbose) { device_printf(sc->vmbus_dev, "device scan, probe and attach " "done\n"); } return (0); } static void vmbus_scan_teardown(struct vmbus_softc *sc) { GIANT_REQUIRED; if (sc->vmbus_devtq != NULL) { mtx_unlock(&Giant); taskqueue_free(sc->vmbus_devtq); mtx_lock(&Giant); sc->vmbus_devtq = NULL; } if (sc->vmbus_subchtq != NULL) { mtx_unlock(&Giant); taskqueue_free(sc->vmbus_subchtq); mtx_lock(&Giant); sc->vmbus_subchtq = NULL; } } static void vmbus_chanmsg_handle(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_chanmsg_proc_t msg_proc; uint32_t msg_type; msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type; if (msg_type >= VMBUS_CHANMSG_TYPE_MAX) { device_printf(sc->vmbus_dev, "unknown message type 0x%x\n", msg_type); return; } msg_proc = vmbus_chanmsg_handlers[msg_type]; if (msg_proc != NULL) msg_proc(sc, msg); /* Channel specific processing */ 
vmbus_chan_msgproc(sc, msg); } static void vmbus_msg_task(void *xsc, int pending __unused) { struct vmbus_softc *sc = xsc; volatile struct vmbus_message *msg; msg = VMBUS_PCPU_GET(sc, message, curcpu) + VMBUS_SINT_MESSAGE; for (;;) { if (msg->msg_type == HYPERV_MSGTYPE_NONE) { /* No message */ break; } else if (msg->msg_type == HYPERV_MSGTYPE_CHANNEL) { /* Channel message */ vmbus_chanmsg_handle(sc, __DEVOLATILE(const struct vmbus_message *, msg)); } msg->msg_type = HYPERV_MSGTYPE_NONE; /* * Make sure the write to msg_type (i.e. set to * HYPERV_MSGTYPE_NONE) happens before we read the * msg_flags and EOMing. Otherwise, the EOMing will * not deliver any more messages since there is no * empty slot * * NOTE: * mb() is used here, since atomic_thread_fence_seq_cst() * will become compiler fence on UP kernel. */ mb(); if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) { /* * This will cause message queue rescan to possibly * deliver another msg from the hypervisor */ wrmsr(MSR_HV_EOM, 0); } } } static __inline int vmbus_handle_intr1(struct vmbus_softc *sc, struct trapframe *frame, int cpu) { volatile struct vmbus_message *msg; struct vmbus_message *msg_base; msg_base = VMBUS_PCPU_GET(sc, message, cpu); /* * Check event timer. * * TODO: move this to independent IDT vector. */ msg = msg_base + VMBUS_SINT_TIMER; if (msg->msg_type == HYPERV_MSGTYPE_TIMER_EXPIRED) { msg->msg_type = HYPERV_MSGTYPE_NONE; vmbus_et_intr(frame); /* * Make sure the write to msg_type (i.e. set to * HYPERV_MSGTYPE_NONE) happens before we read the * msg_flags and EOMing. Otherwise, the EOMing will * not deliver any more messages since there is no * empty slot * * NOTE: * mb() is used here, since atomic_thread_fence_seq_cst() * will become compiler fence on UP kernel. */ mb(); if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) { /* * This will cause message queue rescan to possibly * deliver another msg from the hypervisor */ wrmsr(MSR_HV_EOM, 0); } } /* * Check events. 
Hot path for network and storage I/O data; high rate. * * NOTE: * As recommended by the Windows guest fellows, we check events before * checking messages. */ sc->vmbus_event_proc(sc, cpu); /* * Check messages. Mainly management stuffs; ultra low rate. */ msg = msg_base + VMBUS_SINT_MESSAGE; if (__predict_false(msg->msg_type != HYPERV_MSGTYPE_NONE)) { taskqueue_enqueue(VMBUS_PCPU_GET(sc, message_tq, cpu), VMBUS_PCPU_PTR(sc, message_task, cpu)); } return (FILTER_HANDLED); } void vmbus_handle_intr(struct trapframe *trap_frame) { struct vmbus_softc *sc = vmbus_get_softc(); int cpu = curcpu; /* * Disable preemption. */ critical_enter(); /* * Do a little interrupt counting. */ (*VMBUS_PCPU_GET(sc, intr_cnt, cpu))++; vmbus_handle_intr1(sc, trap_frame, cpu); /* * Enable preemption. */ critical_exit(); } static void vmbus_synic_setup(void *xsc) { struct vmbus_softc *sc = xsc; int cpu = curcpu; uint64_t val, orig; uint32_t sint; if (hyperv_features & CPUID_HV_MSR_VP_INDEX) { /* Save virtual processor id. */ VMBUS_PCPU_GET(sc, vcpuid, cpu) = rdmsr(MSR_HV_VP_INDEX); } else { /* Set virtual processor id to 0 for compatibility. */ VMBUS_PCPU_GET(sc, vcpuid, cpu) = 0; } /* * Setup the SynIC message. */ orig = rdmsr(MSR_HV_SIMP); val = MSR_HV_SIMP_ENABLE | (orig & MSR_HV_SIMP_RSVD_MASK) | ((VMBUS_PCPU_GET(sc, message_dma.hv_paddr, cpu) >> PAGE_SHIFT) << MSR_HV_SIMP_PGSHIFT); wrmsr(MSR_HV_SIMP, val); /* * Setup the SynIC event flags. */ orig = rdmsr(MSR_HV_SIEFP); val = MSR_HV_SIEFP_ENABLE | (orig & MSR_HV_SIEFP_RSVD_MASK) | ((VMBUS_PCPU_GET(sc, event_flags_dma.hv_paddr, cpu) >> PAGE_SHIFT) << MSR_HV_SIEFP_PGSHIFT); wrmsr(MSR_HV_SIEFP, val); /* * Configure and unmask SINT for message and event flags. */ sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE; orig = rdmsr(sint); val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI | (orig & MSR_HV_SINT_RSVD_MASK); wrmsr(sint, val); /* * Configure and unmask SINT for timer. 
*/ sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER; orig = rdmsr(sint); val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI | (orig & MSR_HV_SINT_RSVD_MASK); wrmsr(sint, val); /* * All done; enable SynIC. */ orig = rdmsr(MSR_HV_SCONTROL); val = MSR_HV_SCTRL_ENABLE | (orig & MSR_HV_SCTRL_RSVD_MASK); wrmsr(MSR_HV_SCONTROL, val); } static void vmbus_synic_teardown(void *arg) { uint64_t orig; uint32_t sint; /* * Disable SynIC. */ orig = rdmsr(MSR_HV_SCONTROL); wrmsr(MSR_HV_SCONTROL, (orig & MSR_HV_SCTRL_RSVD_MASK)); /* * Mask message and event flags SINT. */ sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE; orig = rdmsr(sint); wrmsr(sint, orig | MSR_HV_SINT_MASKED); /* * Mask timer SINT. */ sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER; orig = rdmsr(sint); wrmsr(sint, orig | MSR_HV_SINT_MASKED); /* * Teardown SynIC message. */ orig = rdmsr(MSR_HV_SIMP); wrmsr(MSR_HV_SIMP, (orig & MSR_HV_SIMP_RSVD_MASK)); /* * Teardown SynIC event flags. */ orig = rdmsr(MSR_HV_SIEFP); wrmsr(MSR_HV_SIEFP, (orig & MSR_HV_SIEFP_RSVD_MASK)); } static int vmbus_dma_alloc(struct vmbus_softc *sc) { bus_dma_tag_t parent_dtag; uint8_t *evtflags; int cpu; parent_dtag = bus_get_dma_tag(sc->vmbus_dev); CPU_FOREACH(cpu) { void *ptr; /* * Per-cpu messages and event flags. 
*/ ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, VMBUS_PCPU_PTR(sc, message_dma, cpu), BUS_DMA_WAITOK | BUS_DMA_ZERO); if (ptr == NULL) return ENOMEM; VMBUS_PCPU_GET(sc, message, cpu) = ptr; ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), BUS_DMA_WAITOK | BUS_DMA_ZERO); if (ptr == NULL) return ENOMEM; VMBUS_PCPU_GET(sc, event_flags, cpu) = ptr; } evtflags = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, &sc->vmbus_evtflags_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (evtflags == NULL) return ENOMEM; sc->vmbus_rx_evtflags = (u_long *)evtflags; sc->vmbus_tx_evtflags = (u_long *)(evtflags + (PAGE_SIZE / 2)); sc->vmbus_evtflags = evtflags; sc->vmbus_mnf1 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, &sc->vmbus_mnf1_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->vmbus_mnf1 == NULL) return ENOMEM; sc->vmbus_mnf2 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, sizeof(struct vmbus_mnf), &sc->vmbus_mnf2_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->vmbus_mnf2 == NULL) return ENOMEM; return 0; } static void vmbus_dma_free(struct vmbus_softc *sc) { int cpu; if (sc->vmbus_evtflags != NULL) { hyperv_dmamem_free(&sc->vmbus_evtflags_dma, sc->vmbus_evtflags); sc->vmbus_evtflags = NULL; sc->vmbus_rx_evtflags = NULL; sc->vmbus_tx_evtflags = NULL; } if (sc->vmbus_mnf1 != NULL) { hyperv_dmamem_free(&sc->vmbus_mnf1_dma, sc->vmbus_mnf1); sc->vmbus_mnf1 = NULL; } if (sc->vmbus_mnf2 != NULL) { hyperv_dmamem_free(&sc->vmbus_mnf2_dma, sc->vmbus_mnf2); sc->vmbus_mnf2 = NULL; } CPU_FOREACH(cpu) { if (VMBUS_PCPU_GET(sc, message, cpu) != NULL) { hyperv_dmamem_free( VMBUS_PCPU_PTR(sc, message_dma, cpu), VMBUS_PCPU_GET(sc, message, cpu)); VMBUS_PCPU_GET(sc, message, cpu) = NULL; } if (VMBUS_PCPU_GET(sc, event_flags, cpu) != NULL) { hyperv_dmamem_free( VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), VMBUS_PCPU_GET(sc, event_flags, cpu)); VMBUS_PCPU_GET(sc, event_flags, cpu) = NULL; } } } static int 
vmbus_intr_setup(struct vmbus_softc *sc) { int cpu; CPU_FOREACH(cpu) { char buf[MAXCOMLEN + 1]; cpuset_t cpu_mask; /* Allocate an interrupt counter for Hyper-V interrupt */ snprintf(buf, sizeof(buf), "cpu%d:hyperv", cpu); intrcnt_add(buf, VMBUS_PCPU_PTR(sc, intr_cnt, cpu)); /* * Setup taskqueue to handle events. Task will be per- * channel. */ VMBUS_PCPU_GET(sc, event_tq, cpu) = taskqueue_create_fast( "hyperv event", M_WAITOK, taskqueue_thread_enqueue, VMBUS_PCPU_PTR(sc, event_tq, cpu)); - CPU_SETOF(cpu, &cpu_mask); - taskqueue_start_threads_cpuset( - VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET, &cpu_mask, - "hvevent%d", cpu); + if (vmbus_pin_evttask) { + CPU_SETOF(cpu, &cpu_mask); + taskqueue_start_threads_cpuset( + VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET, + &cpu_mask, "hvevent%d", cpu); + } else { + taskqueue_start_threads( + VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET, + "hvevent%d", cpu); + } /* * Setup tasks and taskqueues to handle messages. */ VMBUS_PCPU_GET(sc, message_tq, cpu) = taskqueue_create_fast( "hyperv msg", M_WAITOK, taskqueue_thread_enqueue, VMBUS_PCPU_PTR(sc, message_tq, cpu)); CPU_SETOF(cpu, &cpu_mask); taskqueue_start_threads_cpuset( VMBUS_PCPU_PTR(sc, message_tq, cpu), 1, PI_NET, &cpu_mask, "hvmsg%d", cpu); TASK_INIT(VMBUS_PCPU_PTR(sc, message_task, cpu), 0, vmbus_msg_task, sc); } /* * All Hyper-V ISR required resources are setup, now let's find a * free IDT vector for Hyper-V ISR and set it up. 
*/ sc->vmbus_idtvec = lapic_ipi_alloc(IDTVEC(vmbus_isr)); if (sc->vmbus_idtvec < 0) { device_printf(sc->vmbus_dev, "cannot find free IDT vector\n"); return ENXIO; } if (bootverbose) { device_printf(sc->vmbus_dev, "vmbus IDT vector %d\n", sc->vmbus_idtvec); } return 0; } static void vmbus_intr_teardown(struct vmbus_softc *sc) { int cpu; if (sc->vmbus_idtvec >= 0) { lapic_ipi_free(sc->vmbus_idtvec); sc->vmbus_idtvec = -1; } CPU_FOREACH(cpu) { if (VMBUS_PCPU_GET(sc, event_tq, cpu) != NULL) { taskqueue_free(VMBUS_PCPU_GET(sc, event_tq, cpu)); VMBUS_PCPU_GET(sc, event_tq, cpu) = NULL; } if (VMBUS_PCPU_GET(sc, message_tq, cpu) != NULL) { taskqueue_drain(VMBUS_PCPU_GET(sc, message_tq, cpu), VMBUS_PCPU_PTR(sc, message_task, cpu)); taskqueue_free(VMBUS_PCPU_GET(sc, message_tq, cpu)); VMBUS_PCPU_GET(sc, message_tq, cpu) = NULL; } } } static int vmbus_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { return (ENOENT); } static int vmbus_child_pnpinfo_str(device_t dev, device_t child, char *buf, size_t buflen) { const struct vmbus_channel *chan; char guidbuf[HYPERV_GUID_STRLEN]; chan = vmbus_get_channel(child); if (chan == NULL) { /* Event timer device, which does not belong to a channel */ return (0); } strlcat(buf, "classid=", buflen); hyperv_guid2str(&chan->ch_guid_type, guidbuf, sizeof(guidbuf)); strlcat(buf, guidbuf, buflen); strlcat(buf, " deviceid=", buflen); hyperv_guid2str(&chan->ch_guid_inst, guidbuf, sizeof(guidbuf)); strlcat(buf, guidbuf, buflen); return (0); } int vmbus_add_child(struct vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; device_t parent = sc->vmbus_dev; mtx_lock(&Giant); chan->ch_dev = device_add_child(parent, NULL, -1); if (chan->ch_dev == NULL) { mtx_unlock(&Giant); device_printf(parent, "device_add_child for chan%u failed\n", chan->ch_id); return (ENXIO); } device_set_ivars(chan->ch_dev, chan); device_probe_and_attach(chan->ch_dev); mtx_unlock(&Giant); return (0); } int vmbus_delete_child(struct vmbus_channel 
*chan) { int error = 0; mtx_lock(&Giant); if (chan->ch_dev != NULL) { error = device_delete_child(chan->ch_vmbus->vmbus_dev, chan->ch_dev); chan->ch_dev = NULL; } mtx_unlock(&Giant); return (error); } static int vmbus_sysctl_version(SYSCTL_HANDLER_ARGS) { struct vmbus_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", VMBUS_VERSION_MAJOR(sc->vmbus_version), VMBUS_VERSION_MINOR(sc->vmbus_version)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } /* * We need the function to make sure the MMIO resource is allocated from the * ranges found in _CRS. * * For the release function, we can use bus_generic_release_resource(). */ static struct resource * vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { device_t parent = device_get_parent(dev); struct resource *res; #ifdef NEW_PCIB if (type == SYS_RES_MEMORY) { struct vmbus_softc *sc = device_get_softc(dev); res = pcib_host_res_alloc(&sc->vmbus_mmio_res, child, type, rid, start, end, count, flags); } else #endif { res = BUS_ALLOC_RESOURCE(parent, child, type, rid, start, end, count, flags); } return (res); } static int vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs) { return (PCIB_ALLOC_MSI(device_get_parent(bus), dev, count, maxcount, irqs)); } static int vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs) { return (PCIB_RELEASE_MSI(device_get_parent(bus), dev, count, irqs)); } static int vmbus_alloc_msix(device_t bus, device_t dev, int *irq) { return (PCIB_ALLOC_MSIX(device_get_parent(bus), dev, irq)); } static int vmbus_release_msix(device_t bus, device_t dev, int irq) { return (PCIB_RELEASE_MSIX(device_get_parent(bus), dev, irq)); } static int vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr, uint32_t *data) { return (PCIB_MAP_MSI(device_get_parent(bus), dev, irq, addr, data)); } static uint32_t 
vmbus_get_version_method(device_t bus, device_t dev) { struct vmbus_softc *sc = device_get_softc(bus); return sc->vmbus_version; } static int vmbus_probe_guid_method(device_t bus, device_t dev, const struct hyperv_guid *guid) { const struct vmbus_channel *chan = vmbus_get_channel(dev); if (memcmp(&chan->ch_guid_type, guid, sizeof(struct hyperv_guid)) == 0) return 0; return ENXIO; } static uint32_t vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu) { const struct vmbus_softc *sc = device_get_softc(bus); return (VMBUS_PCPU_GET(sc, vcpuid, cpu)); } static struct taskqueue * vmbus_get_eventtq_method(device_t bus, device_t dev __unused, int cpu) { const struct vmbus_softc *sc = device_get_softc(bus); KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu%d", cpu)); return (VMBUS_PCPU_GET(sc, event_tq, cpu)); } #ifdef NEW_PCIB #define VTPM_BASE_ADDR 0xfed40000 #define FOUR_GB (1ULL << 32) enum parse_pass { parse_64, parse_32 }; struct parse_context { device_t vmbus_dev; enum parse_pass pass; }; static ACPI_STATUS parse_crs(ACPI_RESOURCE *res, void *ctx) { const struct parse_context *pc = ctx; device_t vmbus_dev = pc->vmbus_dev; struct vmbus_softc *sc = device_get_softc(vmbus_dev); UINT64 start, end; switch (res->Type) { case ACPI_RESOURCE_TYPE_ADDRESS32: start = res->Data.Address32.Address.Minimum; end = res->Data.Address32.Address.Maximum; break; case ACPI_RESOURCE_TYPE_ADDRESS64: start = res->Data.Address64.Address.Minimum; end = res->Data.Address64.Address.Maximum; break; default: /* Unused types. */ return (AE_OK); } /* * We don't use <1MB addresses. */ if (end < 0x100000) return (AE_OK); /* Don't conflict with vTPM. 
*/ if (end >= VTPM_BASE_ADDR && start < VTPM_BASE_ADDR) end = VTPM_BASE_ADDR - 1; if ((pc->pass == parse_32 && start < FOUR_GB) || (pc->pass == parse_64 && start >= FOUR_GB)) pcib_host_res_decodes(&sc->vmbus_mmio_res, SYS_RES_MEMORY, start, end, 0); return (AE_OK); } static void vmbus_get_crs(device_t dev, device_t vmbus_dev, enum parse_pass pass) { struct parse_context pc; ACPI_STATUS status; if (bootverbose) device_printf(dev, "walking _CRS, pass=%d\n", pass); pc.vmbus_dev = vmbus_dev; pc.pass = pass; status = AcpiWalkResources(acpi_get_handle(dev), "_CRS", parse_crs, &pc); if (bootverbose && ACPI_FAILURE(status)) device_printf(dev, "_CRS: not found, pass=%d\n", pass); } static void vmbus_get_mmio_res_pass(device_t dev, enum parse_pass pass) { device_t acpi0, parent; parent = device_get_parent(dev); acpi0 = device_get_parent(parent); if (strcmp("acpi0", device_get_nameunit(acpi0)) == 0) { device_t *children; int count; /* * Try to locate VMBUS resources and find _CRS on them. */ if (device_get_children(acpi0, &children, &count) == 0) { int i; for (i = 0; i < count; ++i) { if (!device_is_attached(children[i])) continue; if (strcmp("vmbus_res", device_get_name(children[i])) == 0) vmbus_get_crs(children[i], dev, pass); } free(children, M_TEMP); } /* * Try to find _CRS on acpi. */ vmbus_get_crs(acpi0, dev, pass); } else { device_printf(dev, "not grandchild of acpi\n"); } /* * Try to find _CRS on parent. */ vmbus_get_crs(parent, dev, pass); } static void vmbus_get_mmio_res(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); /* * We walk the resources twice to make sure that: in the resource * list, the 32-bit resources appear behind the 64-bit resources. * NB: resource_list_add() uses INSERT_TAIL. This way, when we * iterate through the list to find a range for a 64-bit BAR in * vmbus_alloc_resource(), we can make sure we try to use >4GB * ranges first. 
*/ pcib_host_res_init(dev, &sc->vmbus_mmio_res); vmbus_get_mmio_res_pass(dev, parse_64); vmbus_get_mmio_res_pass(dev, parse_32); } static void vmbus_free_mmio_res(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); pcib_host_res_free(dev, &sc->vmbus_mmio_res); } #endif /* NEW_PCIB */ static void vmbus_identify(driver_t *driver, device_t parent) { if (device_get_unit(parent) != 0 || vm_guest != VM_GUEST_HV || (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) return; device_add_child(parent, "vmbus", -1); } static int vmbus_probe(device_t dev) { if (device_get_unit(dev) != 0 || vm_guest != VM_GUEST_HV || (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) return (ENXIO); device_set_desc(dev, "Hyper-V Vmbus"); return (BUS_PROBE_DEFAULT); } /** * @brief Main vmbus driver initialization routine. * * Here, we * - initialize the vmbus driver context * - setup various driver entry points * - invoke the vmbus hv main init routine * - get the irq resource * - invoke the vmbus to add the vmbus root device * - setup the vmbus root device * - retrieve the channel offers */ static int vmbus_doattach(struct vmbus_softc *sc) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int ret; if (sc->vmbus_flags & VMBUS_FLAG_ATTACHED) return (0); #ifdef NEW_PCIB vmbus_get_mmio_res(sc->vmbus_dev); #endif sc->vmbus_flags |= VMBUS_FLAG_ATTACHED; sc->vmbus_gpadl = VMBUS_GPADL_START; mtx_init(&sc->vmbus_prichan_lock, "vmbus prichan", NULL, MTX_DEF); TAILQ_INIT(&sc->vmbus_prichans); mtx_init(&sc->vmbus_chan_lock, "vmbus channel", NULL, MTX_DEF); TAILQ_INIT(&sc->vmbus_chans); sc->vmbus_chmap = malloc( sizeof(struct vmbus_channel *) * VMBUS_CHAN_MAX, M_DEVBUF, M_WAITOK | M_ZERO); /* * Create context for "post message" Hypercalls */ sc->vmbus_xc = vmbus_xact_ctx_create(bus_get_dma_tag(sc->vmbus_dev), HYPERCALL_POSTMSGIN_SIZE, VMBUS_MSG_SIZE, sizeof(struct vmbus_msghc)); if (sc->vmbus_xc == NULL) { ret = ENXIO; goto cleanup; } /* * Allocate DMA stuffs. 
*/ ret = vmbus_dma_alloc(sc); if (ret != 0) goto cleanup; /* * Setup interrupt. */ ret = vmbus_intr_setup(sc); if (ret != 0) goto cleanup; /* * Setup SynIC. */ if (bootverbose) device_printf(sc->vmbus_dev, "smp_started = %d\n", smp_started); smp_rendezvous(NULL, vmbus_synic_setup, NULL, sc); sc->vmbus_flags |= VMBUS_FLAG_SYNIC; /* * Initialize vmbus, e.g. connect to Hypervisor. */ ret = vmbus_init(sc); if (ret != 0) goto cleanup; if (sc->vmbus_version == VMBUS_VERSION_WS2008 || sc->vmbus_version == VMBUS_VERSION_WIN7) sc->vmbus_event_proc = vmbus_event_proc_compat; else sc->vmbus_event_proc = vmbus_event_proc; ret = vmbus_scan(sc); if (ret != 0) goto cleanup; ctx = device_get_sysctl_ctx(sc->vmbus_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->vmbus_dev)); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, vmbus_sysctl_version, "A", "vmbus version"); return (ret); cleanup: vmbus_scan_teardown(sc); vmbus_intr_teardown(sc); vmbus_dma_free(sc); if (sc->vmbus_xc != NULL) { vmbus_xact_ctx_destroy(sc->vmbus_xc); sc->vmbus_xc = NULL; } free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF); mtx_destroy(&sc->vmbus_prichan_lock); mtx_destroy(&sc->vmbus_chan_lock); return (ret); } static void vmbus_event_proc_dummy(struct vmbus_softc *sc __unused, int cpu __unused) { } #ifdef EARLY_AP_STARTUP static void vmbus_intrhook(void *xsc) { struct vmbus_softc *sc = xsc; if (bootverbose) device_printf(sc->vmbus_dev, "intrhook\n"); vmbus_doattach(sc); config_intrhook_disestablish(&sc->vmbus_intrhook); } #endif /* EARLY_AP_STARTUP */ static int vmbus_attach(device_t dev) { vmbus_sc = device_get_softc(dev); vmbus_sc->vmbus_dev = dev; vmbus_sc->vmbus_idtvec = -1; /* * Event processing logic will be configured: * - After the vmbus protocol version negotiation. * - Before we request channel offers. 
*/ vmbus_sc->vmbus_event_proc = vmbus_event_proc_dummy; #ifdef EARLY_AP_STARTUP /* * Defer the real attach until the pause(9) works as expected. */ vmbus_sc->vmbus_intrhook.ich_func = vmbus_intrhook; vmbus_sc->vmbus_intrhook.ich_arg = vmbus_sc; config_intrhook_establish(&vmbus_sc->vmbus_intrhook); #else /* !EARLY_AP_STARTUP */ /* * If the system has already booted and thread * scheduling is possible indicated by the global * cold set to zero, we just call the driver * initialization directly. */ if (!cold) vmbus_doattach(vmbus_sc); #endif /* EARLY_AP_STARTUP */ return (0); } static int vmbus_detach(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); bus_generic_detach(dev); vmbus_chan_destroy_all(sc); vmbus_scan_teardown(sc); vmbus_disconnect(sc); if (sc->vmbus_flags & VMBUS_FLAG_SYNIC) { sc->vmbus_flags &= ~VMBUS_FLAG_SYNIC; smp_rendezvous(NULL, vmbus_synic_teardown, NULL, NULL); } vmbus_intr_teardown(sc); vmbus_dma_free(sc); if (sc->vmbus_xc != NULL) { vmbus_xact_ctx_destroy(sc->vmbus_xc); sc->vmbus_xc = NULL; } free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF); mtx_destroy(&sc->vmbus_prichan_lock); mtx_destroy(&sc->vmbus_chan_lock); #ifdef NEW_PCIB vmbus_free_mmio_res(dev); #endif return (0); } #ifndef EARLY_AP_STARTUP static void vmbus_sysinit(void *arg __unused) { struct vmbus_softc *sc = vmbus_get_softc(); if (vm_guest != VM_GUEST_HV || sc == NULL) return; /* * If the system has already booted and thread * scheduling is possible, as indicated by the * global cold set to zero, we just call the driver * initialization directly. */ if (!cold) vmbus_doattach(sc); } /* * NOTE: * We have to start as the last step of SI_SUB_SMP, i.e. after SMP is * initialized. 
*/ SYSINIT(vmbus_initialize, SI_SUB_SMP, SI_ORDER_ANY, vmbus_sysinit, NULL); #endif /* !EARLY_AP_STARTUP */ Index: projects/runtime-coverage/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c =================================================================== --- projects/runtime-coverage/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c (revision 324497) +++ projects/runtime-coverage/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c (revision 324498) @@ -1,2310 +1,2290 @@ /*- * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #undef inode #include #include #include #include "user.h" #include "mlx5_ib.h" #include #include #define DRIVER_NAME "mlx5_ib" #define DRIVER_VERSION "3.2-rc1" #define DRIVER_RELDATE "May 2016" MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DEPEND(mlx5ib, linuxkpi, 1, 1, 1); MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1); MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1); MODULE_VERSION(mlx5ib, 1); static int deprecated_prof_sel = 2; module_param_named(prof_sel, deprecated_prof_sel, int, 0444); MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core"); enum { MLX5_STANDARD_ATOMIC_SIZE = 0x8, }; struct workqueue_struct *mlx5_ib_wq; static char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; static void get_atomic_caps(struct mlx5_ib_dev *dev, struct ib_device_attr *props) { int tmp; u8 atomic_operations; u8 atomic_size_qp; u8 atomic_req_endianess; atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); atomic_req_endianess = MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode) || !mlx5_host_is_le(); tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; if (((atomic_operations & tmp) == tmp) && (atomic_size_qp & 8)) { if (atomic_req_endianess) { props->atomic_cap = IB_ATOMIC_HCA; } else { props->atomic_cap = IB_ATOMIC_NONE; } } else { props->atomic_cap = IB_ATOMIC_NONE; } tmp = MLX5_ATOMIC_OPS_MASKED_CMP_SWAP | MLX5_ATOMIC_OPS_MASKED_FETCH_ADD; if (((atomic_operations & tmp) == tmp) &&(atomic_size_qp & 8)) { if (atomic_req_endianess) props->masked_atomic_cap = IB_ATOMIC_HCA; else { props->masked_atomic_cap = IB_ATOMIC_NONE; } } else { props->masked_atomic_cap = IB_ATOMIC_NONE; } } static enum 
rdma_link_layer mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx5_ib_dev *dev = to_mdev(device); switch (MLX5_CAP_GEN(dev->mdev, port_type)) { case MLX5_CAP_PORT_TYPE_IB: return IB_LINK_LAYER_INFINIBAND; case MLX5_CAP_PORT_TYPE_ETH: return IB_LINK_LAYER_ETHERNET; default: return IB_LINK_LAYER_UNSPECIFIED; } } static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { return !dev->mdev->issi; } enum { MLX5_VPORT_ACCESS_METHOD_MAD, MLX5_VPORT_ACCESS_METHOD_HCA, MLX5_VPORT_ACCESS_METHOD_NIC, }; static int mlx5_get_vport_access_method(struct ib_device *ibdev) { if (mlx5_use_mad_ifc(to_mdev(ibdev))) return MLX5_VPORT_ACCESS_METHOD_MAD; if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) return MLX5_VPORT_ACCESS_METHOD_NIC; return MLX5_VPORT_ACCESS_METHOD_HCA; } static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; u64 tmp; int err; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_system_image_guid_mad_ifc(ibdev, sys_image_guid); case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); if (!err) *sys_image_guid = cpu_to_be64(tmp); return err; case MLX5_VPORT_ACCESS_METHOD_NIC: err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); if (!err) *sys_image_guid = cpu_to_be64(tmp); return err; default: return -EINVAL; } } static int mlx5_query_max_pkeys(struct ib_device *ibdev, u16 *max_pkeys) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_max_pkeys_mad_ifc(ibdev, max_pkeys); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); return 0; default: return -EINVAL; } } static int mlx5_query_vendor_id(struct ib_device 
*ibdev, u32 *vendor_id) { struct mlx5_ib_dev *dev = to_mdev(ibdev); switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_vendor_id_mad_ifc(ibdev, vendor_id); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_core_query_vendor_id(dev->mdev, vendor_id); default: return -EINVAL; } } static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid) { u64 tmp; int err; switch (mlx5_get_vport_access_method(&dev->ib_dev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_node_guid_mad_ifc(dev, node_guid); case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); if (!err) *node_guid = cpu_to_be64(tmp); return err; case MLX5_VPORT_ACCESS_METHOD_NIC: err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); if (!err) *node_guid = cpu_to_be64(tmp); return err; default: return -EINVAL; } } struct mlx5_reg_node_desc { u8 desc[64]; }; static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) { struct mlx5_reg_node_desc in; if (mlx5_use_mad_ifc(dev)) return mlx5_query_node_desc_mad_ifc(dev, node_desc); memset(&in, 0, sizeof(in)); return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc, sizeof(struct mlx5_reg_node_desc), MLX5_REG_NODE_DESC, 0, 0); } static int mlx5_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; int max_sq_desc; int max_rq_sg; int max_sq_sg; int err; memset(props, 0, sizeof(*props)); err = mlx5_query_system_image_guid(ibdev, &props->sys_image_guid); if (err) return err; err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys); if (err) return err; err = mlx5_query_vendor_id(ibdev, &props->vendor_id); if (err) return err; props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | ((u64)fw_rev_min(dev->mdev) << 16) | fw_rev_sub(dev->mdev); props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | 
IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN; if (MLX5_CAP_GEN(mdev, pkv)) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (MLX5_CAP_GEN(mdev, qkv)) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (MLX5_CAP_GEN(mdev, apm)) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if (MLX5_CAP_GEN(mdev, xrc)) props->device_cap_flags |= IB_DEVICE_XRC; props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; props->max_mr_size = ~0ull; props->page_size_cap = ~(u32)((1ull << MLX5_CAP_GEN(mdev, log_pg_sz)) -1); props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / sizeof(struct mlx5_wqe_data_seg); max_sq_desc = min((int)MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) - sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); props->max_sge = min(max_rq_sg, max_sq_sg); props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd); props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp); props->max_srq = 1 << MLX5_CAP_GEN(mdev, log_max_srq); props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1; props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = (unsigned int)-1; get_atomic_caps(dev, props); props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); 
props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ props->max_ah = INT_MAX; return 0; } enum mlx5_ib_width { MLX5_IB_WIDTH_1X = 1 << 0, MLX5_IB_WIDTH_2X = 1 << 1, MLX5_IB_WIDTH_4X = 1 << 2, MLX5_IB_WIDTH_8X = 1 << 3, MLX5_IB_WIDTH_12X = 1 << 4 }; static int translate_active_width(struct ib_device *ibdev, u8 active_width, u8 *ib_width) { struct mlx5_ib_dev *dev = to_mdev(ibdev); int err = 0; if (active_width & MLX5_IB_WIDTH_1X) { *ib_width = IB_WIDTH_1X; } else if (active_width & MLX5_IB_WIDTH_2X) { mlx5_ib_warn(dev, "active_width %d is not supported by IB spec\n", (int)active_width); err = -EINVAL; } else if (active_width & MLX5_IB_WIDTH_4X) { *ib_width = IB_WIDTH_4X; } else if (active_width & MLX5_IB_WIDTH_8X) { *ib_width = IB_WIDTH_8X; } else if (active_width & MLX5_IB_WIDTH_12X) { *ib_width = IB_WIDTH_12X; } else { mlx5_ib_dbg(dev, "Invalid active_width %d\n", (int)active_width); err = -EINVAL; } return err; } /* * TODO: Move to IB core */ enum ib_max_vl_num { __IB_MAX_VL_0 = 1, __IB_MAX_VL_0_1 = 2, __IB_MAX_VL_0_3 = 3, __IB_MAX_VL_0_7 = 4, __IB_MAX_VL_0_14 = 5, }; enum mlx5_vl_hw_cap { MLX5_VL_HW_0 = 1, MLX5_VL_HW_0_1 = 2, MLX5_VL_HW_0_2 = 3, MLX5_VL_HW_0_3 = 4, MLX5_VL_HW_0_4 = 5, MLX5_VL_HW_0_5 = 6, MLX5_VL_HW_0_6 = 7, MLX5_VL_HW_0_7 = 8, MLX5_VL_HW_0_14 = 15 }; static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, u8 *max_vl_num) { switch (vl_hw_cap) { case MLX5_VL_HW_0: *max_vl_num = __IB_MAX_VL_0; break; case MLX5_VL_HW_0_1: *max_vl_num = __IB_MAX_VL_0_1; break; case MLX5_VL_HW_0_3: *max_vl_num = __IB_MAX_VL_0_3; break; case MLX5_VL_HW_0_7: *max_vl_num = __IB_MAX_VL_0_7; break; case MLX5_VL_HW_0_14: *max_vl_num = __IB_MAX_VL_0_14; break; default: return -EINVAL; } return 0; } static int mlx5_query_port_ib(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct 
mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; u32 *rep; int outlen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); struct mlx5_ptys_reg *ptys; struct mlx5_pmtu_reg *pmtu; struct mlx5_pvlc_reg pvlc; void *ctx; int err; rep = mlx5_vzalloc(outlen); ptys = kzalloc(sizeof(*ptys), GFP_KERNEL); pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL); if (!rep || !ptys || !pmtu) { err = -ENOMEM; goto out; } memset(props, 0, sizeof(*props)); /* what if I am pf with dual port */ err = mlx5_query_hca_vport_context(mdev, port, 0, rep, outlen); if (err) goto out; ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context); props->lid = MLX5_GET(hca_vport_context, ctx, lid); props->lmc = MLX5_GET(hca_vport_context, ctx, lmc); props->sm_lid = MLX5_GET(hca_vport_context, ctx, sm_lid); props->sm_sl = MLX5_GET(hca_vport_context, ctx, sm_sl); props->state = MLX5_GET(hca_vport_context, ctx, vport_state); props->phys_state = MLX5_GET(hca_vport_context, ctx, port_physical_state); props->port_cap_flags = MLX5_GET(hca_vport_context, ctx, cap_mask1); props->gid_tbl_len = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size)); props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); props->bad_pkey_cntr = MLX5_GET(hca_vport_context, ctx, pkey_violation_counter); props->qkey_viol_cntr = MLX5_GET(hca_vport_context, ctx, qkey_violation_counter); props->subnet_timeout = MLX5_GET(hca_vport_context, ctx, subnet_timeout); props->init_type_reply = MLX5_GET(hca_vport_context, ctx, init_type_reply); ptys->proto_mask |= MLX5_PTYS_IB; ptys->local_port = port; err = mlx5_core_access_ptys(mdev, ptys, 0); if (err) goto out; err = translate_active_width(ibdev, ptys->ib_link_width_oper, &props->active_width); if (err) goto out; props->active_speed = (u8)ptys->ib_proto_oper; pmtu->local_port = port; err = mlx5_core_access_pmtu(mdev, pmtu, 0); if (err) goto out; props->max_mtu = pmtu->max_mtu; 
props->active_mtu = pmtu->oper_mtu; memset(&pvlc, 0, sizeof(pvlc)); pvlc.local_port = port; err = mlx5_core_access_pvlc(mdev, &pvlc, 0); if (err) goto out; err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap, &props->max_vl_num); out: kvfree(rep); kfree(ptys); kfree(pmtu); return err; } int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_port_mad_ifc(ibdev, port, props); case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_port_ib(ibdev, port, props); case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_query_port_roce(ibdev, port, props); default: return -EINVAL; } } static inline int mlx5_addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->if_addrlen != ETH_ALEN) return -1; memcpy(eui, IF_LLADDR(dev), 3); memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); /* NOTE: The scope ID is added by the GID to IP conversion */ eui[3] = 0xFF; eui[4] = 0xFE; eui[0] ^= 2; return 0; } static void mlx5_make_default_gid(struct net_device *dev, union ib_gid *gid) { gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); mlx5_addrconf_ifid_eui48(&gid->raw[8], dev); } -static inline int -mlx5_ip2gid(const struct sockaddr *addr, union ib_gid *gid) -{ - switch (addr->sa_family) { - case AF_INET: - ipv6_addr_set_v4mapped(((const struct sockaddr_in *)addr)->sin_addr.s_addr, - (struct in6_addr *)gid->raw); - break; - case AF_INET6: - memcpy(gid->raw, &((const struct sockaddr_in6 *)addr)->sin6_addr, 16); - /* clear SCOPE ID */ - gid->raw[2] = 0; - gid->raw[3] = 0; - break; - default: - return -EINVAL; - } - return 0; -} - static void mlx5_ib_roce_port_update(void *arg) { struct mlx5_ib_port *port = (struct mlx5_ib_port *)arg; struct mlx5_ib_dev *dev = port->dev; struct mlx5_core_dev *mdev = dev->mdev; struct net_device *xdev[MLX5_IB_GID_MAX]; struct net_device *idev; struct net_device *ndev; struct ifaddr *ifa; union ib_gid gid_temp; while (port->port_gone 
== 0) { int update = 0; int gid_index = 0; int j; int error; ndev = mlx5_get_protocol_dev(mdev, MLX5_INTERFACE_PROTOCOL_ETH); if (ndev == NULL) { pause("W", hz); continue; } CURVNET_SET_QUIET(ndev->if_vnet); memset(&gid_temp, 0, sizeof(gid_temp)); mlx5_make_default_gid(ndev, &gid_temp); if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) { port->gid_table[gid_index] = gid_temp; update = 1; } xdev[gid_index] = ndev; gid_index++; IFNET_RLOCK(); TAILQ_FOREACH(idev, &V_ifnet, if_link) { if (idev == ndev) break; } if (idev != NULL) { TAILQ_FOREACH(idev, &V_ifnet, if_link) { if (idev != ndev) { if (idev->if_type != IFT_L2VLAN) continue; if (ndev != rdma_vlan_dev_real_dev(idev)) continue; } /* clone address information for IPv4 and IPv6 */ IF_ADDR_RLOCK(idev); TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL || (ifa->ifa_addr->sa_family != AF_INET && ifa->ifa_addr->sa_family != AF_INET6) || gid_index >= MLX5_IB_GID_MAX) continue; memset(&gid_temp, 0, sizeof(gid_temp)); - mlx5_ip2gid(ifa->ifa_addr, &gid_temp); + rdma_ip2gid(ifa->ifa_addr, &gid_temp); /* check for existing entry */ for (j = 0; j != gid_index; j++) { if (bcmp(&gid_temp, &port->gid_table[j], sizeof(gid_temp)) == 0) break; } /* check if new entry must be added */ if (j == gid_index) { if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) { port->gid_table[gid_index] = gid_temp; update = 1; } xdev[gid_index] = idev; gid_index++; } } IF_ADDR_RUNLOCK(idev); } } IFNET_RUNLOCK(); CURVNET_RESTORE(); if (update != 0 && mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { struct ib_event event = { .device = &dev->ib_dev, .element.port_num = port->port_num + 1, .event = IB_EVENT_GID_CHANGE, }; /* add new entries, if any */ for (j = 0; j != gid_index; j++) { error = modify_gid_roce(&dev->ib_dev, port->port_num, j, port->gid_table + j, xdev[j]); if (error != 0) printf("mlx5_ib: Failed to update ROCE GID table: %d\n", error); } memset(&gid_temp, 0, 
sizeof(gid_temp)); /* clear old entries, if any */ for (; j != MLX5_IB_GID_MAX; j++) { if (bcmp(&gid_temp, port->gid_table + j, sizeof(gid_temp)) == 0) continue; port->gid_table[j] = gid_temp; (void) modify_gid_roce(&dev->ib_dev, port->port_num, j, port->gid_table + j, ndev); } /* make sure ibcore gets updated */ ib_dispatch_event(&event); } pause("W", hz); } do { struct ib_event event = { .device = &dev->ib_dev, .element.port_num = port->port_num + 1, .event = IB_EVENT_GID_CHANGE, }; /* make sure ibcore gets updated */ ib_dispatch_event(&event); /* wait a bit */ pause("W", hz); } while (0); port->port_gone = 2; kthread_exit(); } static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_gids_mad_ifc(ibdev, port, index, gid); case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid); case MLX5_VPORT_ACCESS_METHOD_NIC: if (port == 0 || port > MLX5_CAP_GEN(mdev, num_ports) || index < 0 || index >= MLX5_IB_GID_MAX || dev->port[port - 1].port_gone != 0) memset(gid, 0, sizeof(*gid)); else *gid = dev->port[port - 1].gid_table[index]; return 0; default: return -EINVAL; } } static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_pkey_mad_ifc(ibdev, port, index, pkey); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index, pkey); default: return -EINVAL; } } static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_reg_node_desc in; struct 
mlx5_reg_node_desc out; int err; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ memcpy(&in, props->node_desc, 64); err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, sizeof(out), MLX5_REG_NODE_DESC, 0, 1); if (err) return err; memcpy(ibdev->node_desc, props->node_desc, 64); return err; } static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { u8 is_eth = (mlx5_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_ETHERNET); struct mlx5_ib_dev *dev = to_mdev(ibdev); struct ib_port_attr attr; u32 tmp; int err; /* return OK if this is RoCE. CM calls ib_modify_port() regardless * of whether port link layer is ETH or IB. For ETH ports, qkey * violations and port capabilities are not valid. */ if (is_eth) return 0; mutex_lock(&dev->cap_mask_mutex); err = mlx5_ib_query_port(ibdev, port, &attr); if (err) goto out; tmp = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx5_set_port_caps(dev->mdev, port, tmp); out: mutex_unlock(&dev->cap_mask_mutex); return err; } enum mlx5_cap_flags { MLX5_CAP_COMPACT_AV = 1 << 0, }; static void set_mlx5_flags(u32 *flags, struct mlx5_core_dev *dev) { *flags |= MLX5_CAP_GEN(dev, compact_address_vector) ? 
MLX5_CAP_COMPACT_AV : 0; } static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_ucontext_req_v2 req; struct mlx5_ib_alloc_ucontext_resp resp; struct mlx5_ib_ucontext *context; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; int gross_uuars; int num_uars; int ver; int uuarn; int err; int i; size_t reqlen; if (!dev->ib_active) return ERR_PTR(-EAGAIN); memset(&req, 0, sizeof(req)); memset(&resp, 0, sizeof(resp)); reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) ver = 2; else { mlx5_ib_err(dev, "request malformed, reqlen: %ld\n", (long)reqlen); return ERR_PTR(-EINVAL); } err = ib_copy_from_udata(&req, udata, reqlen); if (err) { mlx5_ib_err(dev, "copy failed\n"); return ERR_PTR(err); } if (req.reserved) { mlx5_ib_err(dev, "request corrupted\n"); return ERR_PTR(-EINVAL); } if (req.total_num_uuars == 0 || req.total_num_uuars > MLX5_MAX_UUARS) { mlx5_ib_warn(dev, "wrong num_uuars: %d\n", req.total_num_uuars); return ERR_PTR(-ENOMEM); } req.total_num_uuars = ALIGN(req.total_num_uuars, MLX5_NON_FP_BF_REGS_PER_PAGE); if (req.num_low_latency_uuars > req.total_num_uuars - 1) { mlx5_ib_warn(dev, "wrong num_low_latency_uuars: %d ( > %d)\n", req.total_num_uuars, req.total_num_uuars); return ERR_PTR(-EINVAL); } num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); resp.cache_line_size = L1_CACHE_BYTES; resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, 
log_max_qp_sz); resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); set_mlx5_flags(&resp.flags, dev->mdev); if (offsetof(struct mlx5_ib_alloc_ucontext_resp, max_desc_sz_sq_dc) < udata->outlen) resp.max_desc_sz_sq_dc = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq_dc); if (offsetof(struct mlx5_ib_alloc_ucontext_resp, atomic_arg_sizes_dc) < udata->outlen) resp.atomic_arg_sizes_dc = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); uuari = &context->uuari; mutex_init(&uuari->lock); uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); if (!uars) { err = -ENOMEM; goto out_ctx; } uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), sizeof(*uuari->bitmap), GFP_KERNEL); if (!uuari->bitmap) { err = -ENOMEM; goto out_uar_ctx; } /* * clear all fast path uuars */ for (i = 0; i < gross_uuars; i++) { uuarn = i & 3; if (uuarn == 2 || uuarn == 3) set_bit(i, uuari->bitmap); } uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); if (!uuari->count) { err = -ENOMEM; goto out_bitmap; } for (i = 0; i < num_uars; i++) { err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index); if (err) { mlx5_ib_err(dev, "uar alloc failed at %d\n", i); goto out_uars; } } for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) context->dynamic_wc_uar_index[i] = MLX5_IB_INVALID_UAR_INDEX; INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); resp.tot_uuars = req.total_num_uuars; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); err = ib_copy_to_udata(udata, &resp, min_t(size_t, udata->outlen, sizeof(resp))); if (err) goto out_uars; uuari->ver = ver; uuari->num_low_latency_uuars = req.num_low_latency_uuars; uuari->uars = uars; uuari->num_uars = num_uars; if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { err = mlx5_alloc_transport_domain(dev->mdev, &context->tdn); if (err) goto out_uars; } return 
&context->ibucontext; out_uars: for (i--; i >= 0; i--) mlx5_cmd_free_uar(dev->mdev, uars[i].index); kfree(uuari->count); out_bitmap: kfree(uuari->bitmap); out_uar_ctx: kfree(uars); out_ctx: kfree(context); return ERR_PTR(err); } static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_uuar_info *uuari = &context->uuari; int i; if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) mlx5_dealloc_transport_domain(dev->mdev, context->tdn); for (i = 0; i < uuari->num_uars; i++) { if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); } for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) { if (context->dynamic_wc_uar_index[i] != MLX5_IB_INVALID_UAR_INDEX) mlx5_cmd_free_uar(dev->mdev, context->dynamic_wc_uar_index[i]); } kfree(uuari->count); kfree(uuari->bitmap); kfree(uuari->uars); kfree(context); return 0; } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index) { return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index; } static int get_command(unsigned long offset) { return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; } static int get_arg(unsigned long offset) { return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1); } static int get_index(unsigned long offset) { return get_arg(offset); } static int uar_mmap(struct vm_area_struct *vma, pgprot_t prot, bool is_wc, struct mlx5_uuar_info *uuari, struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) { unsigned long idx; phys_addr_t pfn; if (vma->vm_end - vma->vm_start != PAGE_SIZE) { mlx5_ib_warn(dev, "wrong size, expected PAGE_SIZE(%ld) got %ld\n", (long)PAGE_SIZE, (long)(vma->vm_end - vma->vm_start)); return -EINVAL; } idx = get_index(vma->vm_pgoff); if (idx >= uuari->num_uars) { mlx5_ib_warn(dev, "wrong offset, idx:%ld num_uars:%d\n", idx, 
uuari->num_uars); return -EINVAL; } pfn = uar_index2pfn(dev, uuari->uars[idx].index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, (unsigned long long)pfn); vma->vm_page_prot = prot; if (io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot)) { mlx5_ib_err(dev, "io remap failed\n"); return -EAGAIN; } mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA 0x%llx\n", is_wc ? "WC" : "NC", (long)vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT); return 0; } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_uuar_info *uuari = &context->uuari; unsigned long command; command = get_command(vma->vm_pgoff); switch (command) { case MLX5_IB_MMAP_REGULAR_PAGE: return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot), true, uuari, dev, context); break; case MLX5_IB_MMAP_WC_PAGE: return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot), true, uuari, dev, context); break; case MLX5_IB_MMAP_NC_PAGE: return uar_mmap(vma, pgprot_noncached(vma->vm_page_prot), false, uuari, dev, context); break; default: return -EINVAL; } return 0; } static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) { struct mlx5_create_mkey_mbox_in *in; struct mlx5_mkey_seg *seg; struct mlx5_core_mr mr; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) return -ENOMEM; seg = &in->seg; seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA; seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in), NULL, NULL, NULL); if (err) { mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); goto err_in; } kfree(in); *key = mr.key; return 0; err_in: kfree(in); return err; } static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key) { struct mlx5_core_mr mr; int err; memset(&mr, 0, sizeof(mr)); mr.key = 
key; err = mlx5_core_destroy_mkey(dev->mdev, &mr); if (err) mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key); } static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_pd_resp resp; struct mlx5_ib_pd *pd; int err; pd = kmalloc(sizeof(*pd), GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); if (err) { mlx5_ib_warn(dev, "pd alloc failed\n"); kfree(pd); return ERR_PTR(err); } if (context) { resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { mlx5_ib_err(dev, "copy failed\n"); mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } } else { err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn); if (err) { mlx5_ib_err(dev, "alloc mkey failed\n"); mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); kfree(pd); return ERR_PTR(err); } } return &pd->ibpd; } static int mlx5_ib_dealloc_pd(struct ib_pd *pd) { struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); if (!pd->uobject) free_pa_mkey(mdev, mpd->pa_lkey); mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); kfree(mpd); return 0; } static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; if (ibqp->qp_type == IB_QPT_RAW_PACKET) err = -EOPNOTSUPP; else err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); return err; } static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; if (ibqp->qp_type == IB_QPT_RAW_PACKET) err = -EOPNOTSUPP; else err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); return err; } 
static int init_node_data(struct mlx5_ib_dev *dev) { int err; err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc); if (err) return err; return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); } static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages); } static ssize_t show_reg_pages(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", (unsigned)dev->mdev->pdev->revision); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, dev->mdev->board_id); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); static 
DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); static struct device_attribute *mlx5_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_fw_pages, &dev_attr_reg_pages, }; static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) { struct mlx5_ib_qp *mqp; struct mlx5_ib_cq *send_mcq, *recv_mcq; struct mlx5_core_cq *mcq; struct list_head cq_armed_list; unsigned long flags_qp; unsigned long flags_cq; unsigned long flags; mlx5_ib_warn(ibdev, " started\n"); INIT_LIST_HEAD(&cq_armed_list); /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { spin_lock_irqsave(&mqp->sq.lock, flags_qp); if (mqp->sq.tail != mqp->sq.head) { send_mcq = to_mcq(mqp->ibqp.send_cq); spin_lock_irqsave(&send_mcq->lock, flags_cq); if (send_mcq->mcq.comp && mqp->ibqp.send_cq->comp_handler) { if (!send_mcq->mcq.reset_notify_added) { send_mcq->mcq.reset_notify_added = 1; list_add_tail(&send_mcq->mcq.reset_notify, &cq_armed_list); } } spin_unlock_irqrestore(&send_mcq->lock, flags_cq); } spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); spin_lock_irqsave(&mqp->rq.lock, flags_qp); /* no handling is needed for SRQ */ if (!mqp->ibqp.srq) { if (mqp->rq.tail != mqp->rq.head) { recv_mcq = to_mcq(mqp->ibqp.recv_cq); spin_lock_irqsave(&recv_mcq->lock, flags_cq); if (recv_mcq->mcq.comp && mqp->ibqp.recv_cq->comp_handler) { if (!recv_mcq->mcq.reset_notify_added) { recv_mcq->mcq.reset_notify_added = 1; list_add_tail(&recv_mcq->mcq.reset_notify, &cq_armed_list); } } spin_unlock_irqrestore(&recv_mcq->lock, flags_cq); } } spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); } /*At that point all inflight post send were put to be executed as of we * lock/unlock above locks Now need to arm all involved CQs. 
*/ list_for_each_entry(mcq, &cq_armed_list, reset_notify) { mcq->comp(mcq); } spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); mlx5_ib_warn(ibdev, " ended\n"); return; } static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; struct ib_event ibev; u8 port = 0; switch (event) { case MLX5_DEV_EVENT_SYS_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; mlx5_ib_handle_internal_error(ibdev); break; case MLX5_DEV_EVENT_PORT_UP: ibev.event = IB_EVENT_PORT_ACTIVE; port = (u8)param; break; case MLX5_DEV_EVENT_PORT_DOWN: case MLX5_DEV_EVENT_PORT_INITIALIZED: ibev.event = IB_EVENT_PORT_ERR; port = (u8)param; break; case MLX5_DEV_EVENT_LID_CHANGE: ibev.event = IB_EVENT_LID_CHANGE; port = (u8)param; break; case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; port = (u8)param; break; case MLX5_DEV_EVENT_GUID_CHANGE: ibev.event = IB_EVENT_GID_CHANGE; port = (u8)param; break; case MLX5_DEV_EVENT_CLIENT_REREG: ibev.event = IB_EVENT_CLIENT_REREGISTER; port = (u8)param; break; default: break; } ibev.device = &ibdev->ib_dev; ibev.element.port_num = port; if ((event != MLX5_DEV_EVENT_SYS_ERROR) && (port < 1 || port > ibdev->num_ports)) { mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); return; } if (ibdev->ib_active) ib_dispatch_event(&ibev); } static void get_ext_port_caps(struct mlx5_ib_dev *dev) { int port; for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) mlx5_query_ext_port_caps(dev, port); } static void config_atomic_responder(struct mlx5_ib_dev *dev, struct ib_device_attr *props) { enum ib_atomic_cap cap = props->atomic_cap; #if 0 if (cap == IB_ATOMIC_HCA || cap == IB_ATOMIC_GLOB) #endif dev->enable_atomic_resp = 1; dev->atomic_cap = cap; } enum mlx5_addr_align { MLX5_ADDR_ALIGN_0 = 0, MLX5_ADDR_ALIGN_64 = 64, MLX5_ADDR_ALIGN_128 = 128, }; static int get_port_caps(struct mlx5_ib_dev 
*dev) { struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; int err = -ENOMEM; int port; pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) goto out; dprops = kmalloc(sizeof(*dprops), GFP_KERNEL); if (!dprops) goto out; err = mlx5_ib_query_device(&dev->ib_dev, dprops); if (err) { mlx5_ib_warn(dev, "query_device failed %d\n", err); goto out; } config_atomic_responder(dev, dprops); for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err); break; } dev->mdev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys; dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len; mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", dprops->max_pkeys, pprops->gid_tbl_len); } out: kfree(pprops); kfree(dprops); return err; } static void destroy_umrc_res(struct mlx5_ib_dev *dev) { int err; err = mlx5_mr_cache_cleanup(dev); if (err) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); ib_dereg_mr(dev->umrc.mr); ib_dealloc_pd(dev->umrc.pd); } enum { MAX_UMR_WR = 128, }; static int create_umr_res(struct mlx5_ib_dev *dev) { struct ib_pd *pd; struct ib_mr *mr; int ret; pd = ib_alloc_pd(&dev->ib_dev); if (IS_ERR(pd)) { mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); ret = PTR_ERR(pd); goto error_0; } mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(mr)) { mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n"); ret = PTR_ERR(mr); goto error_1; } dev->umrc.mr = mr; dev->umrc.pd = pd; ret = mlx5_mr_cache_init(dev); if (ret) { mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); goto error_4; } return 0; error_4: ib_dereg_mr(mr); error_1: ib_dealloc_pd(pd); error_0: return ret; } static int create_dev_resources(struct mlx5_ib_resources *devr) { struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; int ret = 0; struct ib_cq_init_attr cq_attr = { .cqe = 1 }; dev = container_of(devr, struct 
mlx5_ib_dev, devr); devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->p0)) { ret = PTR_ERR(devr->p0); goto error0; } devr->p0->device = &dev->ib_dev; devr->p0->uobject = NULL; atomic_set(&devr->p0->usecnt, 0); devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL); if (IS_ERR(devr->c0)) { ret = PTR_ERR(devr->c0); goto error1; } devr->c0->device = &dev->ib_dev; devr->c0->uobject = NULL; devr->c0->comp_handler = NULL; devr->c0->event_handler = NULL; devr->c0->cq_context = NULL; atomic_set(&devr->c0->usecnt, 0); devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->x0)) { ret = PTR_ERR(devr->x0); goto error2; } devr->x0->device = &dev->ib_dev; devr->x0->inode = NULL; atomic_set(&devr->x0->usecnt, 0); mutex_init(&devr->x0->tgt_qp_mutex); INIT_LIST_HEAD(&devr->x0->tgt_qp_list); devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->x1)) { ret = PTR_ERR(devr->x1); goto error3; } devr->x1->device = &dev->ib_dev; devr->x1->inode = NULL; atomic_set(&devr->x1->usecnt, 0); mutex_init(&devr->x1->tgt_qp_mutex); INIT_LIST_HEAD(&devr->x1->tgt_qp_list); memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_XRC; attr.ext.xrc.cq = devr->c0; attr.ext.xrc.xrcd = devr->x0; devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); if (IS_ERR(devr->s0)) { ret = PTR_ERR(devr->s0); goto error4; } devr->s0->device = &dev->ib_dev; devr->s0->pd = devr->p0; devr->s0->uobject = NULL; devr->s0->event_handler = NULL; devr->s0->srq_context = NULL; devr->s0->srq_type = IB_SRQT_XRC; devr->s0->ext.xrc.xrcd = devr->x0; devr->s0->ext.xrc.cq = devr->c0; atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); atomic_inc(&devr->s0->ext.xrc.cq->usecnt); atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); memset(&attr, 0, sizeof(attr)); attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_BASIC; devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL); if (IS_ERR(devr->s1)) { ret = 
PTR_ERR(devr->s1); goto error5; } devr->s1->device = &dev->ib_dev; devr->s1->pd = devr->p0; devr->s1->uobject = NULL; devr->s1->event_handler = NULL; devr->s1->srq_context = NULL; devr->s1->srq_type = IB_SRQT_BASIC; devr->s1->ext.xrc.cq = devr->c0; atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s1->usecnt, 0); return 0; error5: mlx5_ib_destroy_srq(devr->s0); error4: mlx5_ib_dealloc_xrcd(devr->x1); error3: mlx5_ib_dealloc_xrcd(devr->x0); error2: mlx5_ib_destroy_cq(devr->c0); error1: mlx5_ib_dealloc_pd(devr->p0); error0: return ret; } static void destroy_dev_resources(struct mlx5_ib_resources *devr) { mlx5_ib_destroy_srq(devr->s1); mlx5_ib_destroy_srq(devr->s0); mlx5_ib_dealloc_xrcd(devr->x0); mlx5_ib_dealloc_xrcd(devr->x1); mlx5_ib_destroy_cq(devr->c0); mlx5_ib_dealloc_pd(devr->p0); } static void enable_dc_tracer(struct mlx5_ib_dev *dev) { struct device *device = dev->ib_dev.dma_device; struct mlx5_dc_tracer *dct = &dev->dctr; int order; void *tmp; int size; int err; size = MLX5_CAP_GEN(dev->mdev, num_ports) * 4096; if (size <= PAGE_SIZE) order = 0; else order = 1; dct->pg = alloc_pages(GFP_KERNEL, order); if (!dct->pg) { mlx5_ib_err(dev, "failed to allocate %d pages\n", order); return; } tmp = page_address(dct->pg); memset(tmp, 0xff, size); dct->size = size; dct->order = order; dct->dma = dma_map_page(device, dct->pg, 0, size, DMA_FROM_DEVICE); if (dma_mapping_error(device, dct->dma)) { mlx5_ib_err(dev, "dma mapping error\n"); goto map_err; } err = mlx5_core_set_dc_cnak_trace(dev->mdev, 1, dct->dma); if (err) { mlx5_ib_warn(dev, "failed to enable DC tracer\n"); goto cmd_err; } return; cmd_err: dma_unmap_page(device, dct->dma, size, DMA_FROM_DEVICE); map_err: __free_pages(dct->pg, dct->order); dct->pg = NULL; } static void disable_dc_tracer(struct mlx5_ib_dev *dev) { struct device *device = dev->ib_dev.dma_device; struct mlx5_dc_tracer *dct = &dev->dctr; int err; if (!dct->pg) return; err = mlx5_core_set_dc_cnak_trace(dev->mdev, 0, dct->dma); if (err) { 
mlx5_ib_warn(dev, "failed to disable DC tracer\n"); return; } dma_unmap_page(device, dct->dma, dct->size, DMA_FROM_DEVICE); __free_pages(dct->pg, dct->order); dct->pg = NULL; } enum { MLX5_DC_CNAK_SIZE = 128, MLX5_NUM_BUF_IN_PAGE = PAGE_SIZE / MLX5_DC_CNAK_SIZE, MLX5_CNAK_TX_CQ_SIGNAL_FACTOR = 128, MLX5_DC_CNAK_SL = 0, MLX5_DC_CNAK_VL = 0, }; static int init_dc_improvements(struct mlx5_ib_dev *dev) { if (!mlx5_core_is_pf(dev->mdev)) return 0; if (!(MLX5_CAP_GEN(dev->mdev, dc_cnak_trace))) return 0; enable_dc_tracer(dev); return 0; } static void cleanup_dc_improvements(struct mlx5_ib_dev *dev) { disable_dc_tracer(dev); } static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num) { mlx5_vport_dealloc_q_counter(dev->mdev, MLX5_INTERFACE_PROTOCOL_IB, dev->port[port_num].q_cnt_id); dev->port[port_num].q_cnt_id = 0; } static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev) { unsigned int i; for (i = 0; i < dev->num_ports; i++) mlx5_ib_dealloc_q_port_counter(dev, i); } static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) { int i; int ret; for (i = 0; i < dev->num_ports; i++) { ret = mlx5_vport_alloc_q_counter(dev->mdev, MLX5_INTERFACE_PROTOCOL_IB, &dev->port[i].q_cnt_id); if (ret) { mlx5_ib_warn(dev, "couldn't allocate queue counter for port %d\n", i + 1); goto dealloc_counters; } } return 0; dealloc_counters: while (--i >= 0) mlx5_ib_dealloc_q_port_counter(dev, i); return ret; } struct port_attribute { struct attribute attr; ssize_t (*show)(struct mlx5_ib_port *, struct port_attribute *, char *buf); ssize_t (*store)(struct mlx5_ib_port *, struct port_attribute *, const char *buf, size_t count); }; struct port_counter_attribute { struct port_attribute attr; size_t offset; }; static ssize_t port_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct port_attribute *port_attr = container_of(attr, struct port_attribute, attr); struct mlx5_ib_port_sysfs_group *p = container_of(kobj, struct mlx5_ib_port_sysfs_group, 
kobj); struct mlx5_ib_port *mibport = container_of(p, struct mlx5_ib_port, group); if (!port_attr->show) return -EIO; return port_attr->show(mibport, port_attr, buf); } static ssize_t show_port_counter(struct mlx5_ib_port *p, struct port_attribute *port_attr, char *buf) { int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); struct port_counter_attribute *counter_attr = container_of(port_attr, struct port_counter_attribute, attr); void *out; int ret; out = mlx5_vzalloc(outlen); if (!out) return -ENOMEM; ret = mlx5_vport_query_q_counter(p->dev->mdev, p->q_cnt_id, 0, out, outlen); if (ret) goto free; ret = sprintf(buf, "%d\n", be32_to_cpu(*(__be32 *)(out + counter_attr->offset))); free: kfree(out); return ret; } #define PORT_COUNTER_ATTR(_name) \ struct port_counter_attribute port_counter_attr_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_port_counter, NULL), \ .offset = MLX5_BYTE_OFF(query_q_counter_out, _name) \ } static PORT_COUNTER_ATTR(rx_write_requests); static PORT_COUNTER_ATTR(rx_read_requests); static PORT_COUNTER_ATTR(rx_atomic_requests); static PORT_COUNTER_ATTR(rx_dct_connect); static PORT_COUNTER_ATTR(out_of_buffer); static PORT_COUNTER_ATTR(out_of_sequence); static PORT_COUNTER_ATTR(duplicate_request); static PORT_COUNTER_ATTR(rnr_nak_retry_err); static PORT_COUNTER_ATTR(packet_seq_err); static PORT_COUNTER_ATTR(implied_nak_seq_err); static PORT_COUNTER_ATTR(local_ack_timeout_err); static struct attribute *counter_attrs[] = { &port_counter_attr_rx_write_requests.attr.attr, &port_counter_attr_rx_read_requests.attr.attr, &port_counter_attr_rx_atomic_requests.attr.attr, &port_counter_attr_rx_dct_connect.attr.attr, &port_counter_attr_out_of_buffer.attr.attr, &port_counter_attr_out_of_sequence.attr.attr, &port_counter_attr_duplicate_request.attr.attr, &port_counter_attr_rnr_nak_retry_err.attr.attr, &port_counter_attr_packet_seq_err.attr.attr, &port_counter_attr_implied_nak_seq_err.attr.attr, &port_counter_attr_local_ack_timeout_err.attr.attr, NULL }; 
static struct attribute_group port_counters_group = { .name = "counters", .attrs = counter_attrs }; static const struct sysfs_ops port_sysfs_ops = { .show = port_attr_show }; static struct kobj_type port_type = { .sysfs_ops = &port_sysfs_ops, }; static int add_port_attrs(struct mlx5_ib_dev *dev, struct kobject *parent, struct mlx5_ib_port_sysfs_group *port, u8 port_num) { int ret; ret = kobject_init_and_add(&port->kobj, &port_type, parent, "%d", port_num); if (ret) return ret; if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { ret = sysfs_create_group(&port->kobj, &port_counters_group); if (ret) goto put_kobj; } port->enabled = true; return ret; put_kobj: kobject_put(&port->kobj); return ret; } static void destroy_ports_attrs(struct mlx5_ib_dev *dev, unsigned int num_ports) { unsigned int i; for (i = 0; i < num_ports; i++) { struct mlx5_ib_port_sysfs_group *port = &dev->port[i].group; if (!port->enabled) continue; if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) sysfs_remove_group(&port->kobj, &port_counters_group); kobject_put(&port->kobj); port->enabled = false; } if (dev->ports_parent) { kobject_put(dev->ports_parent); dev->ports_parent = NULL; } } static int create_port_attrs(struct mlx5_ib_dev *dev) { int ret = 0; unsigned int i = 0; struct device *device = &dev->ib_dev.dev; dev->ports_parent = kobject_create_and_add("mlx5_ports", &device->kobj); if (!dev->ports_parent) return -ENOMEM; for (i = 0; i < dev->num_ports; i++) { ret = add_port_attrs(dev, dev->ports_parent, &dev->port[i].group, i + 1); if (ret) goto _destroy_ports_attrs; } return 0; _destroy_ports_attrs: destroy_ports_attrs(dev, i); return ret; } static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; int err; int i; printk_once(KERN_INFO "%s", mlx5_version); dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); if (!dev) return NULL; dev->mdev = mdev; dev->port = 
kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), GFP_KERNEL); if (!dev->port) goto err_dealloc; for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { dev->port[i].dev = dev; dev->port[i].port_num = i; dev->port[i].port_gone = 0; memset(dev->port[i].gid_table, 0, sizeof(dev->port[i].gid_table)); } err = get_port_caps(dev); if (err) goto err_free_port; if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { if (MLX5_CAP_GEN(mdev, roce)) { err = mlx5_nic_vport_enable_roce(mdev); if (err) goto err_free_port; } else { goto err_free_port; } } MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = mdev->special_contexts.resd_lkey; dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = dev->mdev->priv.eq_table.num_comp_vectors; dev->ib_dev.dma_device = &mdev->pdev->dev; dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << 
IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.query_gid = mlx5_ib_query_gid; dev->ib_dev.query_pkey = mlx5_ib_query_pkey; dev->ib_dev.modify_device = mlx5_ib_modify_device; dev->ib_dev.modify_port = mlx5_ib_modify_port; dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext; dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext; dev->ib_dev.mmap = mlx5_ib_mmap; dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd; dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd; dev->ib_dev.create_ah = mlx5_ib_create_ah; dev->ib_dev.query_ah = mlx5_ib_query_ah; dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah; dev->ib_dev.create_srq = mlx5_ib_create_srq; dev->ib_dev.modify_srq = mlx5_ib_modify_srq; dev->ib_dev.query_srq = mlx5_ib_query_srq; dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq; dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv; dev->ib_dev.create_qp = mlx5_ib_create_qp; dev->ib_dev.modify_qp = mlx5_ib_modify_qp; dev->ib_dev.query_qp = mlx5_ib_query_qp; dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; dev->ib_dev.post_send = mlx5_ib_post_send; dev->ib_dev.post_recv = mlx5_ib_post_recv; dev->ib_dev.create_cq = mlx5_ib_create_cq; dev->ib_dev.modify_cq = mlx5_ib_modify_cq; dev->ib_dev.resize_cq = mlx5_ib_resize_cq; dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq; dev->ib_dev.poll_cq = mlx5_ib_poll_cq; dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; dev->ib_dev.reg_phys_mr = mlx5_ib_reg_phys_mr; dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; dev->ib_dev.process_mad = mlx5_ib_process_mad; dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr; 
dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; dev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } err = init_node_data(dev); if (err) goto err_disable_roce; mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); err = create_dev_resources(&dev->devr); if (err) goto err_disable_roce; err = mlx5_ib_alloc_q_counters(dev); if (err) goto err_odp; err = ib_register_device(&dev->ib_dev, NULL); if (err) goto err_q_cnt; err = create_umr_res(dev); if (err) goto err_dev; if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) { if (init_dc_improvements(dev)) mlx5_ib_dbg(dev, "init_dc_improvements - continuing\n"); } err = create_port_attrs(dev); if (err) goto err_dc; for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) goto err_port_attrs; } if (1) { struct thread *rl_thread = NULL; struct proc *rl_proc = NULL; for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { (void) kproc_kthread_add(mlx5_ib_roce_port_update, dev->port + i, &rl_proc, &rl_thread, RFHIGHPID, 0, "mlx5-ib-roce-port", "mlx5-ib-roce_port-%d", i); } } dev->ib_active = true; return dev; err_port_attrs: destroy_ports_attrs(dev, dev->num_ports); err_dc: if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) cleanup_dc_improvements(dev); destroy_umrc_res(dev); err_dev: ib_unregister_device(&dev->ib_dev); err_q_cnt: mlx5_ib_dealloc_q_counters(dev); err_odp: destroy_dev_resources(&dev->devr); err_disable_roce: if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce)) mlx5_nic_vport_disable_roce(mdev); err_free_port: kfree(dev->port); 
err_dealloc: ib_dealloc_device((struct ib_device *)dev); return NULL; } static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; int i; for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { dev->port[i].port_gone = 1; while (dev->port[i].port_gone != 2) pause("W", hz); } for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { device_remove_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); } destroy_ports_attrs(dev, dev->num_ports); if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) cleanup_dc_improvements(dev); mlx5_ib_dealloc_q_counters(dev); ib_unregister_device(&dev->ib_dev); destroy_umrc_res(dev); destroy_dev_resources(&dev->devr); if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce)) mlx5_nic_vport_disable_roce(mdev); kfree(dev->port); ib_dealloc_device(&dev->ib_dev); } static struct mlx5_interface mlx5_ib_interface = { .add = mlx5_ib_add, .remove = mlx5_ib_remove, .event = mlx5_ib_event, .protocol = MLX5_INTERFACE_PROTOCOL_IB, }; static int __init mlx5_ib_init(void) { int err; if (deprecated_prof_sel != 2) printf("mlx5_ib: WARN: ""prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); err = mlx5_register_interface(&mlx5_ib_interface); if (err) goto clean_odp; mlx5_ib_wq = create_singlethread_workqueue("mlx5_ib_wq"); if (!mlx5_ib_wq) { printf("mlx5_ib: ERR: ""%s: failed to create mlx5_ib_wq\n", __func__); goto err_unreg; } return err; err_unreg: mlx5_unregister_interface(&mlx5_ib_interface); clean_odp: return err; } static void __exit mlx5_ib_cleanup(void) { destroy_workqueue(mlx5_ib_wq); mlx5_unregister_interface(&mlx5_ib_interface); } module_init_order(mlx5_ib_init, SI_ORDER_THIRD); module_exit_order(mlx5_ib_cleanup, SI_ORDER_THIRD); Index: projects/runtime-coverage/sys/modules/Makefile =================================================================== --- projects/runtime-coverage/sys/modules/Makefile (revision 324497) +++ 
projects/runtime-coverage/sys/modules/Makefile (revision 324498) @@ -1,837 +1,838 @@ # $FreeBSD$ SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" SUBDIR_PARALLEL= # Modules that include binary-only blobs of microcode should be selectable by # MK_SOURCELESS_UCODE option (see below). .if defined(MODULES_OVERRIDE) && !defined(ALL_MODULES) SUBDIR=${MODULES_OVERRIDE} .else SUBDIR= \ ${_3dfx} \ ${_3dfx_linux} \ ${_aac} \ ${_aacraid} \ accf_data \ accf_dns \ accf_http \ acl_nfs4 \ acl_posix1e \ ${_acpi} \ ae \ ${_aesni} \ age \ ${_agp} \ aha \ ahci \ ${_aic} \ aic7xxx \ alc \ ale \ alq \ ${_amd_ecc_inject} \ ${_amdsbwd} \ ${_amdsmn} \ ${_amdtemp} \ amr \ ${_an} \ ${_aout} \ ${_apm} \ ${_arcmsr} \ ${_arcnet} \ ${_armv8crypto} \ ${_asmc} \ ata \ ath \ ath_dfs \ ath_hal \ ath_hal_ar5210 \ ath_hal_ar5211 \ ath_hal_ar5212 \ ath_hal_ar5416 \ ath_hal_ar9300 \ ath_main \ ath_rate \ ath_pci \ ${_autofs} \ ${_auxio} \ ${_bce} \ bfe \ bge \ bhnd \ ${_bxe} \ ${_bios} \ ${_bktr} \ ${_bm} \ bnxt \ bridgestp \ bwi \ bwn \ bwn_pci \ ${_bytgpio} \ cam \ ${_cardbus} \ ${_carp} \ cas \ ${_cbb} \ cc \ cd9660 \ cd9660_iconv \ ${_ce} \ ${_cfi} \ chacha20 \ ${_chromebook_platform} \ ${_ciss} \ cloudabi \ ${_cloudabi32} \ ${_cloudabi64} \ ${_cm} \ ${_cmx} \ ${_coff} \ ${_coretemp} \ ${_cp} \ ${_cpsw} \ ${_cpuctl} \ ${_cpufreq} \ ${_crypto} \ ${_cryptodev} \ ${_cs} \ ${_ctau} \ ctl \ ${_cxgb} \ ${_cxgbe} \ dc \ dcons \ dcons_crom \ de \ ${_dpms} \ ${_dpt} \ ${_drm} \ ${_drm2} \ dummynet \ ${_ed} \ ${_efirt} \ ${_elink} \ ${_em} \ ${_ena} \ ${_ep} \ ${_epic} \ esp \ ${_et} \ evdev \ ${_ex} \ ${_exca} \ ext2fs \ fdc \ fdescfs \ ${_fe} \ ${_ffec} \ filemon \ firewire \ firmware \ fuse \ ${_fxp} \ gem \ geom \ ${_glxiic} \ ${_glxsb} \ gpio \ hifn \ hme \ ${_hpt27xx} \ ${_hptiop} \ ${_hptmv} \ ${_hptnr} \ ${_hptrr} \ hwpmc \ ${_hwpmc_mips24k} \ ${_hwpmc_mips74k} \ ${_hyperv} \ i2c \ ${_ibcore} \ ${_ibcs2} \ ${_ichwd} \ ${_ida} \ if_bridge \ if_disc \ if_edsc \ ${_if_enc} \ if_epair \ 
${_if_gif} \ ${_if_gre} \ ${_if_me} \ if_lagg \ ${_if_ndis} \ ${_if_stf} \ if_tap \ if_tun \ if_vlan \ if_vxlan \ ${_iir} \ imgact_binmisc \ ${_intelspi} \ ${_io} \ ${_ioat} \ ${_ipoib} \ ${_ipdivert} \ ${_ipfilter} \ ${_ipfw} \ ipfw_nat \ ${_ipfw_nat64} \ ${_ipfw_nptv6} \ ${_ipfw_pmod} \ ${_ipmi} \ ip6_mroute_mod \ ip_mroute_mod \ ${_ips} \ ${_ipsec} \ ${_ipw} \ ${_ipwfw} \ ${_isci} \ ${_iser} \ isp \ ${_ispfw} \ ${_iwi} \ ${_iwifw} \ ${_iwm} \ ${_iwmfw} \ ${_iwn} \ ${_iwnfw} \ ${_ix} \ ${_ixv} \ ${_ixgb} \ ${_ixl} \ ${_ixlv} \ jme \ joy \ kbdmux \ kgssapi \ kgssapi_krb5 \ khelp \ krpc \ ksyms \ le \ lge \ libalias \ libiconv \ libmbpool \ libmchain \ ${_linprocfs} \ ${_linsysfs} \ ${_linux} \ ${_linux_common} \ ${_linux64} \ linuxkpi \ ${_lio} \ lmc \ lpt \ mac_biba \ mac_bsdextended \ mac_ifoff \ mac_lomac \ mac_mls \ mac_none \ mac_partition \ mac_portacl \ mac_seeotheruids \ mac_stub \ mac_test \ malo \ md \ mdio \ mem \ mfi \ mii \ mlx \ ${_mlx4} \ ${_mlx4ib} \ ${_mlx4en} \ ${_mlx5} \ ${_mlx5en} \ ${_mlx5ib} \ ${_mly} \ mmc \ mmcsd \ mpr \ mps \ mpt \ mqueue \ mrsas \ msdosfs \ msdosfs_iconv \ ${_mse} \ msk \ ${_mthca} \ mvs \ mwl \ ${_mwlfw} \ mxge \ my \ ${_nandfs} \ ${_nandsim} \ ${_ncr} \ ${_nctgpio} \ ${_ncv} \ ${_ndis} \ ${_netgraph} \ ${_nfe} \ nfscl \ nfscommon \ nfsd \ nfslock \ nfslockd \ nfssvc \ nge \ nmdm \ ${_nsp} \ nullfs \ ${_ntb} \ ${_nvd} \ ${_nvme} \ ${_nvram} \ ${_nxge} \ oce \ otus \ ${_otusfw} \ ow \ ${_padlock} \ ${_padlock_rng} \ ${_pccard} \ ${_pcfclock} \ pcn \ ${_pf} \ ${_pflog} \ ${_pfsync} \ plip \ ${_pms} \ ppbus \ ppc \ ppi \ pps \ procfs \ proto \ pseudofs \ ${_pst} \ pty \ puc \ ${_qlxge} \ ${_qlxgb} \ ${_qlxgbe} \ ${_qlnx} \ ral \ ${_ralfw} \ ${_random_fortuna} \ ${_random_yarrow} \ ${_random_other} \ rc4 \ ${_rdma} \ ${_rdrand_rng} \ re \ rl \ rtwn \ rtwn_pci \ rtwn_usb \ ${_rtwnfw} \ ${_s3} \ ${_safe} \ ${_sbni} \ scc \ ${_scsi_low} \ sdhci \ ${_sdhci_acpi} \ sdhci_pci \ sem \ send \ ${_sf} \ ${_sfxge} \ sge \ ${_sgx} \ 
${_sgx_linux} \ siba_bwn \ siftr \ siis \ sis \ sk \ smbfs \ sn \ snp \ sound \ ${_speaker} \ spigen \ ${_splash} \ ${_sppp} \ ste \ ${_stg} \ stge \ ${_sym} \ ${_syscons} \ sysvipc \ tcp \ ${_ti} \ tl \ tmpfs \ ${_toecore} \ ${_tpm} \ trm \ ${_twa} \ twe \ tws \ tx \ ${_txp} \ uart \ ubsec \ udf \ udf_iconv \ ufs \ uinput \ unionfs \ usb \ ${_vesa} \ ${_virtio} \ vge \ ${_viawd} \ videomode \ vkbd \ ${_vmm} \ ${_vmware} \ ${_vpo} \ vr \ vte \ vx \ ${_vxge} \ wb \ ${_wbwd} \ ${_wi} \ wlan \ wlan_acl \ wlan_amrr \ wlan_ccmp \ wlan_rssadapt \ wlan_tkip \ wlan_wep \ wlan_xauth \ ${_wpi} \ ${_wpifw} \ ${_x86bios} \ ${_xe} \ xl \ zlib .if ${MK_AUTOFS} != "no" || defined(ALL_MODULES) _autofs= autofs .endif .if ${MK_CDDL} != "no" || defined(ALL_MODULES) .if (${MACHINE_CPUARCH} != "arm" || ${MACHINE_ARCH:Marmv[67]*} != "") && \ ${MACHINE_CPUARCH} != "mips" && \ ${MACHINE_CPUARCH} != "sparc64" SUBDIR+= dtrace .endif SUBDIR+= opensolaris .endif .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) .if exists(${SRCTOP}/sys/opencrypto) _crypto= crypto _cryptodev= cryptodev _random_fortuna=random_fortuna _random_yarrow= random_yarrow _random_other= random_other .endif .endif .if ${MK_CUSE} != "no" || defined(ALL_MODULES) SUBDIR+= cuse .endif .if (${MK_INET_SUPPORT} != "no" || ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) _carp= carp _toecore= toecore _if_enc= if_enc _if_gif= if_gif _if_gre= if_gre _ipfw_pmod= ipfw_pmod .if ${MK_IPSEC_SUPPORT} != "no" _ipsec= ipsec .endif .endif .if (${MK_INET_SUPPORT} != "no" && ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) _if_stf= if_stf .endif .if ${MK_INET_SUPPORT} != "no" || defined(ALL_MODULES) _if_me= if_me _ipdivert= ipdivert _ipfw= ipfw .if ${MK_INET6_SUPPORT} != "no" || defined(ALL_MODULES) _ipfw_nat64= ipfw_nat64 .endif .endif .if ${MK_INET6_SUPPORT} != "no" || defined(ALL_MODULES) _ipfw_nptv6= ipfw_nptv6 .endif .if ${MK_IPFILTER} != "no" || defined(ALL_MODULES) _ipfilter= ipfilter .endif .if ${MK_ISCSI} != "no" || 
defined(ALL_MODULES) SUBDIR+= cfiscsi SUBDIR+= iscsi SUBDIR+= iscsi_initiator .endif .if ${MK_NAND} != "no" || defined(ALL_MODULES) _nandfs= nandfs _nandsim= nandsim .endif .if ${MK_NETGRAPH} != "no" || defined(ALL_MODULES) _netgraph= netgraph .endif .if (${MK_PF} != "no" && (${MK_INET_SUPPORT} != "no" || \ ${MK_INET6_SUPPORT} != "no")) || defined(ALL_MODULES) _pf= pf _pflog= pflog .if ${MK_INET_SUPPORT} != "no" _pfsync= pfsync .endif .endif .if ${MK_SOURCELESS_UCODE} != "no" _bce= bce _fxp= fxp _ispfw= ispfw _sf= sf _ti= ti _txp= txp .if ${MACHINE_CPUARCH} != "mips" _mwlfw= mwlfw _otusfw= otusfw _ralfw= ralfw _rtwnfw= rtwnfw .endif .endif .if ${MK_SOURCELESS_UCODE} != "no" && ${MACHINE_CPUARCH} != "arm" && \ ${MACHINE_CPUARCH} != "mips" && \ ${MACHINE_ARCH} != "powerpc" && ${MACHINE_ARCH} != "powerpcspe" && \ ${MACHINE_CPUARCH} != "riscv" _cxgbe= cxgbe .endif .if ${MK_TESTS} != "no" || defined(ALL_MODULES) SUBDIR+= tests .endif .if ${MK_ZFS} != "no" || defined(ALL_MODULES) SUBDIR+= zfs .endif .if (${MACHINE_CPUARCH} == "mips" && ${MACHINE_ARCH:Mmips64} == "") _hwpmc_mips24k= hwpmc_mips24k _hwpmc_mips74k= hwpmc_mips74k .endif .if ${MACHINE_CPUARCH} != "aarch64" && ${MACHINE_CPUARCH} != "arm" && \ ${MACHINE_CPUARCH} != "mips" && ${MACHINE_CPUARCH} != "powerpc" && \ ${MACHINE_CPUARCH} != "riscv" _syscons= syscons _vpo= vpo .endif .if ${MACHINE_CPUARCH} != "mips" # no BUS_SPACE_UNSPECIFIED # No barrier instruction support (specific to this driver) _sym= sym # intr_disable() is a macro, causes problems .if ${MK_SOURCELESS_UCODE} != "no" _cxgb= cxgb .endif .endif .if ${MACHINE_CPUARCH} == "aarch64" _armv8crypto= armv8crypto +_efirt= efirt _em= em .endif .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" _agp= agp _an= an _aout= aout _bios= bios _bktr= bktr .if ${MK_SOURCELESS_UCODE} != "no" _bxe= bxe .endif _cardbus= cardbus _cbb= cbb _cpuctl= cpuctl _cpufreq= cpufreq _cs= cs _dpms= dpms _drm= drm _drm2= drm2 _ed= ed _em= em _ena= ena _ep= ep _et= et 
_exca= exca _fe= fe .if ${MK_OFED} != "no" || defined(ALL_MODULES) _ibcore= ibcore .endif _if_ndis= if_ndis _io= io .if ${MK_OFED} != "no" || defined(ALL_MODULES) _ipoib= ipoib _iser= iser .endif _ix= ix _ixv= ixv _linprocfs= linprocfs _linsysfs= linsysfs _linux= linux _nctgpio= nctgpio _ndis= ndis _pccard= pccard .if ${MK_OFED} != "no" || defined(ALL_MODULES) _rdma= rdma .endif _safe= safe _scsi_low= scsi_low _speaker= speaker _splash= splash _sppp= sppp _vmware= vmware _vxge= vxge _wbwd= wbwd _wi= wi _xe= xe _aac= aac _aacraid= aacraid _acpi= acpi .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) _aesni= aesni .endif _amd_ecc_inject=amd_ecc_inject _amdsbwd= amdsbwd _amdsmn= amdsmn _amdtemp= amdtemp _arcmsr= arcmsr _asmc= asmc _bytgpio= bytgpio _ciss= ciss _chromebook_platform= chromebook_platform _cmx= cmx _coretemp= coretemp .if ${MK_SOURCELESS_HOST} != "no" _hpt27xx= hpt27xx .endif _hptiop= hptiop .if ${MK_SOURCELESS_HOST} != "no" _hptmv= hptmv _hptnr= hptnr _hptrr= hptrr .endif _hyperv= hyperv _ichwd= ichwd _ida= ida _iir= iir _intelspi= intelspi _ipmi= ipmi _ips= ips _isci= isci _ipw= ipw _iwi= iwi _iwm= iwm _iwn= iwn _ixgb= ixgb .if ${MK_SOURCELESS_UCODE} != "no" _ipwfw= ipwfw _iwifw= iwifw _iwmfw= iwmfw _iwnfw= iwnfw .endif _mlx4= mlx4 _mlx5= mlx5 .if (${MK_INET_SUPPORT} != "no" && ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) _mlx4en= mlx4en _mlx5en= mlx5en .endif .if ${MK_OFED} != "no" || defined(ALL_MODULES) _mlx4ib= mlx4ib _mlx5ib= mlx5ib .endif _mly= mly .if ${MK_OFED} != "no" || defined(ALL_MODULES) _mthca= mthca .endif _nfe= nfe _nvd= nvd _nvme= nvme _nvram= nvram _nxge= nxge .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) _padlock= padlock _padlock_rng= padlock_rng _rdrand_rng= rdrand_rng .endif _s3= s3 _sdhci_acpi= sdhci_acpi _tpm= tpm _twa= twa _vesa= vesa _viawd= viawd _virtio= virtio _wpi= wpi .if ${MK_SOURCELESS_UCODE} != "no" _wpifw= wpifw .endif _x86bios= x86bios .endif .if ${MACHINE_CPUARCH} == "amd64" _efirt= efirt _ioat= ioat _ixl= 
ixl _ixlv= ixlv _linux64= linux64 _linux_common= linux_common .if ${MK_SOURCELESS_UCODE} != "no" _lio= lio .endif _ntb= ntb _pms= pms _qlxge= qlxge _qlxgb= qlxgb .if ${MK_SOURCELESS_UCODE} != "no" _qlxgbe= qlxgbe _qlnx= qlnx .endif _sfxge= sfxge _sgx= sgx _sgx_linux= sgx_linux .if ${MK_BHYVE} != "no" || defined(ALL_MODULES) _vmm= vmm .endif .endif .if ${MACHINE_CPUARCH} == "i386" # XXX some of these can move to the general case when de-i386'ed # XXX some of these can move now, but are untested on other architectures. _3dfx= 3dfx _3dfx_linux= 3dfx_linux _aic= aic _apm= apm _arcnet= arcnet .if ${MK_SOURCELESS_UCODE} != "no" _ce= ce .endif _coff= coff .if ${MK_SOURCELESS_UCODE} != "no" _cp= cp .endif _elink= elink _glxiic= glxiic _glxsb= glxsb #_ibcs2= ibcs2 _mse= mse _ncr= ncr _ncv= ncv _nsp= nsp _pcfclock= pcfclock _pst= pst _sbni= sbni _stg= stg _cm= cm .if ${MK_SOURCELESS_UCODE} != "no" _ctau= ctau .endif _dpt= dpt _ex= ex .endif .if ${MACHINE_CPUARCH} == "arm" _cfi= cfi _cpsw= cpsw .endif .if ${MACHINE_CPUARCH} == "powerpc" _agp= agp _an= an _bm= bm _cardbus= cardbus _cbb= cbb _cfi= cfi _cpufreq= cpufreq _drm= drm _exca= exca _ffec= ffec _pccard= pccard _wi= wi .endif .if ${MACHINE_ARCH} == "powerpc64" _drm2= drm2 .endif .if ${MACHINE_ARCH} == "powerpc64" || ${MACHINE_ARCH} == "powerpc" # Don't build powermac_nvram for powerpcspe, it's never supported. _nvram= powermac_nvram .endif .if ${MACHINE_CPUARCH} == "sparc64" _auxio= auxio _em= em _epic= epic .endif .if (${MACHINE_CPUARCH} == "amd64" || ${MACHINE_ARCH:Marmv[67]*} != "" || \ ${MACHINE_CPUARCH} == "i386") _cloudabi32= cloudabi32 .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" _cloudabi64= cloudabi64 .endif .endif .if ${MACHINE_ARCH:Marmv[67]*} != "" _ffec= ffec .endif SUBDIR+=${MODULES_EXTRA} .for reject in ${WITHOUT_MODULES} SUBDIR:= ${SUBDIR:N${reject}} .endfor # Calling kldxref(8) for each module is expensive. 
# Unless the caller already asked for NO_XREF, add -DNO_XREF to the make
# flags and run kldxref a single time over ${DESTDIR}${KMODDIR} after the
# install, provided a kldxref binary can be found.
.if !defined(NO_XREF)
.MAKEFLAGS+=	-DNO_XREF
afterinstall: .PHONY
	@if type kldxref >/dev/null 2>&1; then \
		${ECHO} kldxref ${DESTDIR}${KMODDIR}; \
		kldxref ${DESTDIR}${KMODDIR}; \
	fi
.endif

.include "${SYSDIR}/conf/config.mk"

# Drop adjacent duplicate entries (:u), then sort the list (:O).
SUBDIR:= ${SUBDIR:u:O}

# The operand of this directive was lost in the flattened text; a bare
# ".include" is a syntax error, so the subdirectory framework is named
# explicitly.
.include <bsd.subdir.mk>
Index: projects/runtime-coverage/sys/ofed/include/rdma/ib_addr.h =================================================================== --- projects/runtime-coverage/sys/ofed/include/rdma/ib_addr.h (revision 324497) +++ projects/runtime-coverage/sys/ofed/include/rdma/ib_addr.h (revision 324498) @@ -1,319 +1,325 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ #ifndef IB_ADDR_H #define IB_ADDR_H #include #include #include #include #include #include #include #include #include #include #include struct rdma_addr_client { atomic_t refcount; struct completion comp; }; /** * rdma_addr_register_client - Register an address client. */ void rdma_addr_register_client(struct rdma_addr_client *client); /** * rdma_addr_unregister_client - Deregister an address client. * @client: Client object to deregister. */ void rdma_addr_unregister_client(struct rdma_addr_client *client); struct rdma_dev_addr { unsigned char src_dev_addr[MAX_ADDR_LEN]; unsigned char dst_dev_addr[MAX_ADDR_LEN]; unsigned char broadcast[MAX_ADDR_LEN]; unsigned short dev_type; int bound_dev_if; enum rdma_transport_type transport; }; /** * rdma_translate_ip - Translate a local IP address to an RDMA hardware * address. */ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, u16 *vlan_id); /** * rdma_resolve_ip - Resolve source and destination IP addresses to * RDMA hardware addresses. * @client: Address client associated with request. * @src_addr: An optional source address to use in the resolution. If a * source address is not provided, a usable address will be returned via * the callback. * @dst_addr: The destination address to resolve. * @addr: A reference to a data location that will receive the resolved * addresses. The data location must remain valid until the callback has * been invoked. * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. * @context: User-specified context associated with the call. 
*/ int rdma_resolve_ip(struct rdma_addr_client *client, struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, const unsigned char *dst_dev_addr); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id, u32 scope_id); int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *smac, u16 *vlan_id, u32 scope_id); static inline int ip_addr_size(struct sockaddr *addr) { return addr->sa_family == AF_INET6 ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in); } static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) { return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; } static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) { dev_addr->broadcast[8] = pkey >> 8; dev_addr->broadcast[9] = (unsigned char) pkey; } static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); } static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) { return dev_addr->dev_type == ARPHRD_INFINIBAND ? 
4 : 0; } static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) { uint16_t tag; if (VLAN_TAG(__DECONST(struct ifnet *, dev), &tag) != 0) return 0xffff; return tag; } static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) { switch (addr->sa_family) { case AF_INET: ipv6_addr_set_v4mapped(((struct sockaddr_in *)addr)->sin_addr.s_addr, (struct in6_addr *)gid); break; case AF_INET6: memcpy(gid->raw, &((struct sockaddr_in6 *)addr)->sin6_addr, 16); + /* make sure scope ID gets zeroed inside GID */ + if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)gid->raw) || + IN6_IS_ADDR_MC_INTFACELOCAL((struct in6_addr *)gid->raw)) { + gid->raw[2] = 0; + gid->raw[3] = 0; + } break; default: return -EINVAL; } return 0; } /* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ static inline int rdma_gid2ip(struct sockaddr *out, union ib_gid *gid, uint32_t scope_id) { if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { struct sockaddr_in *out_in = (struct sockaddr_in *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin_len = sizeof(*out_in); out_in->sin_family = AF_INET; memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4); } else { struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin6_len = sizeof(*out_in); out_in->sin6_family = AF_INET6; memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); if (scope_id < 256 && IN6_IS_SCOPE_LINKLOCAL(&out_in->sin6_addr)) out_in->sin6_scope_id = scope_id; } return 0; } u32 rdma_get_ipv6_scope_id(struct ib_device *ib, u8 port_num); /* This func is called only in loopback ip address (127.0.0.1) * case in which sgid is not relevant */ static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { } static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { if (dev_addr->transport == RDMA_TRANSPORT_IB && dev_addr->dev_type != ARPHRD_INFINIBAND) iboe_addr_get_sgid(dev_addr, gid); else memcpy(gid, 
dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline enum ib_mtu iboe_get_mtu(int mtu) { /* * reduce IB headers from effective IBoE MTU. 28 stands for * atomic header which is the biggest possible header after BTH */ mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28; if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096)) return IB_MTU_4096; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048)) return IB_MTU_2048; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024)) return IB_MTU_1024; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512)) return IB_MTU_512; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256)) return IB_MTU_256; else return 0; } static inline int iboe_get_rate(struct net_device *dev) { if (dev->if_baudrate >= IF_Gbps(40)) return IB_RATE_40_GBPS; else if (dev->if_baudrate >= IF_Gbps(30)) return IB_RATE_30_GBPS; else if (dev->if_baudrate >= IF_Gbps(20)) return IB_RATE_20_GBPS; else if (dev->if_baudrate >= IF_Gbps(10)) return IB_RATE_10_GBPS; else return IB_RATE_PORT_CURRENT; } static inline int rdma_link_local_addr(struct in6_addr *addr) { if (addr->s6_addr32[0] == htonl(0xfe800000) && addr->s6_addr32[1] == 0) return 1; return 0; } static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) { memcpy(mac, &addr->s6_addr[8], 3); memcpy(mac + 3, &addr->s6_addr[13], 3); mac[0] ^= 2; } static inline int rdma_is_multicast_addr(struct in6_addr *addr) { return addr->s6_addr[0] == 0xff; } static inline void resolve_mcast_mac(struct 
in6_addr *addr, u8 *mac) { if (addr->s6_addr[0] != 0xff) return; #ifdef DUAL_MODE_MCAST_MAC if (addr->s6_addr[1] == 0x0e) /* IPv4 */ ip_eth_mc_map(addr->s6_addr32[3], mac); else #endif ipv6_eth_mc_map(addr, mac); } static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) { int i; mac[0] = 0x33; mac[1] = 0x33; for (i = 2; i < 6; ++i) mac[i] = addr->s6_addr[i + 10]; } static inline u16 rdma_get_vlan_id(union ib_gid *dgid) { u16 vid; vid = dgid->raw[11] << 8 | dgid->raw[12]; return vid < 0x1000 ? vid : 0xffff; } static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) { return VLAN_TRUNKDEV(__DECONST(struct ifnet *, dev)); } #endif /* IB_ADDR_H */ Index: projects/runtime-coverage/usr.bin/su/su.c =================================================================== --- projects/runtime-coverage/usr.bin/su/su.c (revision 324497) +++ projects/runtime-coverage/usr.bin/su/su.c (revision 324498) @@ -1,642 +1,642 @@ /* * Copyright (c) 2002, 2005 Networks Associates Technologies, Inc. * All rights reserved. * * Portions of this software were developed for the FreeBSD Project by * ThinkSec AS and NAI Labs, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 * ("CBOSS"), as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1988, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1988, 1993, 1994\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #if 0 #ifndef lint static char sccsid[] = "@(#)su.c 8.3 (Berkeley) 4/2/94"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #ifdef USE_BSM_AUDIT #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PAM_END() do { \ int local_ret; \ if (pamh != NULL) { \ local_ret = pam_setcred(pamh, PAM_DELETE_CRED); \ if (local_ret != PAM_SUCCESS) \ syslog(LOG_ERR, "pam_setcred: %s", \ pam_strerror(pamh, local_ret)); \ if (asthem) { \ local_ret = pam_close_session(pamh, 0); \ if (local_ret != PAM_SUCCESS) \ syslog(LOG_ERR, "pam_close_session: %s",\ pam_strerror(pamh, local_ret)); \ } \ local_ret = pam_end(pamh, local_ret); \ if (local_ret != PAM_SUCCESS) \ syslog(LOG_ERR, "pam_end: %s", \ pam_strerror(pamh, local_ret)); \ } \ } while (0) #define PAM_SET_ITEM(what, item) do { \ int local_ret; \ local_ret = pam_set_item(pamh, what, item); \ if (local_ret != PAM_SUCCESS) { \ syslog(LOG_ERR, "pam_set_item(" #what "): %s", \ pam_strerror(pamh, local_ret)); \ errx(1, "pam_set_item(" #what "): %s", \ pam_strerror(pamh, local_ret)); \ /* NOTREACHED */ \ } \ } while (0) enum tristate { UNSET, YES, NO }; static pam_handle_t *pamh = NULL; 
/* Environment list obtained from pam_getenvlist(); consumed by export_pam_environment(). */
static char **environ_pam;

static char *ontty(void);
static int chshell(const char *);
static void usage(void) __dead2;
static void export_pam_environment(void);
static int ok_to_export(const char *);

extern char **environ;

/*
 * su entry point.
 *
 * Parses the command line, identifies the invoking user, authenticates and
 * authorizes the target user through PAM, then forks.  The parent waits for
 * the child (relaying job-control stops) and tears the PAM transaction down
 * with PAM_END(); the child applies the target user's context and
 * environment and execs the selected shell.
 *
 * Every failure path terminates the process through err()/errx()/usage(),
 * so the function never returns to its caller.
 */
int
main(int argc, char *argv[])
{
	static char *cleanenv;
	struct passwd *pwd = NULL;
	struct pam_conv conv = { openpam_ttyconv, NULL };
	enum tristate iscsh;
	login_cap_t *lc;
	/* Same argv vector seen writable (a) while building, const (b) for execv(). */
	union {
		const char **a;
		char * const *b;
	} np;
	uid_t ruid;
	pid_t child_pid, child_pgrp, pid;
	int asme, ch, asthem, fastlogin, prio, i, retcode,
	    statusp, setmaclabel;
	u_int setwhat;
	char *username, *class, shellbuf[MAXPATHLEN];
	const char *p, *user, *shell, *mytty, **nargv;
	const void *v;
	struct sigaction sa, sa_int, sa_quit, sa_pipe;
	int temp, fds[2];
#ifdef USE_BSM_AUDIT
	const char *aerr;
	au_id_t auid;
#endif

-	shell = class = cleanenv = NULL;
+	p = shell = class = cleanenv = NULL;
	asme = asthem = fastlogin = statusp = 0;
	user = "root";
	iscsh = UNSET;
	setmaclabel = 0;

	/*
	 * Options: -f sets fastlogin; - and -l set asthem (clearing asme);
	 * -m sets asme (clearing asthem); -s sets setmaclabel; -c names a
	 * login class.
	 */
	while ((ch = getopt(argc, argv, "-flmsc:")) != -1)
		switch ((char)ch) {
		case 'f':
			fastlogin = 1;
			break;
		case '-':
		case 'l':
			asme = 0;
			asthem = 1;
			break;
		case 'm':
			asme = 1;
			asthem = 0;
			break;
		case 's':
			setmaclabel = 1;
			break;
		case 'c':
			class = optarg;
			break;
		case '?':
		default:
			usage();
		/* NOTREACHED */
		}

	/* First non-option argument, if any, replaces the default "root". */
	if (optind < argc)
		user = argv[optind++];

	if (user == NULL)
		usage();
	/* NOTREACHED */

	/*
	 * Try to provide more helpful debugging output if su(1) is running
	 * non-setuid, or was run from a file system not mounted setuid.
	 */
	if (geteuid() != 0)
		errx(1, "not running setuid");

#ifdef USE_BSM_AUDIT
	if (getauid(&auid) < 0 && errno != ENOSYS) {
		syslog(LOG_AUTH | LOG_ERR, "getauid: %s", strerror(errno));
		errx(1, "Permission denied");
	}
#endif
	if (strlen(user) > MAXLOGNAME - 1) {
#ifdef USE_BSM_AUDIT
		if (audit_submit(AUE_su, auid,
		    EPERM, 1, "username too long: '%s'", user))
			errx(1, "Permission denied");
#endif
		errx(1, "username too long");
	}

	/*
	 * Build the argument vector for the shell.  The remaining arguments
	 * (argv[optind..argc], including the terminating NULL) are copied
	 * three slots higher so that up to three entries can be written in
	 * front of them later.  After the loop i == optind - 1, so np.a
	 * points at the slot immediately before the first copied argument.
	 */
	nargv = malloc(sizeof(char *) * (size_t)(argc + 4));
	if (nargv == NULL)
		errx(1, "malloc failure");

	nargv[argc + 3] = NULL;
	for (i = argc; i >= optind; i--)
		nargv[i + 3] = argv[i];
	np.a = &nargv[i + 3];

	argv += optind;

	/*
	 * Remember the current priority (errno distinguishes an error from a
	 * legitimate -1 result) and run at -2 until it is restored below.
	 */
	errno = 0;
	prio = getpriority(PRIO_PROCESS, 0);
	if (errno)
		prio = 0;

	setpriority(PRIO_PROCESS, 0, -2);
	openlog("su", LOG_CONS, LOG_AUTH);

	/* get current login name, real uid and shell */
	ruid = getuid();
	username = getlogin();
	if (username != NULL)
		pwd = getpwnam(username);
	/* Fall back to a lookup by uid when the login name does not match it. */
	if (pwd == NULL || pwd->pw_uid != ruid)
		pwd = getpwuid(ruid);
	if (pwd == NULL) {
#ifdef USE_BSM_AUDIT
		if (audit_submit(AUE_su, auid, EPERM, 1,
		    "unable to determine invoking subject: '%s'", username))
			errx(1, "Permission denied");
#endif
		errx(1, "who are you?");
	}

	/* Private copy: pwd is re-assigned by the getpwnam() call further down. */
	username = strdup(pwd->pw_name);
	if (username == NULL)
		err(1, "strdup failure");

	/* With -m the shell is taken from the invoking user's passwd entry. */
	if (asme) {
		if (pwd->pw_shell != NULL && *pwd->pw_shell != '\0') {
			/* must copy - pwd memory is recycled */
			shell = strncpy(shellbuf, pwd->pw_shell,
			    sizeof(shellbuf));
			shellbuf[sizeof(shellbuf) - 1] = '\0';
		}
		else {
			shell = _PATH_BSHELL;
			iscsh = NO;
		}
	}

	/* Do the whole PAM startup thing */
	retcode = pam_start("su", user, &conv, &pamh);
	if (retcode != PAM_SUCCESS) {
		syslog(LOG_ERR, "pam_start: %s", pam_strerror(pamh, retcode));
		errx(1, "pam_start: %s", pam_strerror(pamh, retcode));
	}

	PAM_SET_ITEM(PAM_RUSER, username);

	mytty = ttyname(STDERR_FILENO);
	if (!mytty)
		mytty = "tty";
	PAM_SET_ITEM(PAM_TTY, mytty);

	retcode = pam_authenticate(pamh, 0);
	if (retcode != PAM_SUCCESS) {
#ifdef USE_BSM_AUDIT
		if (audit_submit(AUE_su, auid, EPERM, 1,
		    "bad su %s to %s on %s", username, user, mytty))
			errx(1, "Permission denied");
#endif
		syslog(LOG_AUTH|LOG_WARNING, "BAD SU %s to %s on %s",
		    username, user, mytty);
		errx(1, "Sorry");
	}
#ifdef USE_BSM_AUDIT
	if (audit_submit(AUE_su, auid, 0, 0, "successful authentication"))
		errx(1, "Permission denied");
#endif
	/* From here on use the user name PAM reports, if it can be retrieved. */
	retcode = pam_get_item(pamh, PAM_USER, &v);
	if (retcode == PAM_SUCCESS)
		user = v;
	else
		syslog(LOG_ERR, "pam_get_item(PAM_USER): %s",
		    pam_strerror(pamh, retcode));
	/* pwd now describes the target user rather than the invoking one. */
	pwd = getpwnam(user);
	if (pwd == NULL) {
#ifdef USE_BSM_AUDIT
		if (audit_submit(AUE_su, auid, EPERM, 1,
		    "unknown subject: %s", user))
			errx(1, "Permission denied");
#endif
		errx(1, "unknown login: %s", user);
	}

	/*
	 * Account check.  When a new authentication token is required, try
	 * to change it; on success retcode becomes PAM_SUCCESS and the
	 * common failure test below is skipped.
	 */
	retcode = pam_acct_mgmt(pamh, 0);
	if (retcode == PAM_NEW_AUTHTOK_REQD) {
		retcode = pam_chauthtok(pamh,
			PAM_CHANGE_EXPIRED_AUTHTOK);
		if (retcode != PAM_SUCCESS) {
#ifdef USE_BSM_AUDIT
			aerr = pam_strerror(pamh, retcode);
			if (aerr == NULL)
				aerr = "Unknown PAM error";
			if (audit_submit(AUE_su, auid, EPERM, 1,
			    "pam_chauthtok: %s", aerr))
				errx(1, "Permission denied");
#endif
			syslog(LOG_ERR, "pam_chauthtok: %s",
			    pam_strerror(pamh, retcode));
			errx(1, "Sorry");
		}
	}
	if (retcode != PAM_SUCCESS) {
#ifdef USE_BSM_AUDIT
		if (audit_submit(AUE_su, auid, EPERM, 1, "pam_acct_mgmt: %s",
		    pam_strerror(pamh, retcode)))
			errx(1, "Permission denied");
#endif
		syslog(LOG_ERR, "pam_acct_mgmt: %s",
			pam_strerror(pamh, retcode));
		errx(1, "Sorry");
	}

	/* get target login information */
	if (class == NULL)
		lc = login_getpwclass(pwd);
	else {
		if (ruid != 0) {
#ifdef USE_BSM_AUDIT
			if (audit_submit(AUE_su, auid, EPERM, 1,
			    "only root may use -c"))
				errx(1, "Permission denied");
#endif
			errx(1, "only root may use -c");
		}
		lc = login_getclass(class);
		if (lc == NULL)
			err(1, "login_getclass");
		/* Reject the lookup unless it returned exactly the class asked for. */
		if (lc->lc_class == NULL || strcmp(class, lc->lc_class) != 0)
			errx(1, "unknown class: %s", class);
	}

	/* if asme and non-standard target shell, must be root */
	if (asme) {
		if (ruid != 0 && !chshell(pwd->pw_shell))
			errx(1, "permission denied (shell)");
	}
	else if (pwd->pw_shell && *pwd->pw_shell) {
		shell = pwd->pw_shell;
		iscsh = UNSET;
	}
	else {
		shell = _PATH_BSHELL;
		iscsh = NO;
	}

	/* if we're forking a csh, we want to slightly muck the args */
	if (iscsh == UNSET) {
		/* Compare only the last path component of the shell. */
		p = strrchr(shell, '/');
		if (p)
			++p;
		else
			p = shell;
		iscsh = strcmp(p, "csh") ? (strcmp(p, "tcsh") ? NO : YES) : YES;
	}
	/* Restore the priority saved before it was set to -2 above. */
	setpriority(PRIO_PROCESS, 0, prio);

	/*
	 * PAM modules might add supplementary groups in pam_setcred(), so
	 * initialize them first.
	 */
	if (setusercontext(lc, pwd, pwd->pw_uid, LOGIN_SETGROUP) < 0)
		err(1, "setusercontext");

	retcode = pam_setcred(pamh, PAM_ESTABLISH_CRED);
	if (retcode != PAM_SUCCESS) {
		syslog(LOG_ERR, "pam_setcred: %s",
		    pam_strerror(pamh, retcode));
		errx(1, "failed to establish credentials.");
	}
	if (asthem) {
		retcode = pam_open_session(pamh, 0);
		if (retcode != PAM_SUCCESS) {
			syslog(LOG_ERR, "pam_open_session: %s",
			    pam_strerror(pamh, retcode));
			errx(1, "failed to open session.");
		}
	}

	/*
	 * We must fork() before setuid() because we need to call
	 * pam_setcred(pamh, PAM_DELETE_CRED) as root.
	 */
	/*
	 * Ignore SIGINT, SIGQUIT and SIGPIPE, saving the previous
	 * dispositions so they can be reinstated later; SIGTSTP is reset to
	 * its default action.
	 */
	sa.sa_flags = SA_RESTART;
	sa.sa_handler = SIG_IGN;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGINT, &sa, &sa_int);
	sigaction(SIGQUIT, &sa, &sa_quit);
	sigaction(SIGPIPE, &sa, &sa_pipe);
	sa.sa_handler = SIG_DFL;
	sigaction(SIGTSTP, &sa, NULL);
	statusp = 1;
	/*
	 * The pipe is used purely for synchronisation: nothing is ever
	 * written to it, the child just waits for the parent to close the
	 * write end.
	 */
	if (pipe(fds) == -1) {
		PAM_END();
		err(1, "pipe");
	}
	child_pid = fork();
	switch (child_pid) {
	default:
		/* Parent: supervise the child, then clean up PAM. */
		sa.sa_handler = SIG_IGN;
		sigaction(SIGTTOU, &sa, NULL);
		close(fds[0]);
		/* Put the child in its own process group and hand it the tty. */
		setpgid(child_pid, child_pid);
		if (tcgetpgrp(STDERR_FILENO) == getpgrp())
			tcsetpgrp(STDERR_FILENO, child_pid);
		/* Closing the write end releases the child from its read(). */
		close(fds[1]);
		sigaction(SIGPIPE, &sa_pipe, NULL);
		while ((pid = waitpid(child_pid, &statusp, WUNTRACED)) != -1) {
			if (WIFSTOPPED(statusp)) {
				/*
				 * The child was stopped: take the terminal
				 * back if the child's group owns it, stop
				 * ourselves too, and once continued give the
				 * terminal back and resume the child.
				 */
				child_pgrp = getpgid(child_pid);
				if (tcgetpgrp(STDERR_FILENO) == child_pgrp)
					tcsetpgrp(STDERR_FILENO, getpgrp());
				kill(getpid(), SIGSTOP);
				if (tcgetpgrp(STDERR_FILENO) == getpgrp()) {
					child_pgrp = getpgid(child_pid);
					tcsetpgrp(STDERR_FILENO, child_pgrp);
				}
				kill(child_pid, SIGCONT);
				statusp = 1;
				continue;
			}
			break;
		}
		tcsetpgrp(STDERR_FILENO, getpgrp());
		if (pid == -1)
			err(1, "waitpid");
		PAM_END();
		exit(WEXITSTATUS(statusp));
	case -1:
		/* fork() failed; err() terminates, so case 0 is not entered. */
		PAM_END();
		err(1, "fork");
	case 0:
		/* Child: wait for the parent, then become the target user. */
		close(fds[1]);
		/* Blocks until the parent closes its write end; temp is not used. */
		read(fds[0], &temp, 1);
		close(fds[0]);
		sigaction(SIGPIPE, &sa_pipe, NULL);
		sigaction(SIGINT, &sa_int, NULL);
		sigaction(SIGQUIT, &sa_quit, NULL);

		/*
		 * Set all user context except for: Environmental variables
		 * Umask Login records (wtmp, etc) Path
		 */
		setwhat = LOGIN_SETALL & ~(LOGIN_SETENV | LOGIN_SETUMASK |
			   LOGIN_SETLOGIN | LOGIN_SETPATH | LOGIN_SETGROUP |
			   LOGIN_SETMAC);
		/*
		 * If -s is present, also set the MAC label.
		 */
		if (setmaclabel)
			setwhat |= LOGIN_SETMAC;
		/*
		 * Don't touch resource/priority settings if -m has been used
		 * or -l and -c hasn't, and we're not su'ing to root.
		 */
		if ((asme || (!asthem && class == NULL)) && pwd->pw_uid)
			setwhat &= ~(LOGIN_SETPRIORITY | LOGIN_SETRESOURCES);
		if (setusercontext(lc, pwd, pwd->pw_uid, setwhat) < 0)
			err(1, "setusercontext");

		if (!asme) {
			if (asthem) {
				/*
				 * Save TERM, then switch to an empty
				 * environment (cleanenv is NULL).
				 */
				p = getenv("TERM");
				environ = &cleanenv;
			}

			if (asthem || pwd->pw_uid)
				setenv("USER", pwd->pw_name, 1);
			setenv("HOME", pwd->pw_dir, 1);
			setenv("SHELL", shell, 1);

			if (asthem) {
				/*
				 * Add any environmental variables that the
				 * PAM modules may have set.
				 */
				environ_pam = pam_getenvlist(pamh);
				if (environ_pam)
					export_pam_environment();

				/* set the su'd user's environment & umask */
				setusercontext(lc, pwd, pwd->pw_uid,
					LOGIN_SETPATH | LOGIN_SETUMASK |
					LOGIN_SETENV);
				/* Carry the saved TERM over into the new environment. */
				if (p)
					setenv("TERM", p, 1);

				/* Prefer a HOME supplied by PAM over the passwd entry. */
				p = pam_getenv(pamh, "HOME");
				if (chdir(p ? p : pwd->pw_dir) < 0)
					errx(1, "no directory");
			}
		}
		login_close(lc);

		/*
		 * Fill the slots in front of the copied arguments, working
		 * backwards: optional "-f" and "-m" for csh-like shells, then
		 * the argv[0] name.
		 */
		if (iscsh == YES) {
			if (fastlogin)
				*np.a-- = "-f";
			if (asme)
				*np.a-- = "-m";
		}
		/* csh strips the first character... */
		*np.a = asthem ? "-su" : iscsh == YES ? "_su" : "su";

		if (ruid != 0)
			syslog(LOG_NOTICE, "%s to %s%s", username, user,
			    ontty());

		execv(shell, np.b);
		/* Only reached when execv() fails. */
		err(1, "%s", shell);
	}
}

/*
 * Import the PAM-provided environment held in environ_pam.
 *
 * Each "NAME=value" entry accepted by ok_to_export() is split at its first
 * '=' (the entry is modified in place) and installed with setenv(),
 * overwriting any existing value.  Every entry string is freed afterwards.
 * NOTE(review): the environ_pam array itself is not freed here.
 */
static void
export_pam_environment(void)
{
	char **pp;
	char *p;

	for (pp = environ_pam; *pp != NULL; pp++) {
		if (ok_to_export(*pp)) {
			/* ok_to_export() has verified that an '=' is present. */
			p = strchr(*pp, '=');
			*p = '\0';
			setenv(*pp, p + 1, 1);
		}
		free(*pp);
	}
}

/*
 * Sanity checks on PAM environmental variables:
 * - Make sure there is an '=' in the string.
 * - Make sure the string doesn't run on too long.
 * - Do not export certain variables.  This list was taken from the
 *   Solaris pam_putenv(3) man page.
 * Note that if the user is chrooted, PAM may have a better idea than we
 * do of where her home directory is.
*/ static int ok_to_export(const char *s) { static const char *noexport[] = { "SHELL", /* "HOME", */ "LOGNAME", "MAIL", "CDPATH", "IFS", "PATH", NULL }; const char **pp; size_t n; if (strlen(s) > 1024 || strchr(s, '=') == NULL) return 0; if (strncmp(s, "LD_", 3) == 0) return 0; for (pp = noexport; *pp != NULL; pp++) { n = strlen(*pp); if (s[n] == '=' && strncmp(s, *pp, n) == 0) return 0; } return 1; } static void usage(void) { fprintf(stderr, "usage: su [-] [-flms] [-c class] [login [args]]\n"); exit(1); /* NOTREACHED */ } static int chshell(const char *sh) { int r; char *cp; r = 0; setusershell(); while ((cp = getusershell()) != NULL && !r) r = (strcmp(cp, sh) == 0); endusershell(); return r; } static char * ontty(void) { char *p; static char buf[MAXPATHLEN + 4]; buf[0] = 0; p = ttyname(STDERR_FILENO); if (p) snprintf(buf, sizeof(buf), " on %s", p); return buf; } Index: projects/runtime-coverage =================================================================== --- projects/runtime-coverage (revision 324497) +++ projects/runtime-coverage (revision 324498) Property changes on: projects/runtime-coverage ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r324483-324497