Index: user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.cpp =================================================================== --- user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.cpp (revision 247191) +++ user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.cpp (revision 247192) @@ -1,6410 +1,6434 @@ //===--- Tools.cpp - Tools Implementations --------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #include "Tools.h" #include "clang/Driver/Action.h" #include "clang/Driver/Arg.h" #include "clang/Driver/ArgList.h" #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Job.h" #include "clang/Driver/Option.h" #include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Util.h" #include "clang/Basic/ObjCRuntime.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Host.h" #include "llvm/Support/Process.h" #include "llvm/Support/ErrorHandling.h" #include "InputInfo.h" #include "SanitizerArgs.h" #include "ToolChains.h" using namespace clang::driver; using namespace clang::driver::tools; using namespace clang; /// CheckPreprocessingOptions - Perform some validation of preprocessing /// arguments that is shared with gcc. static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) { if (Arg *A = Args.getLastArg(options::OPT_C, options::OPT_CC)) if (!Args.hasArg(options::OPT_E) && !D.CCCIsCPP) D.Diag(diag::err_drv_argument_only_allowed_with) << A->getAsString(Args) << "-E"; } /// CheckCodeGenerationOptions - Perform some validation of code generation /// arguments that is shared with gcc. static void CheckCodeGenerationOptions(const Driver &D, const ArgList &Args) { // In gcc, only ARM checks this, but it seems reasonable to check universally. if (Args.hasArg(options::OPT_static)) if (const Arg *A = Args.getLastArg(options::OPT_dynamic, options::OPT_mdynamic_no_pic)) D.Diag(diag::err_drv_argument_not_allowed_with) << A->getAsString(Args) << "-static"; } // Quote target names for inclusion in GNU Make dependency files. // Only the characters '$', '#', ' ', '\t' are quoted. static void QuoteTarget(StringRef Target, SmallVectorImpl &Res) { for (unsigned i = 0, e = Target.size(); i != e; ++i) { switch (Target[i]) { case ' ': case '\t': // Escape the preceding backslashes for (int j = i - 1; j >= 0 && Target[j] == '\\'; --j) Res.push_back('\\'); // Escape the space/tab Res.push_back('\\'); break; case '$': Res.push_back('$'); break; case '#': Res.push_back('\\'); break; default: break; } Res.push_back(Target[i]); } } static void addDirectoryList(const ArgList &Args, ArgStringList &CmdArgs, const char *ArgName, const char *EnvVar) { const char *DirList = ::getenv(EnvVar); bool CombinedArg = false; if (!DirList) return; // Nothing to do. StringRef Name(ArgName); if (Name.equals("-I") || Name.equals("-L")) CombinedArg = true; StringRef Dirs(DirList); if (Dirs.empty()) // Empty string should not add '.'. return; StringRef::size_type Delim; while ((Delim = Dirs.find(llvm::sys::PathSeparator)) != StringRef::npos) { if (Delim == 0) { // Leading colon. if (CombinedArg) { CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + ".")); } else { CmdArgs.push_back(ArgName); CmdArgs.push_back("."); } } else { if (CombinedArg) { CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + Dirs.substr(0, Delim))); } else { CmdArgs.push_back(ArgName); CmdArgs.push_back(Args.MakeArgString(Dirs.substr(0, Delim))); } } Dirs = Dirs.substr(Delim + 1); } if (Dirs.empty()) { // Trailing colon. if (CombinedArg) { CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + ".")); } else { CmdArgs.push_back(ArgName); CmdArgs.push_back("."); } } else { // Add the last path. if (CombinedArg) { CmdArgs.push_back(Args.MakeArgString(std::string(ArgName) + Dirs)); } else { CmdArgs.push_back(ArgName); CmdArgs.push_back(Args.MakeArgString(Dirs)); } } } static void AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs, const ArgList &Args, ArgStringList &CmdArgs) { const Driver &D = TC.getDriver(); // Add extra linker input arguments which are not treated as inputs // (constructed via -Xarch_). Args.AddAllArgValues(CmdArgs, options::OPT_Zlinker_input); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; if (!TC.HasNativeLLVMSupport()) { // Don't try to pass LLVM inputs unless we have native support. if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR || II.getType() == types::TY_LLVM_BC || II.getType() == types::TY_LTO_BC) D.Diag(diag::err_drv_no_linker_llvm_support) << TC.getTripleString(); } // Add filenames immediately. if (II.isFilename()) { CmdArgs.push_back(II.getFilename()); continue; } // Otherwise, this is a linker input argument. const Arg &A = II.getInputArg(); // Handle reserved library options. if (A.getOption().matches(options::OPT_Z_reserved_lib_stdcxx)) { TC.AddCXXStdlibLibArgs(Args, CmdArgs); } else if (A.getOption().matches(options::OPT_Z_reserved_lib_cckext)) { TC.AddCCKextLibArgs(Args, CmdArgs); } else A.renderAsInput(Args, CmdArgs); } // LIBRARY_PATH - included following the user specified library paths. addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH"); } /// \brief Determine whether Objective-C automated reference counting is /// enabled. static bool isObjCAutoRefCount(const ArgList &Args) { return Args.hasFlag(options::OPT_fobjc_arc, options::OPT_fno_objc_arc, false); } /// \brief Determine whether we are linking the ObjC runtime. static bool isObjCRuntimeLinked(const ArgList &Args) { if (isObjCAutoRefCount(Args)) { Args.ClaimAllArgs(options::OPT_fobjc_link_runtime); return true; } return Args.hasArg(options::OPT_fobjc_link_runtime); } static void addProfileRT(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs, llvm::Triple Triple) { if (!(Args.hasArg(options::OPT_fprofile_arcs) || Args.hasArg(options::OPT_fprofile_generate) || Args.hasArg(options::OPT_fcreate_profile) || Args.hasArg(options::OPT_coverage))) return; // GCC links libgcov.a by adding -L/gcc/lib/gcc// -lgcov to // the link line. We cannot do the same thing because unlike gcov there is a // libprofile_rt.so. We used to use the -l:libprofile_rt.a syntax, but that is // not supported by old linkers. std::string ProfileRT = std::string(TC.getDriver().Dir) + "/../lib/libprofile_rt.a"; CmdArgs.push_back(Args.MakeArgString(ProfileRT)); } static bool forwardToGCC(const Option &O) { return !O.hasFlag(options::NoForward) && !O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput); } void Clang::AddPreprocessingOptions(Compilation &C, const Driver &D, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, const InputInfoList &Inputs) const { Arg *A; CheckPreprocessingOptions(D, Args); Args.AddLastArg(CmdArgs, options::OPT_C); Args.AddLastArg(CmdArgs, options::OPT_CC); // Handle dependency file generation. if ((A = Args.getLastArg(options::OPT_M, options::OPT_MM)) || (A = Args.getLastArg(options::OPT_MD)) || (A = Args.getLastArg(options::OPT_MMD))) { // Determine the output location. const char *DepFile; if (Arg *MF = Args.getLastArg(options::OPT_MF)) { DepFile = MF->getValue(); C.addFailureResultFile(DepFile); } else if (Output.getType() == types::TY_Dependencies) { DepFile = Output.getFilename(); } else if (A->getOption().matches(options::OPT_M) || A->getOption().matches(options::OPT_MM)) { DepFile = "-"; } else { DepFile = darwin::CC1::getDependencyFileName(Args, Inputs); C.addFailureResultFile(DepFile); } CmdArgs.push_back("-dependency-file"); CmdArgs.push_back(DepFile); // Add a default target if one wasn't specified. if (!Args.hasArg(options::OPT_MT) && !Args.hasArg(options::OPT_MQ)) { const char *DepTarget; // If user provided -o, that is the dependency target, except // when we are only generating a dependency file. Arg *OutputOpt = Args.getLastArg(options::OPT_o); if (OutputOpt && Output.getType() != types::TY_Dependencies) { DepTarget = OutputOpt->getValue(); } else { // Otherwise derive from the base input. // // FIXME: This should use the computed output file location. SmallString<128> P(Inputs[0].getBaseInput()); llvm::sys::path::replace_extension(P, "o"); DepTarget = Args.MakeArgString(llvm::sys::path::filename(P)); } CmdArgs.push_back("-MT"); SmallString<128> Quoted; QuoteTarget(DepTarget, Quoted); CmdArgs.push_back(Args.MakeArgString(Quoted)); } if (A->getOption().matches(options::OPT_M) || A->getOption().matches(options::OPT_MD)) CmdArgs.push_back("-sys-header-deps"); } if (Args.hasArg(options::OPT_MG)) { if (!A || A->getOption().matches(options::OPT_MD) || A->getOption().matches(options::OPT_MMD)) D.Diag(diag::err_drv_mg_requires_m_or_mm); CmdArgs.push_back("-MG"); } Args.AddLastArg(CmdArgs, options::OPT_MP); // Convert all -MQ args to -MT for (arg_iterator it = Args.filtered_begin(options::OPT_MT, options::OPT_MQ), ie = Args.filtered_end(); it != ie; ++it) { const Arg *A = *it; A->claim(); if (A->getOption().matches(options::OPT_MQ)) { CmdArgs.push_back("-MT"); SmallString<128> Quoted; QuoteTarget(A->getValue(), Quoted); CmdArgs.push_back(Args.MakeArgString(Quoted)); // -MT flag - no change } else { A->render(Args, CmdArgs); } } // Add -i* options, and automatically translate to // -include-pch/-include-pth for transparent PCH support. It's // wonky, but we include looking for .gch so we can support seamless // replacement into a build system already set up to be generating // .gch files. bool RenderedImplicitInclude = false; for (arg_iterator it = Args.filtered_begin(options::OPT_clang_i_Group), ie = Args.filtered_end(); it != ie; ++it) { const Arg *A = it; if (A->getOption().matches(options::OPT_include)) { bool IsFirstImplicitInclude = !RenderedImplicitInclude; RenderedImplicitInclude = true; // Use PCH if the user requested it. bool UsePCH = D.CCCUsePCH; bool FoundPTH = false; bool FoundPCH = false; llvm::sys::Path P(A->getValue()); bool Exists; if (UsePCH) { P.appendSuffix("pch"); if (!llvm::sys::fs::exists(P.str(), Exists) && Exists) FoundPCH = true; else P.eraseSuffix(); } if (!FoundPCH) { P.appendSuffix("pth"); if (!llvm::sys::fs::exists(P.str(), Exists) && Exists) FoundPTH = true; else P.eraseSuffix(); } if (!FoundPCH && !FoundPTH) { P.appendSuffix("gch"); if (!llvm::sys::fs::exists(P.str(), Exists) && Exists) { FoundPCH = UsePCH; FoundPTH = !UsePCH; } else P.eraseSuffix(); } if (FoundPCH || FoundPTH) { if (IsFirstImplicitInclude) { A->claim(); if (UsePCH) CmdArgs.push_back("-include-pch"); else CmdArgs.push_back("-include-pth"); CmdArgs.push_back(Args.MakeArgString(P.str())); continue; } else { // Ignore the PCH if not first on command line and emit warning. D.Diag(diag::warn_drv_pch_not_first_include) << P.str() << A->getAsString(Args); } } } // Not translated, render as usual. A->claim(); A->render(Args, CmdArgs); } Args.AddAllArgs(CmdArgs, options::OPT_D, options::OPT_U); Args.AddAllArgs(CmdArgs, options::OPT_I_Group, options::OPT_F, options::OPT_index_header_map); // Add -Wp, and -Xassembler if using the preprocessor. // FIXME: There is a very unfortunate problem here, some troubled // souls abuse -Wp, to pass preprocessor options in gcc syntax. To // really support that we would have to parse and then translate // those options. :( Args.AddAllArgValues(CmdArgs, options::OPT_Wp_COMMA, options::OPT_Xpreprocessor); // -I- is a deprecated GCC feature, reject it. if (Arg *A = Args.getLastArg(options::OPT_I_)) D.Diag(diag::err_drv_I_dash_not_supported) << A->getAsString(Args); // If we have a --sysroot, and don't have an explicit -isysroot flag, add an // -isysroot to the CC1 invocation. StringRef sysroot = C.getSysRoot(); if (sysroot != "") { if (!Args.hasArg(options::OPT_isysroot)) { CmdArgs.push_back("-isysroot"); CmdArgs.push_back(C.getArgs().MakeArgString(sysroot)); } } // If a module path was provided, pass it along. Otherwise, use a temporary // directory. if (Arg *A = Args.getLastArg(options::OPT_fmodule_cache_path)) { A->claim(); A->render(Args, CmdArgs); } else { SmallString<128> DefaultModuleCache; llvm::sys::path::system_temp_directory(/*erasedOnReboot=*/false, DefaultModuleCache); llvm::sys::path::append(DefaultModuleCache, "clang-module-cache"); CmdArgs.push_back("-fmodule-cache-path"); CmdArgs.push_back(Args.MakeArgString(DefaultModuleCache)); } // Parse additional include paths from environment variables. // FIXME: We should probably sink the logic for handling these from the // frontend into the driver. It will allow deleting 4 otherwise unused flags. // CPATH - included following the user specified includes (but prior to // builtin and standard includes). addDirectoryList(Args, CmdArgs, "-I", "CPATH"); // C_INCLUDE_PATH - system includes enabled when compiling C. addDirectoryList(Args, CmdArgs, "-c-isystem", "C_INCLUDE_PATH"); // CPLUS_INCLUDE_PATH - system includes enabled when compiling C++. addDirectoryList(Args, CmdArgs, "-cxx-isystem", "CPLUS_INCLUDE_PATH"); // OBJC_INCLUDE_PATH - system includes enabled when compiling ObjC. addDirectoryList(Args, CmdArgs, "-objc-isystem", "OBJC_INCLUDE_PATH"); // OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++. addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH"); // Add C++ include arguments, if needed. if (types::isCXX(Inputs[0].getType())) getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs); // Add system include arguments. getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs); } /// getLLVMArchSuffixForARM - Get the LLVM arch name to use for a particular /// CPU. // // FIXME: This is redundant with -mcpu, why does LLVM use this. // FIXME: tblgen this, or kill it! static const char *getLLVMArchSuffixForARM(StringRef CPU) { return llvm::StringSwitch(CPU) .Cases("arm7tdmi", "arm7tdmi-s", "arm710t", "v4t") .Cases("arm720t", "arm9", "arm9tdmi", "v4t") .Cases("arm920", "arm920t", "arm922t", "v4t") .Cases("arm940t", "ep9312","v4t") .Cases("arm10tdmi", "arm1020t", "v5") .Cases("arm9e", "arm926ej-s", "arm946e-s", "v5e") .Cases("arm966e-s", "arm968e-s", "arm10e", "v5e") .Cases("arm1020e", "arm1022e", "xscale", "iwmmxt", "v5e") .Cases("arm1136j-s", "arm1136jf-s", "arm1176jz-s", "v6") .Cases("arm1176jzf-s", "mpcorenovfp", "mpcore", "v6") .Cases("arm1156t2-s", "arm1156t2f-s", "v6t2") .Cases("cortex-a8", "cortex-a9", "cortex-a15", "v7") .Case("cortex-m3", "v7m") .Case("cortex-m4", "v7m") .Case("cortex-m0", "v6m") .Case("cortex-a9-mp", "v7f") .Case("swift", "v7s") .Default(""); } /// getARMTargetCPU - Get the (LLVM) name of the ARM cpu we are targeting. // // FIXME: tblgen this. static std::string getARMTargetCPU(const ArgList &Args, const llvm::Triple &Triple) { // FIXME: Warn on inconsistent use of -mcpu and -march. // If we have -mcpu=, use that. if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { StringRef MCPU = A->getValue(); // Handle -mcpu=native. if (MCPU == "native") return llvm::sys::getHostCPUName(); else return MCPU; } StringRef MArch; if (Arg *A = Args.getLastArg(options::OPT_march_EQ)) { // Otherwise, if we have -march= choose the base CPU for that arch. MArch = A->getValue(); } else { // Otherwise, use the Arch from the triple. MArch = Triple.getArchName(); } // Handle -march=native. std::string NativeMArch; if (MArch == "native") { std::string CPU = llvm::sys::getHostCPUName(); if (CPU != "generic") { // Translate the native cpu into the architecture. The switch below will // then chose the minimum cpu for that arch. NativeMArch = std::string("arm") + getLLVMArchSuffixForARM(CPU); MArch = NativeMArch; } } return llvm::StringSwitch(MArch) .Cases("armv2", "armv2a","arm2") .Case("armv3", "arm6") .Case("armv3m", "arm7m") .Cases("armv4", "armv4t", "arm7tdmi") .Cases("armv5", "armv5t", "arm10tdmi") .Cases("armv5e", "armv5te", "arm1022e") .Case("armv5tej", "arm926ej-s") .Cases("armv6", "armv6k", "arm1136jf-s") .Case("armv6j", "arm1136j-s") .Cases("armv6z", "armv6zk", "arm1176jzf-s") .Case("armv6t2", "arm1156t2-s") .Cases("armv7", "armv7a", "armv7-a", "cortex-a8") .Cases("armv7f", "armv7-f", "cortex-a9-mp") .Cases("armv7s", "armv7-s", "swift") .Cases("armv7r", "armv7-r", "cortex-r4") .Cases("armv7m", "armv7-m", "cortex-m3") .Case("ep9312", "ep9312") .Case("iwmmxt", "iwmmxt") .Case("xscale", "xscale") .Cases("armv6m", "armv6-m", "cortex-m0") // If all else failed, return the most base CPU LLVM supports. .Default("arm7tdmi"); } // FIXME: Move to target hook. static bool isSignedCharDefault(const llvm::Triple &Triple) { switch (Triple.getArch()) { default: return true; case llvm::Triple::arm: case llvm::Triple::ppc: case llvm::Triple::ppc64: if (Triple.isOSDarwin()) return true; return false; } } // Handle -mfpu=. // // FIXME: Centralize feature selection, defaulting shouldn't be also in the // frontend target. static void addFPUArgs(const Driver &D, const Arg *A, const ArgList &Args, ArgStringList &CmdArgs) { StringRef FPU = A->getValue(); // Set the target features based on the FPU. if (FPU == "fpa" || FPU == "fpe2" || FPU == "fpe3" || FPU == "maverick") { // Disable any default FPU support. CmdArgs.push_back("-target-feature"); CmdArgs.push_back("-vfp2"); CmdArgs.push_back("-target-feature"); CmdArgs.push_back("-vfp3"); CmdArgs.push_back("-target-feature"); CmdArgs.push_back("-neon"); } else if (FPU == "vfp3-d16" || FPU == "vfpv3-d16") { CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+vfp3"); CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+d16"); CmdArgs.push_back("-target-feature"); CmdArgs.push_back("-neon"); } else if (FPU == "vfp") { CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+vfp2"); CmdArgs.push_back("-target-feature"); CmdArgs.push_back("-neon"); } else if (FPU == "vfp3" || FPU == "vfpv3") { CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+vfp3"); CmdArgs.push_back("-target-feature"); CmdArgs.push_back("-neon"); } else if (FPU == "neon") { CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+neon"); } else D.Diag(diag::err_drv_clang_unsupported) << A->getAsString(Args); } // Handle -mfpmath=. static void addFPMathArgs(const Driver &D, const Arg *A, const ArgList &Args, ArgStringList &CmdArgs, StringRef CPU) { StringRef FPMath = A->getValue(); // Set the target features based on the FPMath. if (FPMath == "neon") { CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+neonfp"); if (CPU != "cortex-a8" && CPU != "cortex-a9" && CPU != "cortex-a9-mp" && CPU != "cortex-a15") D.Diag(diag::err_drv_invalid_feature) << "-mfpmath=neon" << CPU; } else if (FPMath == "vfp" || FPMath == "vfp2" || FPMath == "vfp3" || FPMath == "vfp4") { CmdArgs.push_back("-target-feature"); CmdArgs.push_back("-neonfp"); // FIXME: Add warnings when disabling a feature not present for a given CPU. } else D.Diag(diag::err_drv_clang_unsupported) << A->getAsString(Args); } // Select the float ABI as determined by -msoft-float, -mhard-float, and // -mfloat-abi=. static StringRef getARMFloatABI(const Driver &D, const ArgList &Args, const llvm::Triple &Triple) { StringRef FloatABI; if (Arg *A = Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float, options::OPT_mfloat_abi_EQ)) { if (A->getOption().matches(options::OPT_msoft_float)) FloatABI = "soft"; else if (A->getOption().matches(options::OPT_mhard_float)) FloatABI = "hard"; else { FloatABI = A->getValue(); if (FloatABI != "soft" && FloatABI != "softfp" && FloatABI != "hard") { D.Diag(diag::err_drv_invalid_mfloat_abi) << A->getAsString(Args); FloatABI = "soft"; } } } // If unspecified, choose the default based on the platform. if (FloatABI.empty()) { switch (Triple.getOS()) { case llvm::Triple::Darwin: case llvm::Triple::MacOSX: case llvm::Triple::IOS: { // Darwin defaults to "softfp" for v6 and v7. // // FIXME: Factor out an ARM class so we can cache the arch somewhere. std::string ArchName = getLLVMArchSuffixForARM(getARMTargetCPU(Args, Triple)); if (StringRef(ArchName).startswith("v6") || StringRef(ArchName).startswith("v7")) FloatABI = "softfp"; else FloatABI = "soft"; break; } case llvm::Triple::FreeBSD: // FreeBSD defaults to soft float FloatABI = "soft"; break; default: switch(Triple.getEnvironment()) { case llvm::Triple::GNUEABIHF: FloatABI = "hard"; break; case llvm::Triple::GNUEABI: FloatABI = "softfp"; break; case llvm::Triple::EABI: // EABI is always AAPCS, and if it was not marked 'hard', it's softfp FloatABI = "softfp"; break; case llvm::Triple::Android: { std::string ArchName = getLLVMArchSuffixForARM(getARMTargetCPU(Args, Triple)); if (StringRef(ArchName).startswith("v7")) FloatABI = "softfp"; else FloatABI = "soft"; break; } default: // Assume "soft", but warn the user we are guessing. FloatABI = "soft"; D.Diag(diag::warn_drv_assuming_mfloat_abi_is) << "soft"; break; } } } return FloatABI; } void Clang::AddARMTargetArgs(const ArgList &Args, ArgStringList &CmdArgs, bool KernelOrKext) const { const Driver &D = getToolChain().getDriver(); // Get the effective triple, which takes into account the deployment target. std::string TripleStr = getToolChain().ComputeEffectiveClangTriple(Args); llvm::Triple Triple(TripleStr); std::string CPUName = getARMTargetCPU(Args, Triple); // Select the ABI to use. // // FIXME: Support -meabi. const char *ABIName = 0; if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) { ABIName = A->getValue(); } else if (Triple.isOSDarwin()) { // The backend is hardwired to assume AAPCS for M-class processors, ensure // the frontend matches that. if (StringRef(CPUName).startswith("cortex-m")) { ABIName = "aapcs"; } else { ABIName = "apcs-gnu"; } } else { // Select the default based on the platform. switch(Triple.getEnvironment()) { case llvm::Triple::Android: case llvm::Triple::GNUEABI: case llvm::Triple::GNUEABIHF: ABIName = "aapcs-linux"; break; case llvm::Triple::EABI: ABIName = "aapcs"; break; default: ABIName = "apcs-gnu"; } } CmdArgs.push_back("-target-abi"); CmdArgs.push_back(ABIName); // Set the CPU based on -march= and -mcpu=. CmdArgs.push_back("-target-cpu"); CmdArgs.push_back(Args.MakeArgString(CPUName)); // Determine floating point ABI from the options & target defaults. StringRef FloatABI = getARMFloatABI(D, Args, Triple); if (FloatABI == "soft") { // Floating point operations and argument passing are soft. // // FIXME: This changes CPP defines, we need -target-soft-float. CmdArgs.push_back("-msoft-float"); CmdArgs.push_back("-mfloat-abi"); CmdArgs.push_back("soft"); } else if (FloatABI == "softfp") { // Floating point operations are hard, but argument passing is soft. CmdArgs.push_back("-mfloat-abi"); CmdArgs.push_back("soft"); } else { // Floating point operations and argument passing are hard. assert(FloatABI == "hard" && "Invalid float abi!"); CmdArgs.push_back("-mfloat-abi"); CmdArgs.push_back("hard"); } // Set appropriate target features for floating point mode. // // FIXME: Note, this is a hack, the LLVM backend doesn't actually use these // yet (it uses the -mfloat-abi and -msoft-float options above), and it is // stripped out by the ARM target. // Use software floating point operations? if (FloatABI == "soft") { CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+soft-float"); } // Use software floating point argument passing? if (FloatABI != "hard") { CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+soft-float-abi"); } // Honor -mfpu=. if (const Arg *A = Args.getLastArg(options::OPT_mfpu_EQ)) addFPUArgs(D, A, Args, CmdArgs); // Honor -mfpmath=. if (const Arg *A = Args.getLastArg(options::OPT_mfpmath_EQ)) addFPMathArgs(D, A, Args, CmdArgs, getARMTargetCPU(Args, Triple)); // Setting -msoft-float effectively disables NEON because of the GCC // implementation, although the same isn't true of VFP or VFP3. if (FloatABI == "soft") { CmdArgs.push_back("-target-feature"); CmdArgs.push_back("-neon"); } // Kernel code has more strict alignment requirements. if (KernelOrKext) { if (Triple.getOS() != llvm::Triple::IOS || Triple.isOSVersionLT(6)) { CmdArgs.push_back("-backend-option"); CmdArgs.push_back("-arm-long-calls"); } CmdArgs.push_back("-backend-option"); CmdArgs.push_back("-arm-strict-align"); // The kext linker doesn't know how to deal with movw/movt. CmdArgs.push_back("-backend-option"); CmdArgs.push_back("-arm-darwin-use-movt=0"); } // Setting -mno-global-merge disables the codegen global merge pass. Setting // -mglobal-merge has no effect as the pass is enabled by default. if (Arg *A = Args.getLastArg(options::OPT_mglobal_merge, options::OPT_mno_global_merge)) { if (A->getOption().matches(options::OPT_mno_global_merge)) CmdArgs.push_back("-mno-global-merge"); } if (Args.hasArg(options::OPT_mno_implicit_float)) CmdArgs.push_back("-no-implicit-float"); } // Translate MIPS CPU name alias option to CPU name. static StringRef getMipsCPUFromAlias(const Arg &A) { if (A.getOption().matches(options::OPT_mips32)) return "mips32"; if (A.getOption().matches(options::OPT_mips32r2)) return "mips32r2"; if (A.getOption().matches(options::OPT_mips64)) return "mips64"; if (A.getOption().matches(options::OPT_mips64r2)) return "mips64r2"; llvm_unreachable("Unexpected option"); return ""; } // Get CPU and ABI names. They are not independent // so we have to calculate them together. static void getMipsCPUAndABI(const ArgList &Args, const ToolChain &TC, StringRef &CPUName, StringRef &ABIName) { const char *DefMips32CPU = "mips32"; const char *DefMips64CPU = "mips64"; if (Arg *A = Args.getLastArg(options::OPT_march_EQ, options::OPT_mcpu_EQ, options::OPT_mips_CPUs_Group)) { if (A->getOption().matches(options::OPT_mips_CPUs_Group)) CPUName = getMipsCPUFromAlias(*A); else CPUName = A->getValue(); } if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) ABIName = A->getValue(); // Setup default CPU and ABI names. if (CPUName.empty() && ABIName.empty()) { switch (TC.getTriple().getArch()) { default: llvm_unreachable("Unexpected triple arch name"); case llvm::Triple::mips: case llvm::Triple::mipsel: CPUName = DefMips32CPU; break; case llvm::Triple::mips64: case llvm::Triple::mips64el: CPUName = DefMips64CPU; break; } } if (!ABIName.empty()) { // Deduce CPU name from ABI name. CPUName = llvm::StringSwitch(ABIName) .Cases("o32", "eabi", DefMips32CPU) .Cases("n32", "n64", DefMips64CPU) .Default(""); } else if (!CPUName.empty()) { // Deduce ABI name from CPU name. ABIName = llvm::StringSwitch(CPUName) .Cases("mips32", "mips32r2", "o32") .Cases("mips64", "mips64r2", "n64") .Default(""); } // FIXME: Warn on inconsistent cpu and abi usage. } // Select the MIPS float ABI as determined by -msoft-float, -mhard-float, // and -mfloat-abi=. static StringRef getMipsFloatABI(const Driver &D, const ArgList &Args) { // Select the float ABI as determined by -msoft-float, -mhard-float, // and -mfloat-abi=. StringRef FloatABI; if (Arg *A = Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float, options::OPT_mfloat_abi_EQ)) { if (A->getOption().matches(options::OPT_msoft_float)) FloatABI = "soft"; else if (A->getOption().matches(options::OPT_mhard_float)) FloatABI = "hard"; else { FloatABI = A->getValue(); if (FloatABI != "soft" && FloatABI != "single" && FloatABI != "hard") { D.Diag(diag::err_drv_invalid_mfloat_abi) << A->getAsString(Args); FloatABI = "hard"; } } } // If unspecified, choose the default based on the platform. if (FloatABI.empty()) { // Assume "hard", because it's a default value used by gcc. // When we start to recognize specific target MIPS processors, // we will be able to select the default more correctly. FloatABI = "hard"; } return FloatABI; } static void AddTargetFeature(const ArgList &Args, ArgStringList &CmdArgs, OptSpecifier OnOpt, OptSpecifier OffOpt, StringRef FeatureName) { if (Arg *A = Args.getLastArg(OnOpt, OffOpt)) { CmdArgs.push_back("-target-feature"); if (A->getOption().matches(OnOpt)) CmdArgs.push_back(Args.MakeArgString("+" + FeatureName)); else CmdArgs.push_back(Args.MakeArgString("-" + FeatureName)); } } void Clang::AddMIPSTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const { const Driver &D = getToolChain().getDriver(); StringRef CPUName; StringRef ABIName; getMipsCPUAndABI(Args, getToolChain(), CPUName, ABIName); CmdArgs.push_back("-target-cpu"); CmdArgs.push_back(CPUName.data()); CmdArgs.push_back("-target-abi"); CmdArgs.push_back(ABIName.data()); StringRef FloatABI = getMipsFloatABI(D, Args); if (FloatABI == "soft") { // Floating point operations and argument passing are soft. CmdArgs.push_back("-msoft-float"); CmdArgs.push_back("-mfloat-abi"); CmdArgs.push_back("soft"); // FIXME: Note, this is a hack. We need to pass the selected float // mode to the MipsTargetInfoBase to define appropriate macros there. // Now it is the only method. CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+soft-float"); } else if (FloatABI == "single") { // Restrict the use of hardware floating-point // instructions to 32-bit operations. CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+single-float"); } else { // Floating point operations and argument passing are hard. assert(FloatABI == "hard" && "Invalid float abi!"); CmdArgs.push_back("-mfloat-abi"); CmdArgs.push_back("hard"); } AddTargetFeature(Args, CmdArgs, options::OPT_mips16, options::OPT_mno_mips16, "mips16"); AddTargetFeature(Args, CmdArgs, options::OPT_mdsp, options::OPT_mno_dsp, "dsp"); AddTargetFeature(Args, CmdArgs, options::OPT_mdspr2, options::OPT_mno_dspr2, "dspr2"); if (Arg *A = Args.getLastArg(options::OPT_G)) { StringRef v = A->getValue(); CmdArgs.push_back("-mllvm"); CmdArgs.push_back(Args.MakeArgString("-mips-ssection-threshold=" + v)); A->claim(); } } /// getPPCTargetCPU - Get the (LLVM) name of the PowerPC cpu we are targeting. static std::string getPPCTargetCPU(const ArgList &Args) { if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { StringRef CPUName = A->getValue(); if (CPUName == "native") { std::string CPU = llvm::sys::getHostCPUName(); if (!CPU.empty() && CPU != "generic") return CPU; else return ""; } return llvm::StringSwitch(CPUName) .Case("common", "generic") .Case("440", "440") .Case("440fp", "440") .Case("450", "450") .Case("601", "601") .Case("602", "602") .Case("603", "603") .Case("603e", "603e") .Case("603ev", "603ev") .Case("604", "604") .Case("604e", "604e") .Case("620", "620") .Case("G3", "g3") .Case("7400", "7400") .Case("G4", "g4") .Case("7450", "7450") .Case("G4+", "g4+") .Case("750", "750") .Case("970", "970") .Case("G5", "g5") .Case("a2", "a2") .Case("e500mc", "e500mc") .Case("e5500", "e5500") .Case("power6", "pwr6") .Case("power7", "pwr7") .Case("powerpc", "ppc") .Case("powerpc64", "ppc64") .Default(""); } return ""; } void Clang::AddPPCTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const { std::string TargetCPUName = getPPCTargetCPU(Args); // LLVM may default to generating code for the native CPU, // but, like gcc, we default to a more generic option for // each architecture. (except on Darwin) llvm::Triple Triple = getToolChain().getTriple(); if (TargetCPUName.empty() && !Triple.isOSDarwin()) { if (Triple.getArch() == llvm::Triple::ppc64) TargetCPUName = "ppc64"; else TargetCPUName = "ppc"; } if (!TargetCPUName.empty()) { CmdArgs.push_back("-target-cpu"); CmdArgs.push_back(Args.MakeArgString(TargetCPUName.c_str())); } } void Clang::AddSparcTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const { const Driver &D = getToolChain().getDriver(); if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { CmdArgs.push_back("-target-cpu"); CmdArgs.push_back(A->getValue()); } // Select the float ABI as determined by -msoft-float, -mhard-float, and StringRef FloatABI; if (Arg *A = Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float)) { if (A->getOption().matches(options::OPT_msoft_float)) FloatABI = "soft"; else if (A->getOption().matches(options::OPT_mhard_float)) FloatABI = "hard"; } // If unspecified, choose the default based on the platform. if (FloatABI.empty()) { switch (getToolChain().getTriple().getOS()) { default: // Assume "soft", but warn the user we are guessing. FloatABI = "soft"; D.Diag(diag::warn_drv_assuming_mfloat_abi_is) << "soft"; break; } } if (FloatABI == "soft") { // Floating point operations and argument passing are soft. // // FIXME: This changes CPP defines, we need -target-soft-float. CmdArgs.push_back("-msoft-float"); CmdArgs.push_back("-target-feature"); CmdArgs.push_back("+soft-float"); } else { assert(FloatABI == "hard" && "Invalid float abi!"); CmdArgs.push_back("-mhard-float"); } } +static const char *getX86TargetCPU(const ArgList &Args, + const llvm::Triple &Triple) { + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { + if (StringRef(A->getValue()) != "native") + return A->getValue(); + + // FIXME: Reject attempts to use -march=native unless the target matches + // the host. + // + // FIXME: We should also incorporate the detected target features for use + // with -native. + std::string CPU = llvm::sys::getHostCPUName(); + if (!CPU.empty() && CPU != "generic") + return Args.MakeArgString(CPU); + } + + // Select the default CPU if none was given (or detection failed). + + if (Triple.getArch() != llvm::Triple::x86_64 && + Triple.getArch() != llvm::Triple::x86) + return 0; // This routine is only handling x86 targets. + + bool Is64Bit = Triple.getArch() == llvm::Triple::x86_64; + + // FIXME: Need target hooks. + if (Triple.isOSDarwin()) + return Is64Bit ? "core2" : "yonah"; + + // Everything else goes to x86-64 in 64-bit mode. + if (Is64Bit) + return "x86-64"; + + if (Triple.getOSName().startswith("haiku")) + return "i586"; + if (Triple.getOSName().startswith("openbsd")) + return "i486"; + if (Triple.getOSName().startswith("bitrig")) + return "i686"; + if (Triple.getOSName().startswith("freebsd")) + return "i486"; + if (Triple.getOSName().startswith("netbsd")) + return "i486"; + // All x86 devices running Android have core2 as their common + // denominator. This makes a better choice than pentium4. + if (Triple.getEnvironment() == llvm::Triple::Android) + return "core2"; + + // Fallback to p4. + return "pentium4"; +} + void Clang::AddX86TargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const { - const bool isAndroid = - getToolChain().getTriple().getEnvironment() == llvm::Triple::Android; if (!Args.hasFlag(options::OPT_mred_zone, options::OPT_mno_red_zone, true) || Args.hasArg(options::OPT_mkernel) || Args.hasArg(options::OPT_fapple_kext)) CmdArgs.push_back("-disable-red-zone"); if (Args.hasFlag(options::OPT_msoft_float, options::OPT_mno_soft_float, false)) CmdArgs.push_back("-no-implicit-float"); - const char *CPUName = 0; - if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { - if (StringRef(A->getValue()) == "native") { - // FIXME: Reject attempts to use -march=native unless the target matches - // the host. - // - // FIXME: We should also incorporate the detected target features for use - // with -native. - std::string CPU = llvm::sys::getHostCPUName(); - if (!CPU.empty() && CPU != "generic") - CPUName = Args.MakeArgString(CPU); - } else - CPUName = A->getValue(); - } - - // Select the default CPU if none was given (or detection failed). - if (!CPUName) { - // FIXME: Need target hooks. - if (getToolChain().getTriple().isOSDarwin()) { - if (getToolChain().getArch() == llvm::Triple::x86_64) - CPUName = "core2"; - else if (getToolChain().getArch() == llvm::Triple::x86) - CPUName = "yonah"; - } else if (getToolChain().getOS().startswith("haiku")) { - if (getToolChain().getArch() == llvm::Triple::x86_64) - CPUName = "x86-64"; - else if (getToolChain().getArch() == llvm::Triple::x86) - CPUName = "i586"; - } else if (getToolChain().getOS().startswith("openbsd")) { - if (getToolChain().getArch() == llvm::Triple::x86_64) - CPUName = "x86-64"; - else if (getToolChain().getArch() == llvm::Triple::x86) - CPUName = "i486"; - } else if (getToolChain().getOS().startswith("bitrig")) { - if (getToolChain().getArch() == llvm::Triple::x86_64) - CPUName = "x86-64"; - else if (getToolChain().getArch() == llvm::Triple::x86) - CPUName = "i686"; - } else if (getToolChain().getOS().startswith("freebsd")) { - if (getToolChain().getArch() == llvm::Triple::x86_64) - CPUName = "x86-64"; - else if (getToolChain().getArch() == llvm::Triple::x86) - CPUName = "i486"; - } else if (getToolChain().getOS().startswith("netbsd")) { - if (getToolChain().getArch() == llvm::Triple::x86_64) - CPUName = "x86-64"; - else if (getToolChain().getArch() == llvm::Triple::x86) - CPUName = "i486"; - } else { - if (getToolChain().getArch() == llvm::Triple::x86_64) - CPUName = "x86-64"; - else if (getToolChain().getArch() == llvm::Triple::x86) - // All x86 devices running Android have core2 as their common - // denominator. This makes a better choice than pentium4. - CPUName = isAndroid ? "core2" : "pentium4"; - } - } - - if (CPUName) { + if (const char *CPUName = getX86TargetCPU(Args, getToolChain().getTriple())) { CmdArgs.push_back("-target-cpu"); CmdArgs.push_back(CPUName); } // The required algorithm here is slightly strange: the options are applied // in order (so -mno-sse -msse2 disables SSE3), but any option that gets // directly overridden later is ignored (so "-mno-sse -msse2 -mno-sse2 -msse" // is equivalent to "-mno-sse2 -msse"). The -cc1 handling deals with the // former correctly, but not the latter; handle directly-overridden // attributes here. llvm::StringMap PrevFeature; std::vector Features; for (arg_iterator it = Args.filtered_begin(options::OPT_m_x86_Features_Group), ie = Args.filtered_end(); it != ie; ++it) { StringRef Name = (*it)->getOption().getName(); (*it)->claim(); // Skip over "-m". assert(Name.startswith("m") && "Invalid feature name."); Name = Name.substr(1); bool IsNegative = Name.startswith("no-"); if (IsNegative) Name = Name.substr(3); unsigned& Prev = PrevFeature[Name]; if (Prev) Features[Prev - 1] = 0; Prev = Features.size() + 1; Features.push_back(Args.MakeArgString((IsNegative ? "-" : "+") + Name)); } for (unsigned i = 0; i < Features.size(); i++) { if (Features[i]) { CmdArgs.push_back("-target-feature"); CmdArgs.push_back(Features[i]); } } } static Arg* getLastHexagonArchArg (const ArgList &Args) { Arg * A = NULL; for (ArgList::const_iterator it = Args.begin(), ie = Args.end(); it != ie; ++it) { if ((*it)->getOption().matches(options::OPT_march_EQ) || (*it)->getOption().matches(options::OPT_mcpu_EQ)) { A = *it; A->claim(); } else if ((*it)->getOption().matches(options::OPT_m_Joined)){ StringRef Value = (*it)->getValue(0); if (Value.startswith("v")) { A = *it; A->claim(); } } } return A; } static StringRef getHexagonTargetCPU(const ArgList &Args) { Arg *A; llvm::StringRef WhichHexagon; // Select the default CPU (v4) if none was given or detection failed. if ((A = getLastHexagonArchArg (Args))) { WhichHexagon = A->getValue(); if (WhichHexagon == "") return "v4"; else return WhichHexagon; } else return "v4"; } void Clang::AddHexagonTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const { llvm::Triple Triple = getToolChain().getTriple(); CmdArgs.push_back("-target-cpu"); CmdArgs.push_back(Args.MakeArgString("hexagon" + getHexagonTargetCPU(Args))); CmdArgs.push_back("-fno-signed-char"); CmdArgs.push_back("-nobuiltininc"); if (Args.hasArg(options::OPT_mqdsp6_compat)) CmdArgs.push_back("-mqdsp6-compat"); if (Arg *A = Args.getLastArg(options::OPT_G, options::OPT_msmall_data_threshold_EQ)) { std::string SmallDataThreshold="-small-data-threshold="; SmallDataThreshold += A->getValue(); CmdArgs.push_back ("-mllvm"); CmdArgs.push_back(Args.MakeArgString(SmallDataThreshold)); A->claim(); } if (!Args.hasArg(options::OPT_fno_short_enums)) CmdArgs.push_back("-fshort-enums"); if (Args.getLastArg(options::OPT_mieee_rnd_near)) { CmdArgs.push_back ("-mllvm"); CmdArgs.push_back ("-enable-hexagon-ieee-rnd-near"); } CmdArgs.push_back ("-mllvm"); CmdArgs.push_back ("-machine-sink-split=0"); } static bool shouldUseExceptionTablesForObjCExceptions(const ObjCRuntime &runtime, const llvm::Triple &Triple) { // We use the zero-cost exception tables for Objective-C if the non-fragile // ABI is enabled or when compiling for x86_64 and ARM on Snow Leopard and // later. if (runtime.isNonFragile()) return true; if (!Triple.isOSDarwin()) return false; return (!Triple.isMacOSXVersionLT(10,5) && (Triple.getArch() == llvm::Triple::x86_64 || Triple.getArch() == llvm::Triple::arm)); } /// addExceptionArgs - Adds exception related arguments to the driver command /// arguments. There's a master flag, -fexceptions and also language specific /// flags to enable/disable C++ and Objective-C exceptions. /// This makes it possible to for example disable C++ exceptions but enable /// Objective-C exceptions. static void addExceptionArgs(const ArgList &Args, types::ID InputType, const llvm::Triple &Triple, bool KernelOrKext, const ObjCRuntime &objcRuntime, ArgStringList &CmdArgs) { if (KernelOrKext) { // -mkernel and -fapple-kext imply no exceptions, so claim exception related // arguments now to avoid warnings about unused arguments. Args.ClaimAllArgs(options::OPT_fexceptions); Args.ClaimAllArgs(options::OPT_fno_exceptions); Args.ClaimAllArgs(options::OPT_fobjc_exceptions); Args.ClaimAllArgs(options::OPT_fno_objc_exceptions); Args.ClaimAllArgs(options::OPT_fcxx_exceptions); Args.ClaimAllArgs(options::OPT_fno_cxx_exceptions); return; } // Exceptions are enabled by default. bool ExceptionsEnabled = true; // This keeps track of whether exceptions were explicitly turned on or off. bool DidHaveExplicitExceptionFlag = false; if (Arg *A = Args.getLastArg(options::OPT_fexceptions, options::OPT_fno_exceptions)) { if (A->getOption().matches(options::OPT_fexceptions)) ExceptionsEnabled = true; else ExceptionsEnabled = false; DidHaveExplicitExceptionFlag = true; } bool ShouldUseExceptionTables = false; // Exception tables and cleanups can be enabled with -fexceptions even if the // language itself doesn't support exceptions. if (ExceptionsEnabled && DidHaveExplicitExceptionFlag) ShouldUseExceptionTables = true; // Obj-C exceptions are enabled by default, regardless of -fexceptions. This // is not necessarily sensible, but follows GCC. if (types::isObjC(InputType) && Args.hasFlag(options::OPT_fobjc_exceptions, options::OPT_fno_objc_exceptions, true)) { CmdArgs.push_back("-fobjc-exceptions"); ShouldUseExceptionTables |= shouldUseExceptionTablesForObjCExceptions(objcRuntime, Triple); } if (types::isCXX(InputType)) { bool CXXExceptionsEnabled = ExceptionsEnabled; if (Arg *A = Args.getLastArg(options::OPT_fcxx_exceptions, options::OPT_fno_cxx_exceptions, options::OPT_fexceptions, options::OPT_fno_exceptions)) { if (A->getOption().matches(options::OPT_fcxx_exceptions)) CXXExceptionsEnabled = true; else if (A->getOption().matches(options::OPT_fno_cxx_exceptions)) CXXExceptionsEnabled = false; } if (CXXExceptionsEnabled) { CmdArgs.push_back("-fcxx-exceptions"); ShouldUseExceptionTables = true; } } if (ShouldUseExceptionTables) CmdArgs.push_back("-fexceptions"); } static bool ShouldDisableCFI(const ArgList &Args, const ToolChain &TC) { bool Default = true; if (TC.getTriple().isOSDarwin()) { // The native darwin assembler doesn't support cfi directives, so // we disable them if we think the .s file will be passed to it. Default = Args.hasFlag(options::OPT_integrated_as, options::OPT_no_integrated_as, TC.IsIntegratedAssemblerDefault()); } return !Args.hasFlag(options::OPT_fdwarf2_cfi_asm, options::OPT_fno_dwarf2_cfi_asm, Default); } static bool ShouldDisableDwarfDirectory(const ArgList &Args, const ToolChain &TC) { bool IsIADefault = TC.IsIntegratedAssemblerDefault(); bool UseIntegratedAs = Args.hasFlag(options::OPT_integrated_as, options::OPT_no_integrated_as, IsIADefault); bool UseDwarfDirectory = Args.hasFlag(options::OPT_fdwarf_directory_asm, options::OPT_fno_dwarf_directory_asm, UseIntegratedAs); return !UseDwarfDirectory; } /// \brief Check whether the given input tree contains any compilation actions. static bool ContainsCompileAction(const Action *A) { if (isa(A)) return true; for (Action::const_iterator it = A->begin(), ie = A->end(); it != ie; ++it) if (ContainsCompileAction(*it)) return true; return false; } /// \brief Check if -relax-all should be passed to the internal assembler. /// This is done by default when compiling non-assembler source with -O0. static bool UseRelaxAll(Compilation &C, const ArgList &Args) { bool RelaxDefault = true; if (Arg *A = Args.getLastArg(options::OPT_O_Group)) RelaxDefault = A->getOption().matches(options::OPT_O0); if (RelaxDefault) { RelaxDefault = false; for (ActionList::const_iterator it = C.getActions().begin(), ie = C.getActions().end(); it != ie; ++it) { if (ContainsCompileAction(*it)) { RelaxDefault = true; break; } } } return Args.hasFlag(options::OPT_mrelax_all, options::OPT_mno_relax_all, RelaxDefault); } SanitizerArgs::SanitizerArgs(const Driver &D, const ArgList &Args) { Kind = 0; const Arg *AsanArg, *TsanArg, *UbsanArg; for (ArgList::const_iterator I = Args.begin(), E = Args.end(); I != E; ++I) { unsigned Add = 0, Remove = 0; const char *DeprecatedReplacement = 0; if ((*I)->getOption().matches(options::OPT_faddress_sanitizer)) { Add = Address; DeprecatedReplacement = "-fsanitize=address"; } else if ((*I)->getOption().matches(options::OPT_fno_address_sanitizer)) { Remove = Address; DeprecatedReplacement = "-fno-sanitize=address"; } else if ((*I)->getOption().matches(options::OPT_fthread_sanitizer)) { Add = Thread; DeprecatedReplacement = "-fsanitize=thread"; } else if ((*I)->getOption().matches(options::OPT_fno_thread_sanitizer)) { Remove = Thread; DeprecatedReplacement = "-fno-sanitize=thread"; } else if ((*I)->getOption().matches(options::OPT_fcatch_undefined_behavior)) { Add = Undefined; DeprecatedReplacement = "-fsanitize=undefined"; } else if ((*I)->getOption().matches(options::OPT_fsanitize_EQ)) { Add = parse(D, *I); } else if ((*I)->getOption().matches(options::OPT_fno_sanitize_EQ)) { Remove = parse(D, *I); } else { continue; } (*I)->claim(); Kind |= Add; Kind &= ~Remove; if (Add & NeedsAsanRt) AsanArg = *I; if (Add & NeedsTsanRt) TsanArg = *I; if (Add & NeedsUbsanRt) UbsanArg = *I; // If this is a deprecated synonym, produce a warning directing users // towards the new spelling. if (DeprecatedReplacement) D.Diag(diag::warn_drv_deprecated_arg) << (*I)->getAsString(Args) << DeprecatedReplacement; } // Only one runtime library can be used at once. // FIXME: Allow Ubsan to be combined with the other two. bool NeedsAsan = needsAsanRt(); bool NeedsTsan = needsTsanRt(); bool NeedsUbsan = needsUbsanRt(); if (NeedsAsan + NeedsTsan + NeedsUbsan > 1) D.Diag(diag::err_drv_argument_not_allowed_with) << describeSanitizeArg(Args, NeedsAsan ? AsanArg : TsanArg, NeedsAsan ? NeedsAsanRt : NeedsTsanRt) << describeSanitizeArg(Args, NeedsUbsan ? UbsanArg : TsanArg, NeedsUbsan ? NeedsUbsanRt : NeedsTsanRt); } /// If AddressSanitizer is enabled, add appropriate linker flags (Linux). /// This needs to be called before we add the C run-time (malloc, etc). static void addAsanRTLinux(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { if(TC.getTriple().getEnvironment() == llvm::Triple::Android) { if (!Args.hasArg(options::OPT_shared)) { if (!Args.hasArg(options::OPT_pie)) TC.getDriver().Diag(diag::err_drv_asan_android_requires_pie); } SmallString<128> LibAsan(TC.getDriver().ResourceDir); llvm::sys::path::append(LibAsan, "lib", "linux", (Twine("libclang_rt.asan-") + TC.getArchName() + "-android.so")); CmdArgs.push_back(Args.MakeArgString(LibAsan)); } else { if (!Args.hasArg(options::OPT_shared)) { // LibAsan is "libclang_rt.asan-.a" in the Linux library // resource directory. SmallString<128> LibAsan(TC.getDriver().ResourceDir); llvm::sys::path::append(LibAsan, "lib", "linux", (Twine("libclang_rt.asan-") + TC.getArchName() + ".a")); CmdArgs.push_back(Args.MakeArgString(LibAsan)); CmdArgs.push_back("-lpthread"); CmdArgs.push_back("-ldl"); CmdArgs.push_back("-export-dynamic"); } } } /// If ThreadSanitizer is enabled, add appropriate linker flags (Linux). /// This needs to be called before we add the C run-time (malloc, etc). static void addTsanRTLinux(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { if (!Args.hasArg(options::OPT_shared)) { // LibTsan is "libclang_rt.tsan-.a" in the Linux library // resource directory. SmallString<128> LibTsan(TC.getDriver().ResourceDir); llvm::sys::path::append(LibTsan, "lib", "linux", (Twine("libclang_rt.tsan-") + TC.getArchName() + ".a")); CmdArgs.push_back(Args.MakeArgString(LibTsan)); CmdArgs.push_back("-lpthread"); CmdArgs.push_back("-ldl"); CmdArgs.push_back("-export-dynamic"); } } /// If UndefinedBehaviorSanitizer is enabled, add appropriate linker flags /// (Linux). static void addUbsanRTLinux(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { if (!Args.hasArg(options::OPT_shared)) { // LibUbsan is "libclang_rt.ubsan-.a" in the Linux library // resource directory. SmallString<128> LibUbsan(TC.getDriver().ResourceDir); llvm::sys::path::append(LibUbsan, "lib", "linux", (Twine("libclang_rt.ubsan-") + TC.getArchName() + ".a")); CmdArgs.push_back(Args.MakeArgString(LibUbsan)); CmdArgs.push_back("-lpthread"); } } static bool shouldUseFramePointer(const ArgList &Args, const llvm::Triple &Triple) { if (Arg *A = Args.getLastArg(options::OPT_fno_omit_frame_pointer, options::OPT_fomit_frame_pointer)) return A->getOption().matches(options::OPT_fno_omit_frame_pointer); // Don't use a frame pointer on linux x86 and x86_64 if optimizing. if ((Triple.getArch() == llvm::Triple::x86_64 || Triple.getArch() == llvm::Triple::x86) && Triple.getOS() == llvm::Triple::Linux) { if (Arg *A = Args.getLastArg(options::OPT_O_Group)) if (!A->getOption().matches(options::OPT_O0)) return false; } return true; } void Clang::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { bool KernelOrKext = Args.hasArg(options::OPT_mkernel, options::OPT_fapple_kext); const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; assert(Inputs.size() == 1 && "Unable to handle multiple inputs."); // Invoke ourselves in -cc1 mode. // // FIXME: Implement custom jobs for internal actions. CmdArgs.push_back("-cc1"); // Add the "effective" target triple. CmdArgs.push_back("-triple"); std::string TripleStr = getToolChain().ComputeEffectiveClangTriple(Args); CmdArgs.push_back(Args.MakeArgString(TripleStr)); // Select the appropriate action. RewriteKind rewriteKind = RK_None; if (isa(JA)) { assert(JA.getType() == types::TY_Plist && "Invalid output type."); CmdArgs.push_back("-analyze"); } else if (isa(JA)) { CmdArgs.push_back("-migrate"); } else if (isa(JA)) { if (Output.getType() == types::TY_Dependencies) CmdArgs.push_back("-Eonly"); else CmdArgs.push_back("-E"); } else if (isa(JA)) { CmdArgs.push_back("-emit-obj"); if (UseRelaxAll(C, Args)) CmdArgs.push_back("-mrelax-all"); // When using an integrated assembler, translate -Wa, and -Xassembler // options. for (arg_iterator it = Args.filtered_begin(options::OPT_Wa_COMMA, options::OPT_Xassembler), ie = Args.filtered_end(); it != ie; ++it) { const Arg *A = *it; A->claim(); for (unsigned i = 0, e = A->getNumValues(); i != e; ++i) { StringRef Value = A->getValue(i); if (Value == "-force_cpusubtype_ALL") { // Do nothing, this is the default and we don't support anything else. } else if (Value == "-L") { CmdArgs.push_back("-msave-temp-labels"); } else if (Value == "--fatal-warnings") { CmdArgs.push_back("-mllvm"); CmdArgs.push_back("-fatal-assembler-warnings"); } else if (Value == "--noexecstack") { CmdArgs.push_back("-mnoexecstack"); } else { D.Diag(diag::err_drv_unsupported_option_argument) << A->getOption().getName() << Value; } } } // Also ignore explicit -force_cpusubtype_ALL option. (void) Args.hasArg(options::OPT_force__cpusubtype__ALL); } else if (isa(JA)) { // Use PCH if the user requested it. bool UsePCH = D.CCCUsePCH; if (JA.getType() == types::TY_Nothing) CmdArgs.push_back("-fsyntax-only"); else if (UsePCH) CmdArgs.push_back("-emit-pch"); else CmdArgs.push_back("-emit-pth"); } else { assert(isa(JA) && "Invalid action for clang tool."); if (JA.getType() == types::TY_Nothing) { CmdArgs.push_back("-fsyntax-only"); } else if (JA.getType() == types::TY_LLVM_IR || JA.getType() == types::TY_LTO_IR) { CmdArgs.push_back("-emit-llvm"); } else if (JA.getType() == types::TY_LLVM_BC || JA.getType() == types::TY_LTO_BC) { CmdArgs.push_back("-emit-llvm-bc"); } else if (JA.getType() == types::TY_PP_Asm) { CmdArgs.push_back("-S"); } else if (JA.getType() == types::TY_AST) { CmdArgs.push_back("-emit-pch"); } else if (JA.getType() == types::TY_RewrittenObjC) { CmdArgs.push_back("-rewrite-objc"); rewriteKind = RK_NonFragile; } else if (JA.getType() == types::TY_RewrittenLegacyObjC) { CmdArgs.push_back("-rewrite-objc"); rewriteKind = RK_Fragile; } else { assert(JA.getType() == types::TY_PP_Asm && "Unexpected output type!"); } } // The make clang go fast button. CmdArgs.push_back("-disable-free"); // Disable the verification pass in -asserts builds. #ifdef NDEBUG CmdArgs.push_back("-disable-llvm-verifier"); #endif // Set the main file name, so that debug info works even with // -save-temps. CmdArgs.push_back("-main-file-name"); CmdArgs.push_back(darwin::CC1::getBaseInputName(Args, Inputs)); // Some flags which affect the language (via preprocessor // defines). See darwin::CC1::AddCPPArgs. if (Args.hasArg(options::OPT_static)) CmdArgs.push_back("-static-define"); if (isa(JA)) { // Enable region store model by default. CmdArgs.push_back("-analyzer-store=region"); // Treat blocks as analysis entry points. CmdArgs.push_back("-analyzer-opt-analyze-nested-blocks"); CmdArgs.push_back("-analyzer-eagerly-assume"); // Add default argument set. if (!Args.hasArg(options::OPT__analyzer_no_default_checks)) { CmdArgs.push_back("-analyzer-checker=core"); if (getToolChain().getTriple().getOS() != llvm::Triple::Win32) CmdArgs.push_back("-analyzer-checker=unix"); if (getToolChain().getTriple().getVendor() == llvm::Triple::Apple) CmdArgs.push_back("-analyzer-checker=osx"); CmdArgs.push_back("-analyzer-checker=deadcode"); // Enable the following experimental checkers for testing. CmdArgs.push_back("-analyzer-checker=security.insecureAPI.UncheckedReturn"); CmdArgs.push_back("-analyzer-checker=security.insecureAPI.getpw"); CmdArgs.push_back("-analyzer-checker=security.insecureAPI.gets"); CmdArgs.push_back("-analyzer-checker=security.insecureAPI.mktemp"); CmdArgs.push_back("-analyzer-checker=security.insecureAPI.mkstemp"); CmdArgs.push_back("-analyzer-checker=security.insecureAPI.vfork"); } // Set the output format. The default is plist, for (lame) historical // reasons. CmdArgs.push_back("-analyzer-output"); if (Arg *A = Args.getLastArg(options::OPT__analyzer_output)) CmdArgs.push_back(A->getValue()); else CmdArgs.push_back("plist"); // Disable the presentation of standard compiler warnings when // using --analyze. We only want to show static analyzer diagnostics // or frontend errors. CmdArgs.push_back("-w"); // Add -Xanalyzer arguments when running as analyzer. Args.AddAllArgValues(CmdArgs, options::OPT_Xanalyzer); } CheckCodeGenerationOptions(D, Args); // For the PIC and PIE flag options, this logic is different from the legacy // logic in very old versions of GCC, as that logic was just a bug no one had // ever fixed. This logic is both more rational and consistent with GCC's new // logic now that the bugs are fixed. The last argument relating to either // PIC or PIE wins, and no other argument is used. If the last argument is // any flavor of the '-fno-...' arguments, both PIC and PIE are disabled. Any // PIE option implicitly enables PIC at the same level. bool PIE = false; bool PIC = getToolChain().isPICDefault(); bool IsPICLevelTwo = PIC; if (Arg *A = Args.getLastArg(options::OPT_fPIC, options::OPT_fno_PIC, options::OPT_fpic, options::OPT_fno_pic, options::OPT_fPIE, options::OPT_fno_PIE, options::OPT_fpie, options::OPT_fno_pie)) { Option O = A->getOption(); if (O.matches(options::OPT_fPIC) || O.matches(options::OPT_fpic) || O.matches(options::OPT_fPIE) || O.matches(options::OPT_fpie)) { PIE = O.matches(options::OPT_fPIE) || O.matches(options::OPT_fpie); PIC = PIE || O.matches(options::OPT_fPIC) || O.matches(options::OPT_fpic); IsPICLevelTwo = O.matches(options::OPT_fPIE) || O.matches(options::OPT_fPIC); } else { PIE = PIC = false; } } // Check whether the tool chain trumps the PIC-ness decision. If the PIC-ness // is forced, then neither PIC nor PIE flags will have no effect. if (getToolChain().isPICDefaultForced()) { PIE = false; PIC = getToolChain().isPICDefault(); IsPICLevelTwo = PIC; } // Inroduce a Darwin-specific hack. If the default is PIC but the flags // specified while enabling PIC enabled level 1 PIC, just force it back to // level 2 PIC instead. This matches the behavior of Darwin GCC (based on my // informal testing). if (PIC && getToolChain().getTriple().isOSDarwin()) IsPICLevelTwo |= getToolChain().isPICDefault(); // Note that these flags are trump-cards. Regardless of the order w.r.t. the // PIC or PIE options above, if these show up, PIC is disabled. llvm::Triple Triple(TripleStr); if ((Args.hasArg(options::OPT_mkernel) || Args.hasArg(options::OPT_fapple_kext)) && (Triple.getOS() != llvm::Triple::IOS || Triple.isOSVersionLT(6))) PIC = PIE = false; if (Args.hasArg(options::OPT_static)) PIC = PIE = false; if (Arg *A = Args.getLastArg(options::OPT_mdynamic_no_pic)) { // This is a very special mode. It trumps the other modes, almost no one // uses it, and it isn't even valid on any OS but Darwin. if (!getToolChain().getTriple().isOSDarwin()) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getSpelling() << getToolChain().getTriple().str(); // FIXME: Warn when this flag trumps some other PIC or PIE flag. CmdArgs.push_back("-mrelocation-model"); CmdArgs.push_back("dynamic-no-pic"); // Only a forced PIC mode can cause the actual compile to have PIC defines // etc., no flags are sufficient. This behavior was selected to closely // match that of llvm-gcc and Apple GCC before that. if (getToolChain().isPICDefault() && getToolChain().isPICDefaultForced()) { CmdArgs.push_back("-pic-level"); CmdArgs.push_back("2"); } } else { // Currently, LLVM only knows about PIC vs. static; the PIE differences are // handled in Clang's IRGen by the -pie-level flag. CmdArgs.push_back("-mrelocation-model"); CmdArgs.push_back(PIC ? "pic" : "static"); if (PIC) { CmdArgs.push_back("-pic-level"); CmdArgs.push_back(IsPICLevelTwo ? "2" : "1"); if (PIE) { CmdArgs.push_back("-pie-level"); CmdArgs.push_back(IsPICLevelTwo ? "2" : "1"); } } } if (!Args.hasFlag(options::OPT_fmerge_all_constants, options::OPT_fno_merge_all_constants)) CmdArgs.push_back("-fno-merge-all-constants"); // LLVM Code Generator Options. if (Arg *A = Args.getLastArg(options::OPT_mregparm_EQ)) { CmdArgs.push_back("-mregparm"); CmdArgs.push_back(A->getValue()); } if (Args.hasFlag(options::OPT_mrtd, options::OPT_mno_rtd, false)) CmdArgs.push_back("-mrtd"); if (shouldUseFramePointer(Args, getToolChain().getTriple())) CmdArgs.push_back("-mdisable-fp-elim"); if (!Args.hasFlag(options::OPT_fzero_initialized_in_bss, options::OPT_fno_zero_initialized_in_bss)) CmdArgs.push_back("-mno-zero-initialized-in-bss"); if (!Args.hasFlag(options::OPT_fstrict_aliasing, options::OPT_fno_strict_aliasing, getToolChain().IsStrictAliasingDefault())) CmdArgs.push_back("-relaxed-aliasing"); if (Args.hasFlag(options::OPT_fstrict_enums, options::OPT_fno_strict_enums, false)) CmdArgs.push_back("-fstrict-enums"); if (!Args.hasFlag(options::OPT_foptimize_sibling_calls, options::OPT_fno_optimize_sibling_calls)) CmdArgs.push_back("-mdisable-tail-calls"); // Handle various floating point optimization flags, mapping them to the // appropriate LLVM code generation flags. The pattern for all of these is to // default off the codegen optimizations, and if any flag enables them and no // flag disables them after the flag enabling them, enable the codegen // optimization. This is complicated by several "umbrella" flags. if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, options::OPT_ffinite_math_only, options::OPT_fno_finite_math_only, options::OPT_fhonor_infinities, options::OPT_fno_honor_infinities)) if (A->getOption().getID() != options::OPT_fno_fast_math && A->getOption().getID() != options::OPT_fno_finite_math_only && A->getOption().getID() != options::OPT_fhonor_infinities) CmdArgs.push_back("-menable-no-infs"); if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, options::OPT_ffinite_math_only, options::OPT_fno_finite_math_only, options::OPT_fhonor_nans, options::OPT_fno_honor_nans)) if (A->getOption().getID() != options::OPT_fno_fast_math && A->getOption().getID() != options::OPT_fno_finite_math_only && A->getOption().getID() != options::OPT_fhonor_nans) CmdArgs.push_back("-menable-no-nans"); // -fmath-errno is the default on some platforms, e.g. BSD-derived OSes. bool MathErrno = getToolChain().IsMathErrnoDefault(); if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, options::OPT_fmath_errno, options::OPT_fno_math_errno)) MathErrno = A->getOption().getID() == options::OPT_fmath_errno; if (MathErrno) CmdArgs.push_back("-fmath-errno"); // There are several flags which require disabling very specific // optimizations. Any of these being disabled forces us to turn off the // entire set of LLVM optimizations, so collect them through all the flag // madness. bool AssociativeMath = false; if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, options::OPT_funsafe_math_optimizations, options::OPT_fno_unsafe_math_optimizations, options::OPT_fassociative_math, options::OPT_fno_associative_math)) if (A->getOption().getID() != options::OPT_fno_fast_math && A->getOption().getID() != options::OPT_fno_unsafe_math_optimizations && A->getOption().getID() != options::OPT_fno_associative_math) AssociativeMath = true; bool ReciprocalMath = false; if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, options::OPT_funsafe_math_optimizations, options::OPT_fno_unsafe_math_optimizations, options::OPT_freciprocal_math, options::OPT_fno_reciprocal_math)) if (A->getOption().getID() != options::OPT_fno_fast_math && A->getOption().getID() != options::OPT_fno_unsafe_math_optimizations && A->getOption().getID() != options::OPT_fno_reciprocal_math) ReciprocalMath = true; bool SignedZeros = true; if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, options::OPT_funsafe_math_optimizations, options::OPT_fno_unsafe_math_optimizations, options::OPT_fsigned_zeros, options::OPT_fno_signed_zeros)) if (A->getOption().getID() != options::OPT_fno_fast_math && A->getOption().getID() != options::OPT_fno_unsafe_math_optimizations && A->getOption().getID() != options::OPT_fsigned_zeros) SignedZeros = false; bool TrappingMath = true; if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, options::OPT_funsafe_math_optimizations, options::OPT_fno_unsafe_math_optimizations, options::OPT_ftrapping_math, options::OPT_fno_trapping_math)) if (A->getOption().getID() != options::OPT_fno_fast_math && A->getOption().getID() != options::OPT_fno_unsafe_math_optimizations && A->getOption().getID() != options::OPT_ftrapping_math) TrappingMath = false; if (!MathErrno && AssociativeMath && ReciprocalMath && !SignedZeros && !TrappingMath) CmdArgs.push_back("-menable-unsafe-fp-math"); // Validate and pass through -fp-contract option. if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, options::OPT_ffp_contract)) { if (A->getOption().getID() == options::OPT_ffp_contract) { StringRef Val = A->getValue(); if (Val == "fast" || Val == "on" || Val == "off") { CmdArgs.push_back(Args.MakeArgString("-ffp-contract=" + Val)); } else { D.Diag(diag::err_drv_unsupported_option_argument) << A->getOption().getName() << Val; } } else if (A->getOption().getID() == options::OPT_ffast_math) { // If fast-math is set then set the fp-contract mode to fast. CmdArgs.push_back(Args.MakeArgString("-ffp-contract=fast")); } } // We separately look for the '-ffast-math' and '-ffinite-math-only' flags, // and if we find them, tell the frontend to provide the appropriate // preprocessor macros. This is distinct from enabling any optimizations as // these options induce language changes which must survive serialization // and deserialization, etc. if (Arg *A = Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math)) if (A->getOption().matches(options::OPT_ffast_math)) CmdArgs.push_back("-ffast-math"); if (Arg *A = Args.getLastArg(options::OPT_ffinite_math_only, options::OPT_fno_fast_math)) if (A->getOption().matches(options::OPT_ffinite_math_only)) CmdArgs.push_back("-ffinite-math-only"); // Decide whether to use verbose asm. Verbose assembly is the default on // toolchains which have the integrated assembler on by default. bool IsVerboseAsmDefault = getToolChain().IsIntegratedAssemblerDefault(); if (Args.hasFlag(options::OPT_fverbose_asm, options::OPT_fno_verbose_asm, IsVerboseAsmDefault) || Args.hasArg(options::OPT_dA)) CmdArgs.push_back("-masm-verbose"); if (Args.hasArg(options::OPT_fdebug_pass_structure)) { CmdArgs.push_back("-mdebug-pass"); CmdArgs.push_back("Structure"); } if (Args.hasArg(options::OPT_fdebug_pass_arguments)) { CmdArgs.push_back("-mdebug-pass"); CmdArgs.push_back("Arguments"); } // Enable -mconstructor-aliases except on darwin, where we have to // work around a linker bug; see . if (!getToolChain().getTriple().isOSDarwin()) CmdArgs.push_back("-mconstructor-aliases"); // Darwin's kernel doesn't support guard variables; just die if we // try to use them. if (KernelOrKext && getToolChain().getTriple().isOSDarwin()) CmdArgs.push_back("-fforbid-guard-variables"); if (Args.hasArg(options::OPT_mms_bitfields)) { CmdArgs.push_back("-mms-bitfields"); } // This is a coarse approximation of what llvm-gcc actually does, both // -fasynchronous-unwind-tables and -fnon-call-exceptions interact in more // complicated ways. bool AsynchronousUnwindTables = Args.hasFlag(options::OPT_fasynchronous_unwind_tables, options::OPT_fno_asynchronous_unwind_tables, getToolChain().IsUnwindTablesDefault() && !KernelOrKext); if (Args.hasFlag(options::OPT_funwind_tables, options::OPT_fno_unwind_tables, AsynchronousUnwindTables)) CmdArgs.push_back("-munwind-tables"); getToolChain().addClangTargetOptions(CmdArgs); if (Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) { CmdArgs.push_back("-mlimit-float-precision"); CmdArgs.push_back(A->getValue()); } // FIXME: Handle -mtune=. (void) Args.hasArg(options::OPT_mtune_EQ); if (Arg *A = Args.getLastArg(options::OPT_mcmodel_EQ)) { CmdArgs.push_back("-mcode-model"); CmdArgs.push_back(A->getValue()); } // Add target specific cpu and features flags. switch(getToolChain().getTriple().getArch()) { default: break; case llvm::Triple::arm: case llvm::Triple::thumb: AddARMTargetArgs(Args, CmdArgs, KernelOrKext); break; case llvm::Triple::mips: case llvm::Triple::mipsel: case llvm::Triple::mips64: case llvm::Triple::mips64el: AddMIPSTargetArgs(Args, CmdArgs); break; case llvm::Triple::ppc: case llvm::Triple::ppc64: AddPPCTargetArgs(Args, CmdArgs); break; case llvm::Triple::sparc: AddSparcTargetArgs(Args, CmdArgs); break; case llvm::Triple::x86: case llvm::Triple::x86_64: AddX86TargetArgs(Args, CmdArgs); break; case llvm::Triple::hexagon: AddHexagonTargetArgs(Args, CmdArgs); break; } // Pass the linker version in use. if (Arg *A = Args.getLastArg(options::OPT_mlinker_version_EQ)) { CmdArgs.push_back("-target-linker-version"); CmdArgs.push_back(A->getValue()); } // -mno-omit-leaf-frame-pointer is the default on Darwin. if (Args.hasFlag(options::OPT_momit_leaf_frame_pointer, options::OPT_mno_omit_leaf_frame_pointer, !getToolChain().getTriple().isOSDarwin())) CmdArgs.push_back("-momit-leaf-frame-pointer"); // Explicitly error on some things we know we don't support and can't just // ignore. types::ID InputType = Inputs[0].getType(); if (!Args.hasArg(options::OPT_fallow_unsupported)) { Arg *Unsupported; if (types::isCXX(InputType) && getToolChain().getTriple().isOSDarwin() && getToolChain().getTriple().getArch() == llvm::Triple::x86) { if ((Unsupported = Args.getLastArg(options::OPT_fapple_kext)) || (Unsupported = Args.getLastArg(options::OPT_mkernel))) D.Diag(diag::err_drv_clang_unsupported_opt_cxx_darwin_i386) << Unsupported->getOption().getName(); } } Args.AddAllArgs(CmdArgs, options::OPT_v); Args.AddLastArg(CmdArgs, options::OPT_H); if (D.CCPrintHeaders && !D.CCGenDiagnostics) { CmdArgs.push_back("-header-include-file"); CmdArgs.push_back(D.CCPrintHeadersFilename ? D.CCPrintHeadersFilename : "-"); } Args.AddLastArg(CmdArgs, options::OPT_P); Args.AddLastArg(CmdArgs, options::OPT_print_ivar_layout); if (D.CCLogDiagnostics && !D.CCGenDiagnostics) { CmdArgs.push_back("-diagnostic-log-file"); CmdArgs.push_back(D.CCLogDiagnosticsFilename ? D.CCLogDiagnosticsFilename : "-"); } // Use the last option from "-g" group. "-gline-tables-only" is // preserved, all other debug options are substituted with "-g". Args.ClaimAllArgs(options::OPT_g_Group); if (Arg *A = Args.getLastArg(options::OPT_g_Group)) { if (A->getOption().matches(options::OPT_gline_tables_only)) { CmdArgs.push_back("-gline-tables-only"); } else if (!A->getOption().matches(options::OPT_g0) && !A->getOption().matches(options::OPT_ggdb0)) { CmdArgs.push_back("-g"); } } // We ignore flags -gstrict-dwarf and -grecord-gcc-switches for now. Args.ClaimAllArgs(options::OPT_g_flags_Group); if (Args.hasArg(options::OPT_gcolumn_info)) CmdArgs.push_back("-dwarf-column-info"); Args.AddAllArgs(CmdArgs, options::OPT_ffunction_sections); Args.AddAllArgs(CmdArgs, options::OPT_fdata_sections); Args.AddAllArgs(CmdArgs, options::OPT_finstrument_functions); if (Args.hasArg(options::OPT_ftest_coverage) || Args.hasArg(options::OPT_coverage)) CmdArgs.push_back("-femit-coverage-notes"); if (Args.hasArg(options::OPT_fprofile_arcs) || Args.hasArg(options::OPT_coverage)) CmdArgs.push_back("-femit-coverage-data"); if (C.getArgs().hasArg(options::OPT_c) || C.getArgs().hasArg(options::OPT_S)) { if (Output.isFilename()) { CmdArgs.push_back("-coverage-file"); SmallString<128> absFilename(Output.getFilename()); llvm::sys::fs::make_absolute(absFilename); CmdArgs.push_back(Args.MakeArgString(absFilename)); } } // Pass options for controlling the default header search paths. if (Args.hasArg(options::OPT_nostdinc)) { CmdArgs.push_back("-nostdsysteminc"); CmdArgs.push_back("-nobuiltininc"); } else { if (Args.hasArg(options::OPT_nostdlibinc)) CmdArgs.push_back("-nostdsysteminc"); Args.AddLastArg(CmdArgs, options::OPT_nostdincxx); Args.AddLastArg(CmdArgs, options::OPT_nobuiltininc); } // Pass the path to compiler resource files. CmdArgs.push_back("-resource-dir"); CmdArgs.push_back(D.ResourceDir.c_str()); Args.AddLastArg(CmdArgs, options::OPT_working_directory); bool ARCMTEnabled = false; if (!Args.hasArg(options::OPT_fno_objc_arc)) { if (const Arg *A = Args.getLastArg(options::OPT_ccc_arcmt_check, options::OPT_ccc_arcmt_modify, options::OPT_ccc_arcmt_migrate)) { ARCMTEnabled = true; switch (A->getOption().getID()) { default: llvm_unreachable("missed a case"); case options::OPT_ccc_arcmt_check: CmdArgs.push_back("-arcmt-check"); break; case options::OPT_ccc_arcmt_modify: CmdArgs.push_back("-arcmt-modify"); break; case options::OPT_ccc_arcmt_migrate: CmdArgs.push_back("-arcmt-migrate"); CmdArgs.push_back("-mt-migrate-directory"); CmdArgs.push_back(A->getValue()); Args.AddLastArg(CmdArgs, options::OPT_arcmt_migrate_report_output); Args.AddLastArg(CmdArgs, options::OPT_arcmt_migrate_emit_arc_errors); break; } } } if (const Arg *A = Args.getLastArg(options::OPT_ccc_objcmt_migrate)) { if (ARCMTEnabled) { D.Diag(diag::err_drv_argument_not_allowed_with) << A->getAsString(Args) << "-ccc-arcmt-migrate"; } CmdArgs.push_back("-mt-migrate-directory"); CmdArgs.push_back(A->getValue()); if (!Args.hasArg(options::OPT_objcmt_migrate_literals, options::OPT_objcmt_migrate_subscripting)) { // None specified, means enable them all. CmdArgs.push_back("-objcmt-migrate-literals"); CmdArgs.push_back("-objcmt-migrate-subscripting"); } else { Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_literals); Args.AddLastArg(CmdArgs, options::OPT_objcmt_migrate_subscripting); } } // Add preprocessing options like -I, -D, etc. if we are using the // preprocessor. // // FIXME: Support -fpreprocessed if (types::getPreprocessedType(InputType) != types::TY_INVALID) AddPreprocessingOptions(C, D, Args, CmdArgs, Output, Inputs); // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes // that "The compiler can only warn and ignore the option if not recognized". // When building with ccache, it will pass -D options to clang even on // preprocessed inputs and configure concludes that -fPIC is not supported. Args.ClaimAllArgs(options::OPT_D); // Manually translate -O to -O2 and -O4 to -O3; let clang reject // others. if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { if (A->getOption().matches(options::OPT_O4)) CmdArgs.push_back("-O3"); else if (A->getOption().matches(options::OPT_O) && A->getValue()[0] == '\0') CmdArgs.push_back("-O2"); else A->render(Args, CmdArgs); } Args.AddAllArgs(CmdArgs, options::OPT_W_Group); if (Args.hasFlag(options::OPT_pedantic, options::OPT_no_pedantic, false)) CmdArgs.push_back("-pedantic"); Args.AddLastArg(CmdArgs, options::OPT_pedantic_errors); Args.AddLastArg(CmdArgs, options::OPT_w); // Handle -{std, ansi, trigraphs} -- take the last of -{std, ansi} // (-ansi is equivalent to -std=c89). // // If a std is supplied, only add -trigraphs if it follows the // option. if (Arg *Std = Args.getLastArg(options::OPT_std_EQ, options::OPT_ansi)) { if (Std->getOption().matches(options::OPT_ansi)) if (types::isCXX(InputType)) CmdArgs.push_back("-std=c++98"); else CmdArgs.push_back("-std=c89"); else Std->render(Args, CmdArgs); if (Arg *A = Args.getLastArg(options::OPT_std_EQ, options::OPT_ansi, options::OPT_trigraphs)) if (A != Std) A->render(Args, CmdArgs); } else { // Honor -std-default. // // FIXME: Clang doesn't correctly handle -std= when the input language // doesn't match. For the time being just ignore this for C++ inputs; // eventually we want to do all the standard defaulting here instead of // splitting it between the driver and clang -cc1. if (!types::isCXX(InputType)) Args.AddAllArgsTranslated(CmdArgs, options::OPT_std_default_EQ, "-std=", /*Joined=*/true); else if (getToolChain().getTriple().getOS() == llvm::Triple::Win32) CmdArgs.push_back("-std=c++11"); Args.AddLastArg(CmdArgs, options::OPT_trigraphs); } // Map the bizarre '-Wwrite-strings' flag to a more sensible // '-fconst-strings'; this better indicates its actual behavior. if (Args.hasFlag(options::OPT_Wwrite_strings, options::OPT_Wno_write_strings, false)) { // For perfect compatibility with GCC, we do this even in the presence of // '-w'. This flag names something other than a warning for GCC. CmdArgs.push_back("-fconst-strings"); } // GCC provides a macro definition '__DEPRECATED' when -Wdeprecated is active // during C++ compilation, which it is by default. GCC keeps this define even // in the presence of '-w', match this behavior bug-for-bug. if (types::isCXX(InputType) && Args.hasFlag(options::OPT_Wdeprecated, options::OPT_Wno_deprecated, true)) { CmdArgs.push_back("-fdeprecated-macro"); } // Translate GCC's misnamer '-fasm' arguments to '-fgnu-keywords'. if (Arg *Asm = Args.getLastArg(options::OPT_fasm, options::OPT_fno_asm)) { if (Asm->getOption().matches(options::OPT_fasm)) CmdArgs.push_back("-fgnu-keywords"); else CmdArgs.push_back("-fno-gnu-keywords"); } if (ShouldDisableCFI(Args, getToolChain())) CmdArgs.push_back("-fno-dwarf2-cfi-asm"); if (ShouldDisableDwarfDirectory(Args, getToolChain())) CmdArgs.push_back("-fno-dwarf-directory-asm"); if (const char *pwd = ::getenv("PWD")) { // GCC also verifies that stat(pwd) and stat(".") have the same inode // number. Not doing those because stats are slow, but we could. if (llvm::sys::path::is_absolute(pwd)) { std::string CompDir = pwd; CmdArgs.push_back("-fdebug-compilation-dir"); CmdArgs.push_back(Args.MakeArgString(CompDir)); } } if (Arg *A = Args.getLastArg(options::OPT_ftemplate_depth_, options::OPT_ftemplate_depth_EQ)) { CmdArgs.push_back("-ftemplate-depth"); CmdArgs.push_back(A->getValue()); } if (Arg *A = Args.getLastArg(options::OPT_fconstexpr_depth_EQ)) { CmdArgs.push_back("-fconstexpr-depth"); CmdArgs.push_back(A->getValue()); } if (Arg *A = Args.getLastArg(options::OPT_Wlarge_by_value_copy_EQ, options::OPT_Wlarge_by_value_copy_def)) { if (A->getNumValues()) { StringRef bytes = A->getValue(); CmdArgs.push_back(Args.MakeArgString("-Wlarge-by-value-copy=" + bytes)); } else CmdArgs.push_back("-Wlarge-by-value-copy=64"); // default value } if (Arg *A = Args.getLastArg(options::OPT_fbounds_checking, options::OPT_fbounds_checking_EQ)) { if (A->getNumValues()) { StringRef val = A->getValue(); CmdArgs.push_back(Args.MakeArgString("-fbounds-checking=" + val)); } else CmdArgs.push_back("-fbounds-checking=1"); } if (Args.hasArg(options::OPT_relocatable_pch)) CmdArgs.push_back("-relocatable-pch"); if (Arg *A = Args.getLastArg(options::OPT_fconstant_string_class_EQ)) { CmdArgs.push_back("-fconstant-string-class"); CmdArgs.push_back(A->getValue()); } if (Arg *A = Args.getLastArg(options::OPT_ftabstop_EQ)) { CmdArgs.push_back("-ftabstop"); CmdArgs.push_back(A->getValue()); } CmdArgs.push_back("-ferror-limit"); if (Arg *A = Args.getLastArg(options::OPT_ferror_limit_EQ)) CmdArgs.push_back(A->getValue()); else CmdArgs.push_back("19"); if (Arg *A = Args.getLastArg(options::OPT_fmacro_backtrace_limit_EQ)) { CmdArgs.push_back("-fmacro-backtrace-limit"); CmdArgs.push_back(A->getValue()); } if (Arg *A = Args.getLastArg(options::OPT_ftemplate_backtrace_limit_EQ)) { CmdArgs.push_back("-ftemplate-backtrace-limit"); CmdArgs.push_back(A->getValue()); } if (Arg *A = Args.getLastArg(options::OPT_fconstexpr_backtrace_limit_EQ)) { CmdArgs.push_back("-fconstexpr-backtrace-limit"); CmdArgs.push_back(A->getValue()); } // Pass -fmessage-length=. CmdArgs.push_back("-fmessage-length"); if (Arg *A = Args.getLastArg(options::OPT_fmessage_length_EQ)) { CmdArgs.push_back(A->getValue()); } else { // If -fmessage-length=N was not specified, determine whether this is a // terminal and, if so, implicitly define -fmessage-length appropriately. unsigned N = llvm::sys::Process::StandardErrColumns(); CmdArgs.push_back(Args.MakeArgString(Twine(N))); } if (const Arg *A = Args.getLastArg(options::OPT_fvisibility_EQ)) { CmdArgs.push_back("-fvisibility"); CmdArgs.push_back(A->getValue()); } Args.AddLastArg(CmdArgs, options::OPT_fvisibility_inlines_hidden); Args.AddLastArg(CmdArgs, options::OPT_ftlsmodel_EQ); // -fhosted is default. if (Args.hasFlag(options::OPT_ffreestanding, options::OPT_fhosted, false) || KernelOrKext) CmdArgs.push_back("-ffreestanding"); // Forward -f (flag) options which we can pass directly. Args.AddLastArg(CmdArgs, options::OPT_femit_all_decls); Args.AddLastArg(CmdArgs, options::OPT_fformat_extensions); Args.AddLastArg(CmdArgs, options::OPT_fheinous_gnu_extensions); Args.AddLastArg(CmdArgs, options::OPT_flimit_debug_info); Args.AddLastArg(CmdArgs, options::OPT_fno_limit_debug_info); Args.AddLastArg(CmdArgs, options::OPT_fno_operator_names); Args.AddLastArg(CmdArgs, options::OPT_faltivec); Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_show_template_tree); Args.AddLastArg(CmdArgs, options::OPT_fno_elide_type); SanitizerArgs Sanitize(D, Args); Sanitize.addArgs(Args, CmdArgs); // Report and error for -faltivec on anything other then PowerPC. if (const Arg *A = Args.getLastArg(options::OPT_faltivec)) if (!(getToolChain().getTriple().getArch() == llvm::Triple::ppc || getToolChain().getTriple().getArch() == llvm::Triple::ppc64)) D.Diag(diag::err_drv_argument_only_allowed_with) << A->getAsString(Args) << "ppc/ppc64"; if (getToolChain().SupportsProfiling()) Args.AddLastArg(CmdArgs, options::OPT_pg); // -flax-vector-conversions is default. if (!Args.hasFlag(options::OPT_flax_vector_conversions, options::OPT_fno_lax_vector_conversions)) CmdArgs.push_back("-fno-lax-vector-conversions"); if (Args.getLastArg(options::OPT_fapple_kext)) CmdArgs.push_back("-fapple-kext"); if (Args.hasFlag(options::OPT_frewrite_includes, options::OPT_fno_rewrite_includes, false)) CmdArgs.push_back("-frewrite-includes"); Args.AddLastArg(CmdArgs, options::OPT_fobjc_sender_dependent_dispatch); Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_print_source_range_info); Args.AddLastArg(CmdArgs, options::OPT_fdiagnostics_parseable_fixits); Args.AddLastArg(CmdArgs, options::OPT_ftime_report); Args.AddLastArg(CmdArgs, options::OPT_ftrapv); if (Arg *A = Args.getLastArg(options::OPT_ftrapv_handler_EQ)) { CmdArgs.push_back("-ftrapv-handler"); CmdArgs.push_back(A->getValue()); } Args.AddLastArg(CmdArgs, options::OPT_ftrap_function_EQ); // -fno-strict-overflow implies -fwrapv if it isn't disabled, but // -fstrict-overflow won't turn off an explicitly enabled -fwrapv. if (Arg *A = Args.getLastArg(options::OPT_fwrapv, options::OPT_fno_wrapv)) { if (A->getOption().matches(options::OPT_fwrapv)) CmdArgs.push_back("-fwrapv"); } else if (Arg *A = Args.getLastArg(options::OPT_fstrict_overflow, options::OPT_fno_strict_overflow)) { if (A->getOption().matches(options::OPT_fno_strict_overflow)) CmdArgs.push_back("-fwrapv"); } Args.AddLastArg(CmdArgs, options::OPT_fwritable_strings); Args.AddLastArg(CmdArgs, options::OPT_funroll_loops); Args.AddLastArg(CmdArgs, options::OPT_pthread); // -stack-protector=0 is default. unsigned StackProtectorLevel = 0; if (Arg *A = Args.getLastArg(options::OPT_fno_stack_protector, options::OPT_fstack_protector_all, options::OPT_fstack_protector)) { if (A->getOption().matches(options::OPT_fstack_protector)) StackProtectorLevel = 1; else if (A->getOption().matches(options::OPT_fstack_protector_all)) StackProtectorLevel = 2; } else { StackProtectorLevel = getToolChain().GetDefaultStackProtectorLevel(KernelOrKext); } if (StackProtectorLevel) { CmdArgs.push_back("-stack-protector"); CmdArgs.push_back(Args.MakeArgString(Twine(StackProtectorLevel))); } // --param ssp-buffer-size= for (arg_iterator it = Args.filtered_begin(options::OPT__param), ie = Args.filtered_end(); it != ie; ++it) { StringRef Str((*it)->getValue()); if (Str.startswith("ssp-buffer-size=")) { if (StackProtectorLevel) { CmdArgs.push_back("-stack-protector-buffer-size"); // FIXME: Verify the argument is a valid integer. CmdArgs.push_back(Args.MakeArgString(Str.drop_front(16))); } (*it)->claim(); } } // Translate -mstackrealign if (Args.hasFlag(options::OPT_mstackrealign, options::OPT_mno_stackrealign, false)) { CmdArgs.push_back("-backend-option"); CmdArgs.push_back("-force-align-stack"); } if (!Args.hasFlag(options::OPT_mno_stackrealign, options::OPT_mstackrealign, false)) { CmdArgs.push_back(Args.MakeArgString("-mstackrealign")); } if (Args.hasArg(options::OPT_mstack_alignment)) { StringRef alignment = Args.getLastArgValue(options::OPT_mstack_alignment); CmdArgs.push_back(Args.MakeArgString("-mstack-alignment=" + alignment)); } if (Args.hasArg(options::OPT_mstrict_align)) { CmdArgs.push_back("-backend-option"); CmdArgs.push_back("-arm-strict-align"); } // Forward -f options with positive and negative forms; we translate // these by hand. if (Args.hasArg(options::OPT_mkernel)) { if (!Args.hasArg(options::OPT_fapple_kext) && types::isCXX(InputType)) CmdArgs.push_back("-fapple-kext"); if (!Args.hasArg(options::OPT_fbuiltin)) CmdArgs.push_back("-fno-builtin"); Args.ClaimAllArgs(options::OPT_fno_builtin); } // -fbuiltin is default. else if (!Args.hasFlag(options::OPT_fbuiltin, options::OPT_fno_builtin)) CmdArgs.push_back("-fno-builtin"); if (!Args.hasFlag(options::OPT_fassume_sane_operator_new, options::OPT_fno_assume_sane_operator_new)) CmdArgs.push_back("-fno-assume-sane-operator-new"); // -fblocks=0 is default. if (Args.hasFlag(options::OPT_fblocks, options::OPT_fno_blocks, getToolChain().IsBlocksDefault()) || (Args.hasArg(options::OPT_fgnu_runtime) && Args.hasArg(options::OPT_fobjc_nonfragile_abi) && !Args.hasArg(options::OPT_fno_blocks))) { CmdArgs.push_back("-fblocks"); if (!Args.hasArg(options::OPT_fgnu_runtime) && !getToolChain().hasBlocksRuntime()) CmdArgs.push_back("-fblocks-runtime-optional"); } // -fmodules enables modules (off by default). However, for C++/Objective-C++, // users must also pass -fcxx-modules. The latter flag will disappear once the // modules implementation is solid for C++/Objective-C++ programs as well. if (Args.hasFlag(options::OPT_fmodules, options::OPT_fno_modules, false)) { bool AllowedInCXX = Args.hasFlag(options::OPT_fcxx_modules, options::OPT_fno_cxx_modules, false); if (AllowedInCXX || !types::isCXX(InputType)) CmdArgs.push_back("-fmodules"); } // -faccess-control is default. if (Args.hasFlag(options::OPT_fno_access_control, options::OPT_faccess_control, false)) CmdArgs.push_back("-fno-access-control"); // -felide-constructors is the default. if (Args.hasFlag(options::OPT_fno_elide_constructors, options::OPT_felide_constructors, false)) CmdArgs.push_back("-fno-elide-constructors"); // -frtti is default. if (!Args.hasFlag(options::OPT_frtti, options::OPT_fno_rtti) || KernelOrKext) { CmdArgs.push_back("-fno-rtti"); // -fno-rtti cannot usefully be combined with -fsanitize=vptr. if (Sanitize.sanitizesVptr()) { std::string NoRttiArg = Args.getLastArg(options::OPT_mkernel, options::OPT_fapple_kext, options::OPT_fno_rtti)->getAsString(Args); D.Diag(diag::err_drv_argument_not_allowed_with) << "-fsanitize=vptr" << NoRttiArg; } } // -fshort-enums=0 is default for all architectures except Hexagon. if (Args.hasFlag(options::OPT_fshort_enums, options::OPT_fno_short_enums, getToolChain().getTriple().getArch() == llvm::Triple::hexagon)) CmdArgs.push_back("-fshort-enums"); // -fsigned-char is default. if (!Args.hasFlag(options::OPT_fsigned_char, options::OPT_funsigned_char, isSignedCharDefault(getToolChain().getTriple()))) CmdArgs.push_back("-fno-signed-char"); // -fthreadsafe-static is default. if (!Args.hasFlag(options::OPT_fthreadsafe_statics, options::OPT_fno_threadsafe_statics)) CmdArgs.push_back("-fno-threadsafe-statics"); // -fuse-cxa-atexit is default. if (!Args.hasFlag(options::OPT_fuse_cxa_atexit, options::OPT_fno_use_cxa_atexit, getToolChain().getTriple().getOS() != llvm::Triple::Cygwin && getToolChain().getTriple().getOS() != llvm::Triple::MinGW32 && getToolChain().getTriple().getArch() != llvm::Triple::hexagon) || KernelOrKext) CmdArgs.push_back("-fno-use-cxa-atexit"); // -fms-extensions=0 is default. if (Args.hasFlag(options::OPT_fms_extensions, options::OPT_fno_ms_extensions, getToolChain().getTriple().getOS() == llvm::Triple::Win32)) CmdArgs.push_back("-fms-extensions"); // -fms-inline-asm. if (Args.hasArg(options::OPT_fenable_experimental_ms_inline_asm)) CmdArgs.push_back("-fenable-experimental-ms-inline-asm"); // -fms-compatibility=0 is default. if (Args.hasFlag(options::OPT_fms_compatibility, options::OPT_fno_ms_compatibility, (getToolChain().getTriple().getOS() == llvm::Triple::Win32 && Args.hasFlag(options::OPT_fms_extensions, options::OPT_fno_ms_extensions, true)))) CmdArgs.push_back("-fms-compatibility"); // -fmsc-version=1300 is default. if (Args.hasFlag(options::OPT_fms_extensions, options::OPT_fno_ms_extensions, getToolChain().getTriple().getOS() == llvm::Triple::Win32) || Args.hasArg(options::OPT_fmsc_version)) { StringRef msc_ver = Args.getLastArgValue(options::OPT_fmsc_version); if (msc_ver.empty()) CmdArgs.push_back("-fmsc-version=1300"); else CmdArgs.push_back(Args.MakeArgString("-fmsc-version=" + msc_ver)); } // -fborland-extensions=0 is default. if (Args.hasFlag(options::OPT_fborland_extensions, options::OPT_fno_borland_extensions, false)) CmdArgs.push_back("-fborland-extensions"); // -fno-delayed-template-parsing is default, except for Windows where MSVC STL // needs it. if (Args.hasFlag(options::OPT_fdelayed_template_parsing, options::OPT_fno_delayed_template_parsing, getToolChain().getTriple().getOS() == llvm::Triple::Win32)) CmdArgs.push_back("-fdelayed-template-parsing"); // -fgnu-keywords default varies depending on language; only pass if // specified. if (Arg *A = Args.getLastArg(options::OPT_fgnu_keywords, options::OPT_fno_gnu_keywords)) A->render(Args, CmdArgs); if (Args.hasFlag(options::OPT_fgnu89_inline, options::OPT_fno_gnu89_inline, false)) CmdArgs.push_back("-fgnu89-inline"); if (Args.hasArg(options::OPT_fno_inline)) CmdArgs.push_back("-fno-inline"); if (Args.hasArg(options::OPT_fno_inline_functions)) CmdArgs.push_back("-fno-inline-functions"); ObjCRuntime objcRuntime = AddObjCRuntimeArgs(Args, CmdArgs, rewriteKind); // -fobjc-dispatch-method is only relevant with the nonfragile-abi, and // legacy is the default. if (objcRuntime.isNonFragile()) { if (!Args.hasFlag(options::OPT_fobjc_legacy_dispatch, options::OPT_fno_objc_legacy_dispatch, objcRuntime.isLegacyDispatchDefaultForArch( getToolChain().getTriple().getArch()))) { if (getToolChain().UseObjCMixedDispatch()) CmdArgs.push_back("-fobjc-dispatch-method=mixed"); else CmdArgs.push_back("-fobjc-dispatch-method=non-legacy"); } } // -fobjc-default-synthesize-properties=1 is default. This only has an effect // if the nonfragile objc abi is used. if (getToolChain().IsObjCDefaultSynthPropertiesDefault()) { CmdArgs.push_back("-fobjc-default-synthesize-properties"); } // -fencode-extended-block-signature=1 is default. if (getToolChain().IsEncodeExtendedBlockSignatureDefault()) { CmdArgs.push_back("-fencode-extended-block-signature"); } // Allow -fno-objc-arr to trump -fobjc-arr/-fobjc-arc. // NOTE: This logic is duplicated in ToolChains.cpp. bool ARC = isObjCAutoRefCount(Args); if (ARC) { getToolChain().CheckObjCARC(); CmdArgs.push_back("-fobjc-arc"); // FIXME: It seems like this entire block, and several around it should be // wrapped in isObjC, but for now we just use it here as this is where it // was being used previously. if (types::isCXX(InputType) && types::isObjC(InputType)) { if (getToolChain().GetCXXStdlibType(Args) == ToolChain::CST_Libcxx) CmdArgs.push_back("-fobjc-arc-cxxlib=libc++"); else CmdArgs.push_back("-fobjc-arc-cxxlib=libstdc++"); } // Allow the user to enable full exceptions code emission. // We define off for Objective-CC, on for Objective-C++. if (Args.hasFlag(options::OPT_fobjc_arc_exceptions, options::OPT_fno_objc_arc_exceptions, /*default*/ types::isCXX(InputType))) CmdArgs.push_back("-fobjc-arc-exceptions"); } // -fobjc-infer-related-result-type is the default, except in the Objective-C // rewriter. if (rewriteKind != RK_None) CmdArgs.push_back("-fno-objc-infer-related-result-type"); // Handle -fobjc-gc and -fobjc-gc-only. They are exclusive, and -fobjc-gc-only // takes precedence. const Arg *GCArg = Args.getLastArg(options::OPT_fobjc_gc_only); if (!GCArg) GCArg = Args.getLastArg(options::OPT_fobjc_gc); if (GCArg) { if (ARC) { D.Diag(diag::err_drv_objc_gc_arr) << GCArg->getAsString(Args); } else if (getToolChain().SupportsObjCGC()) { GCArg->render(Args, CmdArgs); } else { // FIXME: We should move this to a hard error. D.Diag(diag::warn_drv_objc_gc_unsupported) << GCArg->getAsString(Args); } } // Add exception args. addExceptionArgs(Args, InputType, getToolChain().getTriple(), KernelOrKext, objcRuntime, CmdArgs); if (getToolChain().UseSjLjExceptions()) CmdArgs.push_back("-fsjlj-exceptions"); // C++ "sane" operator new. if (!Args.hasFlag(options::OPT_fassume_sane_operator_new, options::OPT_fno_assume_sane_operator_new)) CmdArgs.push_back("-fno-assume-sane-operator-new"); // -fconstant-cfstrings is default, and may be subject to argument translation // on Darwin. if (!Args.hasFlag(options::OPT_fconstant_cfstrings, options::OPT_fno_constant_cfstrings) || !Args.hasFlag(options::OPT_mconstant_cfstrings, options::OPT_mno_constant_cfstrings)) CmdArgs.push_back("-fno-constant-cfstrings"); // -fshort-wchar default varies depending on platform; only // pass if specified. if (Arg *A = Args.getLastArg(options::OPT_fshort_wchar)) A->render(Args, CmdArgs); // -fno-pascal-strings is default, only pass non-default. If the tool chain // happened to translate to -mpascal-strings, we want to back translate here. // // FIXME: This is gross; that translation should be pulled from the // tool chain. if (Args.hasFlag(options::OPT_fpascal_strings, options::OPT_fno_pascal_strings, false) || Args.hasFlag(options::OPT_mpascal_strings, options::OPT_mno_pascal_strings, false)) CmdArgs.push_back("-fpascal-strings"); // Honor -fpack-struct= and -fpack-struct, if given. Note that // -fno-pack-struct doesn't apply to -fpack-struct=. if (Arg *A = Args.getLastArg(options::OPT_fpack_struct_EQ)) { std::string PackStructStr = "-fpack-struct="; PackStructStr += A->getValue(); CmdArgs.push_back(Args.MakeArgString(PackStructStr)); } else if (Args.hasFlag(options::OPT_fpack_struct, options::OPT_fno_pack_struct, false)) { CmdArgs.push_back("-fpack-struct=1"); } if (Args.hasArg(options::OPT_mkernel) || Args.hasArg(options::OPT_fapple_kext)) { if (!Args.hasArg(options::OPT_fcommon)) CmdArgs.push_back("-fno-common"); Args.ClaimAllArgs(options::OPT_fno_common); } // -fcommon is default, only pass non-default. else if (!Args.hasFlag(options::OPT_fcommon, options::OPT_fno_common)) CmdArgs.push_back("-fno-common"); // -fsigned-bitfields is default, and clang doesn't yet support // -funsigned-bitfields. if (!Args.hasFlag(options::OPT_fsigned_bitfields, options::OPT_funsigned_bitfields)) D.Diag(diag::warn_drv_clang_unsupported) << Args.getLastArg(options::OPT_funsigned_bitfields)->getAsString(Args); // -fsigned-bitfields is default, and clang doesn't support -fno-for-scope. if (!Args.hasFlag(options::OPT_ffor_scope, options::OPT_fno_for_scope)) D.Diag(diag::err_drv_clang_unsupported) << Args.getLastArg(options::OPT_fno_for_scope)->getAsString(Args); // -fcaret-diagnostics is default. if (!Args.hasFlag(options::OPT_fcaret_diagnostics, options::OPT_fno_caret_diagnostics, true)) CmdArgs.push_back("-fno-caret-diagnostics"); // -fdiagnostics-fixit-info is default, only pass non-default. if (!Args.hasFlag(options::OPT_fdiagnostics_fixit_info, options::OPT_fno_diagnostics_fixit_info)) CmdArgs.push_back("-fno-diagnostics-fixit-info"); // Enable -fdiagnostics-show-option by default. if (Args.hasFlag(options::OPT_fdiagnostics_show_option, options::OPT_fno_diagnostics_show_option)) CmdArgs.push_back("-fdiagnostics-show-option"); if (const Arg *A = Args.getLastArg(options::OPT_fdiagnostics_show_category_EQ)) { CmdArgs.push_back("-fdiagnostics-show-category"); CmdArgs.push_back(A->getValue()); } if (const Arg *A = Args.getLastArg(options::OPT_fdiagnostics_format_EQ)) { CmdArgs.push_back("-fdiagnostics-format"); CmdArgs.push_back(A->getValue()); } if (Arg *A = Args.getLastArg( options::OPT_fdiagnostics_show_note_include_stack, options::OPT_fno_diagnostics_show_note_include_stack)) { if (A->getOption().matches( options::OPT_fdiagnostics_show_note_include_stack)) CmdArgs.push_back("-fdiagnostics-show-note-include-stack"); else CmdArgs.push_back("-fno-diagnostics-show-note-include-stack"); } // Color diagnostics are the default, unless the terminal doesn't support // them. if (Args.hasFlag(options::OPT_fcolor_diagnostics, options::OPT_fno_color_diagnostics, llvm::sys::Process::StandardErrHasColors())) CmdArgs.push_back("-fcolor-diagnostics"); if (!Args.hasFlag(options::OPT_fshow_source_location, options::OPT_fno_show_source_location)) CmdArgs.push_back("-fno-show-source-location"); if (!Args.hasFlag(options::OPT_fshow_column, options::OPT_fno_show_column, true)) CmdArgs.push_back("-fno-show-column"); if (!Args.hasFlag(options::OPT_fspell_checking, options::OPT_fno_spell_checking)) CmdArgs.push_back("-fno-spell-checking"); // Silently ignore -fasm-blocks for now. (void) Args.hasFlag(options::OPT_fasm_blocks, options::OPT_fno_asm_blocks, false); if (Arg *A = Args.getLastArg(options::OPT_fshow_overloads_EQ)) A->render(Args, CmdArgs); // -fdollars-in-identifiers default varies depending on platform and // language; only pass if specified. if (Arg *A = Args.getLastArg(options::OPT_fdollars_in_identifiers, options::OPT_fno_dollars_in_identifiers)) { if (A->getOption().matches(options::OPT_fdollars_in_identifiers)) CmdArgs.push_back("-fdollars-in-identifiers"); else CmdArgs.push_back("-fno-dollars-in-identifiers"); } // -funit-at-a-time is default, and we don't support -fno-unit-at-a-time for // practical purposes. if (Arg *A = Args.getLastArg(options::OPT_funit_at_a_time, options::OPT_fno_unit_at_a_time)) { if (A->getOption().matches(options::OPT_fno_unit_at_a_time)) D.Diag(diag::warn_drv_clang_unsupported) << A->getAsString(Args); } if (Args.hasFlag(options::OPT_fapple_pragma_pack, options::OPT_fno_apple_pragma_pack, false)) CmdArgs.push_back("-fapple-pragma-pack"); // Default to -fno-builtin-str{cat,cpy} on Darwin for ARM. // // FIXME: This is disabled until clang -cc1 supports -fno-builtin-foo. PR4941. #if 0 if (getToolChain().getTriple().isOSDarwin() && (getToolChain().getTriple().getArch() == llvm::Triple::arm || getToolChain().getTriple().getArch() == llvm::Triple::thumb)) { if (!Args.hasArg(options::OPT_fbuiltin_strcat)) CmdArgs.push_back("-fno-builtin-strcat"); if (!Args.hasArg(options::OPT_fbuiltin_strcpy)) CmdArgs.push_back("-fno-builtin-strcpy"); } #endif // Only allow -traditional or -traditional-cpp outside in preprocessing modes. if (Arg *A = Args.getLastArg(options::OPT_traditional, options::OPT_traditional_cpp)) { if (isa(JA)) CmdArgs.push_back("-traditional-cpp"); else D.Diag(diag::err_drv_clang_unsupported) << A->getAsString(Args); } Args.AddLastArg(CmdArgs, options::OPT_dM); Args.AddLastArg(CmdArgs, options::OPT_dD); // Handle serialized diagnostics. if (Arg *A = Args.getLastArg(options::OPT__serialize_diags)) { CmdArgs.push_back("-serialize-diagnostic-file"); CmdArgs.push_back(Args.MakeArgString(A->getValue())); } if (Args.hasArg(options::OPT_fretain_comments_from_system_headers)) CmdArgs.push_back("-fretain-comments-from-system-headers"); // Forward -Xclang arguments to -cc1, and -mllvm arguments to the LLVM option // parser. Args.AddAllArgValues(CmdArgs, options::OPT_Xclang); for (arg_iterator it = Args.filtered_begin(options::OPT_mllvm), ie = Args.filtered_end(); it != ie; ++it) { (*it)->claim(); // We translate this by hand to the -cc1 argument, since nightly test uses // it and developers have been trained to spell it with -mllvm. if (StringRef((*it)->getValue(0)) == "-disable-llvm-optzns") CmdArgs.push_back("-disable-llvm-optzns"); else (*it)->render(Args, CmdArgs); } if (Output.getType() == types::TY_Dependencies) { // Handled with other dependency code. } else if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Invalid output."); } for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back("-x"); if (Args.hasArg(options::OPT_rewrite_objc)) CmdArgs.push_back(types::getTypeName(types::TY_PP_ObjCXX)); else CmdArgs.push_back(types::getTypeName(II.getType())); if (II.isFilename()) CmdArgs.push_back(II.getFilename()); else II.getInputArg().renderAsInput(Args, CmdArgs); } Args.AddAllArgs(CmdArgs, options::OPT_undef); const char *Exec = getToolChain().getDriver().getClangProgramPath(); // Optionally embed the -cc1 level arguments into the debug info, for build // analysis. if (getToolChain().UseDwarfDebugFlags()) { ArgStringList OriginalArgs; for (ArgList::const_iterator it = Args.begin(), ie = Args.end(); it != ie; ++it) (*it)->render(Args, OriginalArgs); SmallString<256> Flags; Flags += Exec; for (unsigned i = 0, e = OriginalArgs.size(); i != e; ++i) { Flags += " "; Flags += OriginalArgs[i]; } CmdArgs.push_back("-dwarf-debug-flags"); CmdArgs.push_back(Args.MakeArgString(Flags.str())); } C.addCommand(new Command(JA, *this, Exec, CmdArgs)); if (Arg *A = Args.getLastArg(options::OPT_pg)) if (Args.hasArg(options::OPT_fomit_frame_pointer)) D.Diag(diag::err_drv_argument_not_allowed_with) << "-fomit-frame-pointer" << A->getAsString(Args); // Claim some arguments which clang supports automatically. // -fpch-preprocess is used with gcc to add a special marker in the output to // include the PCH file. Clang's PTH solution is completely transparent, so we // do not need to deal with it at all. Args.ClaimAllArgs(options::OPT_fpch_preprocess); // Claim some arguments which clang doesn't support, but we don't // care to warn the user about. Args.ClaimAllArgs(options::OPT_clang_ignored_f_Group); Args.ClaimAllArgs(options::OPT_clang_ignored_m_Group); // Disable warnings for clang -E -use-gold-plugin -emit-llvm foo.c Args.ClaimAllArgs(options::OPT_use_gold_plugin); Args.ClaimAllArgs(options::OPT_emit_llvm); } void ClangAs::AddARMTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const { const Driver &D = getToolChain().getDriver(); llvm::Triple Triple = getToolChain().getTriple(); // Set the CPU based on -march= and -mcpu=. CmdArgs.push_back("-target-cpu"); CmdArgs.push_back(Args.MakeArgString(getARMTargetCPU(Args, Triple))); // Honor -mfpu=. if (const Arg *A = Args.getLastArg(options::OPT_mfpu_EQ)) addFPUArgs(D, A, Args, CmdArgs); // Honor -mfpmath=. if (const Arg *A = Args.getLastArg(options::OPT_mfpmath_EQ)) addFPMathArgs(D, A, Args, CmdArgs, getARMTargetCPU(Args, Triple)); } +void ClangAs::AddX86TargetArgs(const ArgList &Args, + ArgStringList &CmdArgs) const { + // Set the CPU based on -march=. + if (const char *CPUName = getX86TargetCPU(Args, getToolChain().getTriple())) { + CmdArgs.push_back("-target-cpu"); + CmdArgs.push_back(CPUName); + } +} + /// Add options related to the Objective-C runtime/ABI. /// /// Returns true if the runtime is non-fragile. ObjCRuntime Clang::AddObjCRuntimeArgs(const ArgList &args, ArgStringList &cmdArgs, RewriteKind rewriteKind) const { // Look for the controlling runtime option. Arg *runtimeArg = args.getLastArg(options::OPT_fnext_runtime, options::OPT_fgnu_runtime, options::OPT_fobjc_runtime_EQ); // Just forward -fobjc-runtime= to the frontend. This supercedes // options about fragility. if (runtimeArg && runtimeArg->getOption().matches(options::OPT_fobjc_runtime_EQ)) { ObjCRuntime runtime; StringRef value = runtimeArg->getValue(); if (runtime.tryParse(value)) { getToolChain().getDriver().Diag(diag::err_drv_unknown_objc_runtime) << value; } runtimeArg->render(args, cmdArgs); return runtime; } // Otherwise, we'll need the ABI "version". Version numbers are // slightly confusing for historical reasons: // 1 - Traditional "fragile" ABI // 2 - Non-fragile ABI, version 1 // 3 - Non-fragile ABI, version 2 unsigned objcABIVersion = 1; // If -fobjc-abi-version= is present, use that to set the version. if (Arg *abiArg = args.getLastArg(options::OPT_fobjc_abi_version_EQ)) { StringRef value = abiArg->getValue(); if (value == "1") objcABIVersion = 1; else if (value == "2") objcABIVersion = 2; else if (value == "3") objcABIVersion = 3; else getToolChain().getDriver().Diag(diag::err_drv_clang_unsupported) << value; } else { // Otherwise, determine if we are using the non-fragile ABI. bool nonFragileABIIsDefault = (rewriteKind == RK_NonFragile || (rewriteKind == RK_None && getToolChain().IsObjCNonFragileABIDefault())); if (args.hasFlag(options::OPT_fobjc_nonfragile_abi, options::OPT_fno_objc_nonfragile_abi, nonFragileABIIsDefault)) { // Determine the non-fragile ABI version to use. #ifdef DISABLE_DEFAULT_NONFRAGILEABI_TWO unsigned nonFragileABIVersion = 1; #else unsigned nonFragileABIVersion = 2; #endif if (Arg *abiArg = args.getLastArg( options::OPT_fobjc_nonfragile_abi_version_EQ)) { StringRef value = abiArg->getValue(); if (value == "1") nonFragileABIVersion = 1; else if (value == "2") nonFragileABIVersion = 2; else getToolChain().getDriver().Diag(diag::err_drv_clang_unsupported) << value; } objcABIVersion = 1 + nonFragileABIVersion; } else { objcABIVersion = 1; } } // We don't actually care about the ABI version other than whether // it's non-fragile. bool isNonFragile = objcABIVersion != 1; // If we have no runtime argument, ask the toolchain for its default runtime. // However, the rewriter only really supports the Mac runtime, so assume that. ObjCRuntime runtime; if (!runtimeArg) { switch (rewriteKind) { case RK_None: runtime = getToolChain().getDefaultObjCRuntime(isNonFragile); break; case RK_Fragile: runtime = ObjCRuntime(ObjCRuntime::FragileMacOSX, VersionTuple()); break; case RK_NonFragile: runtime = ObjCRuntime(ObjCRuntime::MacOSX, VersionTuple()); break; } // -fnext-runtime } else if (runtimeArg->getOption().matches(options::OPT_fnext_runtime)) { // On Darwin, make this use the default behavior for the toolchain. if (getToolChain().getTriple().isOSDarwin()) { runtime = getToolChain().getDefaultObjCRuntime(isNonFragile); // Otherwise, build for a generic macosx port. } else { runtime = ObjCRuntime(ObjCRuntime::MacOSX, VersionTuple()); } // -fgnu-runtime } else { assert(runtimeArg->getOption().matches(options::OPT_fgnu_runtime)); // Legacy behaviour is to target the gnustep runtime if we are i // non-fragile mode or the GCC runtime in fragile mode. if (isNonFragile) runtime = ObjCRuntime(ObjCRuntime::GNUstep, VersionTuple(1,6)); else runtime = ObjCRuntime(ObjCRuntime::GCC, VersionTuple()); } cmdArgs.push_back(args.MakeArgString( "-fobjc-runtime=" + runtime.getAsString())); return runtime; } void ClangAs::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; assert(Inputs.size() == 1 && "Unexpected number of inputs."); const InputInfo &Input = Inputs[0]; // Don't warn about "clang -w -c foo.s" Args.ClaimAllArgs(options::OPT_w); // and "clang -emit-llvm -c foo.s" Args.ClaimAllArgs(options::OPT_emit_llvm); // and "clang -use-gold-plugin -c foo.s" Args.ClaimAllArgs(options::OPT_use_gold_plugin); // Invoke ourselves in -cc1as mode. // // FIXME: Implement custom jobs for internal actions. CmdArgs.push_back("-cc1as"); // Add the "effective" target triple. CmdArgs.push_back("-triple"); std::string TripleStr = getToolChain().ComputeEffectiveClangTriple(Args, Input.getType()); CmdArgs.push_back(Args.MakeArgString(TripleStr)); // Set the output mode, we currently only expect to be used as a real // assembler. CmdArgs.push_back("-filetype"); CmdArgs.push_back("obj"); if (UseRelaxAll(C, Args)) CmdArgs.push_back("-relax-all"); // Add target specific cpu and features flags. switch(getToolChain().getTriple().getArch()) { default: break; case llvm::Triple::arm: case llvm::Triple::thumb: AddARMTargetArgs(Args, CmdArgs); break; + + case llvm::Triple::x86: + case llvm::Triple::x86_64: + AddX86TargetArgs(Args, CmdArgs); + break; } // Ignore explicit -force_cpusubtype_ALL option. (void) Args.hasArg(options::OPT_force__cpusubtype__ALL); // Determine the original source input. const Action *SourceAction = &JA; while (SourceAction->getKind() != Action::InputClass) { assert(!SourceAction->getInputs().empty() && "unexpected root action!"); SourceAction = SourceAction->getInputs()[0]; } // Forward -g, assuming we are dealing with an actual assembly file. if (SourceAction->getType() == types::TY_Asm || SourceAction->getType() == types::TY_PP_Asm) { Args.ClaimAllArgs(options::OPT_g_Group); if (Arg *A = Args.getLastArg(options::OPT_g_Group)) if (!A->getOption().matches(options::OPT_g0)) CmdArgs.push_back("-g"); } // Optionally embed the -cc1as level arguments into the debug info, for build // analysis. if (getToolChain().UseDwarfDebugFlags()) { ArgStringList OriginalArgs; for (ArgList::const_iterator it = Args.begin(), ie = Args.end(); it != ie; ++it) (*it)->render(Args, OriginalArgs); SmallString<256> Flags; const char *Exec = getToolChain().getDriver().getClangProgramPath(); Flags += Exec; for (unsigned i = 0, e = OriginalArgs.size(); i != e; ++i) { Flags += " "; Flags += OriginalArgs[i]; } CmdArgs.push_back("-dwarf-debug-flags"); CmdArgs.push_back(Args.MakeArgString(Flags.str())); } // FIXME: Add -static support, once we have it. Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); Args.AddAllArgs(CmdArgs, options::OPT_mllvm); assert(Output.isFilename() && "Unexpected lipo output."); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); assert(Input.isFilename() && "Invalid input."); CmdArgs.push_back(Input.getFilename()); const char *Exec = getToolChain().getDriver().getClangProgramPath(); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void gcc::Common::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; for (ArgList::const_iterator it = Args.begin(), ie = Args.end(); it != ie; ++it) { Arg *A = *it; if (forwardToGCC(A->getOption())) { // Don't forward any -g arguments to assembly steps. if (isa(JA) && A->getOption().matches(options::OPT_g_Group)) continue; // It is unfortunate that we have to claim here, as this means // we will basically never report anything interesting for // platforms using a generic gcc, even if we are just using gcc // to get to the assembler. A->claim(); A->render(Args, CmdArgs); } } RenderExtraToolArgs(JA, CmdArgs); // If using a driver driver, force the arch. llvm::Triple::ArchType Arch = getToolChain().getArch(); if (getToolChain().getTriple().isOSDarwin()) { CmdArgs.push_back("-arch"); // FIXME: Remove these special cases. if (Arch == llvm::Triple::ppc) CmdArgs.push_back("ppc"); else if (Arch == llvm::Triple::ppc64) CmdArgs.push_back("ppc64"); else CmdArgs.push_back(Args.MakeArgString(getToolChain().getArchName())); } // Try to force gcc to match the tool chain we want, if we recognize // the arch. // // FIXME: The triple class should directly provide the information we want // here. if (Arch == llvm::Triple::x86 || Arch == llvm::Triple::ppc) CmdArgs.push_back("-m32"); else if (Arch == llvm::Triple::x86_64 || Arch == llvm::Triple::x86_64) CmdArgs.push_back("-m64"); if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Unexpected output"); CmdArgs.push_back("-fsyntax-only"); } Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); // Only pass -x if gcc will understand it; otherwise hope gcc // understands the suffix correctly. The main use case this would go // wrong in is for linker inputs if they happened to have an odd // suffix; really the only way to get this to happen is a command // like '-x foobar a.c' which will treat a.c like a linker input. // // FIXME: For the linker case specifically, can we safely convert // inputs into '-Wl,' options? for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; // Don't try to pass LLVM or AST inputs to a generic gcc. if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR || II.getType() == types::TY_LLVM_BC || II.getType() == types::TY_LTO_BC) D.Diag(diag::err_drv_no_linker_llvm_support) << getToolChain().getTripleString(); else if (II.getType() == types::TY_AST) D.Diag(diag::err_drv_no_ast_support) << getToolChain().getTripleString(); if (types::canTypeBeUserSpecified(II.getType())) { CmdArgs.push_back("-x"); CmdArgs.push_back(types::getTypeName(II.getType())); } if (II.isFilename()) CmdArgs.push_back(II.getFilename()); else { const Arg &A = II.getInputArg(); // Reverse translate some rewritten options. if (A.getOption().matches(options::OPT_Z_reserved_lib_stdcxx)) { CmdArgs.push_back("-lstdc++"); continue; } // Don't render as input, we need gcc to do the translations. A.render(Args, CmdArgs); } } const std::string customGCCName = D.getCCCGenericGCCName(); const char *GCCName; if (!customGCCName.empty()) GCCName = customGCCName.c_str(); else if (D.CCCIsCXX) { GCCName = "g++"; } else GCCName = "gcc"; const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath(GCCName)); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void gcc::Preprocess::RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const { CmdArgs.push_back("-E"); } void gcc::Precompile::RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const { // The type is good enough. } void gcc::Compile::RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const { const Driver &D = getToolChain().getDriver(); // If -flto, etc. are present then make sure not to force assembly output. if (JA.getType() == types::TY_LLVM_IR || JA.getType() == types::TY_LTO_IR || JA.getType() == types::TY_LLVM_BC || JA.getType() == types::TY_LTO_BC) CmdArgs.push_back("-c"); else { if (JA.getType() != types::TY_PP_Asm) D.Diag(diag::err_drv_invalid_gcc_output_type) << getTypeName(JA.getType()); CmdArgs.push_back("-S"); } } void gcc::Assemble::RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const { CmdArgs.push_back("-c"); } void gcc::Link::RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const { // The types are (hopefully) good enough. } // Hexagon tools start. void hexagon::Assemble::RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const { } void hexagon::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; std::string MarchString = "-march="; MarchString += getHexagonTargetCPU(Args); CmdArgs.push_back(Args.MakeArgString(MarchString)); RenderExtraToolArgs(JA, CmdArgs); if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Unexpected output"); CmdArgs.push_back("-fsyntax-only"); } // Only pass -x if gcc will understand it; otherwise hope gcc // understands the suffix correctly. The main use case this would go // wrong in is for linker inputs if they happened to have an odd // suffix; really the only way to get this to happen is a command // like '-x foobar a.c' which will treat a.c like a linker input. // // FIXME: For the linker case specifically, can we safely convert // inputs into '-Wl,' options? for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; // Don't try to pass LLVM or AST inputs to a generic gcc. if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR || II.getType() == types::TY_LLVM_BC || II.getType() == types::TY_LTO_BC) D.Diag(clang::diag::err_drv_no_linker_llvm_support) << getToolChain().getTripleString(); else if (II.getType() == types::TY_AST) D.Diag(clang::diag::err_drv_no_ast_support) << getToolChain().getTripleString(); if (II.isFilename()) CmdArgs.push_back(II.getFilename()); else // Don't render as input, we need gcc to do the translations. FIXME: Pranav: What is this ? II.getInputArg().render(Args, CmdArgs); } const char *GCCName = "hexagon-as"; const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath(GCCName)); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void hexagon::Link::RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const { // The types are (hopefully) good enough. } void hexagon::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; for (ArgList::const_iterator it = Args.begin(), ie = Args.end(); it != ie; ++it) { Arg *A = *it; if (forwardToGCC(A->getOption())) { // Don't forward any -g arguments to assembly steps. if (isa(JA) && A->getOption().matches(options::OPT_g_Group)) continue; // It is unfortunate that we have to claim here, as this means // we will basically never report anything interesting for // platforms using a generic gcc, even if we are just using gcc // to get to the assembler. A->claim(); A->render(Args, CmdArgs); } } RenderExtraToolArgs(JA, CmdArgs); // Add Arch Information Arg *A; if ((A = getLastHexagonArchArg(Args))) { if (A->getOption().matches(options::OPT_m_Joined)) A->render(Args, CmdArgs); else CmdArgs.push_back (Args.MakeArgString("-m" + getHexagonTargetCPU(Args))); } else { CmdArgs.push_back (Args.MakeArgString("-m" + getHexagonTargetCPU(Args))); } CmdArgs.push_back("-mqdsp6-compat"); const char *GCCName; if (C.getDriver().CCCIsCXX) GCCName = "hexagon-g++"; else GCCName = "hexagon-gcc"; const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath(GCCName)); if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; // Don't try to pass LLVM or AST inputs to a generic gcc. if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR || II.getType() == types::TY_LLVM_BC || II.getType() == types::TY_LTO_BC) D.Diag(clang::diag::err_drv_no_linker_llvm_support) << getToolChain().getTripleString(); else if (II.getType() == types::TY_AST) D.Diag(clang::diag::err_drv_no_ast_support) << getToolChain().getTripleString(); if (II.isFilename()) CmdArgs.push_back(II.getFilename()); else // Don't render as input, we need gcc to do the translations. FIXME: Pranav: What is this ? II.getInputArg().render(Args, CmdArgs); } C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } // Hexagon tools end. llvm::Triple::ArchType darwin::getArchTypeForDarwinArchName(StringRef Str) { // See arch(3) and llvm-gcc's driver-driver.c. We don't implement support for // archs which Darwin doesn't use. // The matching this routine does is fairly pointless, since it is neither the // complete architecture list, nor a reasonable subset. The problem is that // historically the driver driver accepts this and also ties its -march= // handling to the architecture name, so we need to be careful before removing // support for it. // This code must be kept in sync with Clang's Darwin specific argument // translation. return llvm::StringSwitch(Str) .Cases("ppc", "ppc601", "ppc603", "ppc604", "ppc604e", llvm::Triple::ppc) .Cases("ppc750", "ppc7400", "ppc7450", "ppc970", llvm::Triple::ppc) .Case("ppc64", llvm::Triple::ppc64) .Cases("i386", "i486", "i486SX", "i586", "i686", llvm::Triple::x86) .Cases("pentium", "pentpro", "pentIIm3", "pentIIm5", "pentium4", llvm::Triple::x86) .Case("x86_64", llvm::Triple::x86_64) // This is derived from the driver driver. .Cases("arm", "armv4t", "armv5", "armv6", llvm::Triple::arm) .Cases("armv7", "armv7f", "armv7k", "armv7s", "xscale", llvm::Triple::arm) .Case("r600", llvm::Triple::r600) .Case("nvptx", llvm::Triple::nvptx) .Case("nvptx64", llvm::Triple::nvptx64) .Case("amdil", llvm::Triple::amdil) .Case("spir", llvm::Triple::spir) .Default(llvm::Triple::UnknownArch); } const char *darwin::CC1::getCC1Name(types::ID Type) const { switch (Type) { default: llvm_unreachable("Unexpected type for Darwin CC1 tool."); case types::TY_Asm: case types::TY_C: case types::TY_CHeader: case types::TY_PP_C: case types::TY_PP_CHeader: return "cc1"; case types::TY_ObjC: case types::TY_ObjCHeader: case types::TY_PP_ObjC: case types::TY_PP_ObjC_Alias: case types::TY_PP_ObjCHeader: return "cc1obj"; case types::TY_CXX: case types::TY_CXXHeader: case types::TY_PP_CXX: case types::TY_PP_CXXHeader: return "cc1plus"; case types::TY_ObjCXX: case types::TY_ObjCXXHeader: case types::TY_PP_ObjCXX: case types::TY_PP_ObjCXX_Alias: case types::TY_PP_ObjCXXHeader: return "cc1objplus"; } } void darwin::CC1::anchor() {} const char *darwin::CC1::getBaseInputName(const ArgList &Args, const InputInfoList &Inputs) { return Args.MakeArgString( llvm::sys::path::filename(Inputs[0].getBaseInput())); } const char *darwin::CC1::getBaseInputStem(const ArgList &Args, const InputInfoList &Inputs) { const char *Str = getBaseInputName(Args, Inputs); if (const char *End = strrchr(Str, '.')) return Args.MakeArgString(std::string(Str, End)); return Str; } const char * darwin::CC1::getDependencyFileName(const ArgList &Args, const InputInfoList &Inputs) { // FIXME: Think about this more. std::string Res; if (Arg *OutputOpt = Args.getLastArg(options::OPT_o)) { std::string Str(OutputOpt->getValue()); Res = Str.substr(0, Str.rfind('.')); } else { Res = darwin::CC1::getBaseInputStem(Args, Inputs); } return Args.MakeArgString(Res + ".d"); } void darwin::CC1::RemoveCC1UnsupportedArgs(ArgStringList &CmdArgs) const { for (ArgStringList::iterator it = CmdArgs.begin(), ie = CmdArgs.end(); it != ie;) { StringRef Option = *it; bool RemoveOption = false; // Erase both -fmodule-cache-path and its argument. if (Option.equals("-fmodule-cache-path") && it+2 != ie) { it = CmdArgs.erase(it, it+2); ie = CmdArgs.end(); continue; } // Remove unsupported -f options. if (Option.startswith("-f")) { // Remove -f/-fno- to reduce the number of cases. if (Option.startswith("-fno-")) Option = Option.substr(5); else Option = Option.substr(2); RemoveOption = llvm::StringSwitch(Option) .Case("altivec", true) .Case("modules", true) .Case("diagnostics-show-note-include-stack", true) .Default(false); } // Handle machine specific options. if (Option.startswith("-m")) { RemoveOption = llvm::StringSwitch(Option) .Case("-mthumb", true) .Case("-mno-thumb", true) .Case("-mno-fused-madd", true) .Case("-mlong-branch", true) .Case("-mlongcall", true) .Case("-mcpu=G4", true) .Case("-mcpu=G5", true) .Default(false); } // Handle warning options. if (Option.startswith("-W")) { // Remove -W/-Wno- to reduce the number of cases. if (Option.startswith("-Wno-")) Option = Option.substr(5); else Option = Option.substr(2); RemoveOption = llvm::StringSwitch(Option) .Case("address-of-temporary", true) .Case("ambiguous-member-template", true) .Case("analyzer-incompatible-plugin", true) .Case("array-bounds", true) .Case("array-bounds-pointer-arithmetic", true) .Case("bind-to-temporary-copy", true) .Case("bitwise-op-parentheses", true) .Case("bool-conversions", true) .Case("builtin-macro-redefined", true) .Case("c++-hex-floats", true) .Case("c++0x-compat", true) .Case("c++0x-extensions", true) .Case("c++0x-narrowing", true) .Case("c++11-compat", true) .Case("c++11-extensions", true) .Case("c++11-narrowing", true) .Case("conditional-uninitialized", true) .Case("constant-conversion", true) .Case("conversion-null", true) .Case("CFString-literal", true) .Case("constant-logical-operand", true) .Case("custom-atomic-properties", true) .Case("default-arg-special-member", true) .Case("delegating-ctor-cycles", true) .Case("delete-non-virtual-dtor", true) .Case("deprecated-implementations", true) .Case("deprecated-writable-strings", true) .Case("distributed-object-modifiers", true) .Case("duplicate-method-arg", true) .Case("dynamic-class-memaccess", true) .Case("enum-compare", true) .Case("enum-conversion", true) .Case("exit-time-destructors", true) .Case("gnu", true) .Case("gnu-designator", true) .Case("header-hygiene", true) .Case("idiomatic-parentheses", true) .Case("ignored-qualifiers", true) .Case("implicit-atomic-properties", true) .Case("incompatible-pointer-types", true) .Case("incomplete-implementation", true) .Case("int-conversion", true) .Case("initializer-overrides", true) .Case("invalid-noreturn", true) .Case("invalid-token-paste", true) .Case("language-extension-token", true) .Case("literal-conversion", true) .Case("literal-range", true) .Case("local-type-template-args", true) .Case("logical-op-parentheses", true) .Case("method-signatures", true) .Case("microsoft", true) .Case("mismatched-tags", true) .Case("missing-method-return-type", true) .Case("non-pod-varargs", true) .Case("nonfragile-abi2", true) .Case("null-arithmetic", true) .Case("null-dereference", true) .Case("out-of-line-declaration", true) .Case("overriding-method-mismatch", true) .Case("readonly-setter-attrs", true) .Case("return-stack-address", true) .Case("self-assign", true) .Case("semicolon-before-method-body", true) .Case("sentinel", true) .Case("shift-overflow", true) .Case("shift-sign-overflow", true) .Case("sign-conversion", true) .Case("sizeof-array-argument", true) .Case("sizeof-pointer-memaccess", true) .Case("string-compare", true) .Case("super-class-method-mismatch", true) .Case("tautological-compare", true) .Case("typedef-redefinition", true) .Case("typename-missing", true) .Case("undefined-reinterpret-cast", true) .Case("unknown-warning-option", true) .Case("unnamed-type-template-args", true) .Case("unneeded-internal-declaration", true) .Case("unneeded-member-function", true) .Case("unused-comparison", true) .Case("unused-exception-parameter", true) .Case("unused-member-function", true) .Case("unused-result", true) .Case("vector-conversions", true) .Case("vla", true) .Case("used-but-marked-unused", true) .Case("weak-vtables", true) .Default(false); } // if (Option.startswith("-W")) if (RemoveOption) { it = CmdArgs.erase(it); ie = CmdArgs.end(); } else { ++it; } } } void darwin::CC1::AddCC1Args(const ArgList &Args, ArgStringList &CmdArgs) const { const Driver &D = getToolChain().getDriver(); CheckCodeGenerationOptions(D, Args); // Derived from cc1 spec. if ((!Args.hasArg(options::OPT_mkernel) || (getDarwinToolChain().isTargetIPhoneOS() && !getDarwinToolChain().isIPhoneOSVersionLT(6, 0))) && !Args.hasArg(options::OPT_static) && !Args.hasArg(options::OPT_mdynamic_no_pic)) CmdArgs.push_back("-fPIC"); if (getToolChain().getTriple().getArch() == llvm::Triple::arm || getToolChain().getTriple().getArch() == llvm::Triple::thumb) { if (!Args.hasArg(options::OPT_fbuiltin_strcat)) CmdArgs.push_back("-fno-builtin-strcat"); if (!Args.hasArg(options::OPT_fbuiltin_strcpy)) CmdArgs.push_back("-fno-builtin-strcpy"); } if (Args.hasArg(options::OPT_g_Flag) && !Args.hasArg(options::OPT_fno_eliminate_unused_debug_symbols)) CmdArgs.push_back("-feliminate-unused-debug-symbols"); } void darwin::CC1::AddCC1OptionsArgs(const ArgList &Args, ArgStringList &CmdArgs, const InputInfoList &Inputs, const ArgStringList &OutputArgs) const { const Driver &D = getToolChain().getDriver(); // Derived from cc1_options spec. if (Args.hasArg(options::OPT_fast) || Args.hasArg(options::OPT_fastf) || Args.hasArg(options::OPT_fastcp)) CmdArgs.push_back("-O3"); if (Arg *A = Args.getLastArg(options::OPT_pg)) if (Args.hasArg(options::OPT_fomit_frame_pointer)) D.Diag(diag::err_drv_argument_not_allowed_with) << A->getAsString(Args) << "-fomit-frame-pointer"; AddCC1Args(Args, CmdArgs); if (!Args.hasArg(options::OPT_Q)) CmdArgs.push_back("-quiet"); CmdArgs.push_back("-dumpbase"); CmdArgs.push_back(darwin::CC1::getBaseInputName(Args, Inputs)); Args.AddAllArgs(CmdArgs, options::OPT_d_Group); Args.AddAllArgs(CmdArgs, options::OPT_m_Group); Args.AddAllArgs(CmdArgs, options::OPT_a_Group); // FIXME: The goal is to use the user provided -o if that is our // final output, otherwise to drive from the original input // name. Find a clean way to go about this. if ((Args.hasArg(options::OPT_c) || Args.hasArg(options::OPT_S)) && Args.hasArg(options::OPT_o)) { Arg *OutputOpt = Args.getLastArg(options::OPT_o); CmdArgs.push_back("-auxbase-strip"); CmdArgs.push_back(OutputOpt->getValue()); } else { CmdArgs.push_back("-auxbase"); CmdArgs.push_back(darwin::CC1::getBaseInputStem(Args, Inputs)); } Args.AddAllArgs(CmdArgs, options::OPT_g_Group); Args.AddAllArgs(CmdArgs, options::OPT_O); // FIXME: -Wall is getting some special treatment. Investigate. Args.AddAllArgs(CmdArgs, options::OPT_W_Group, options::OPT_pedantic_Group); Args.AddLastArg(CmdArgs, options::OPT_w); Args.AddAllArgs(CmdArgs, options::OPT_std_EQ, options::OPT_ansi, options::OPT_trigraphs); if (!Args.getLastArg(options::OPT_std_EQ, options::OPT_ansi)) { // Honor -std-default. Args.AddAllArgsTranslated(CmdArgs, options::OPT_std_default_EQ, "-std=", /*Joined=*/true); } if (Args.hasArg(options::OPT_v)) CmdArgs.push_back("-version"); if (Args.hasArg(options::OPT_pg) && getToolChain().SupportsProfiling()) CmdArgs.push_back("-p"); Args.AddLastArg(CmdArgs, options::OPT_p); // The driver treats -fsyntax-only specially. if (getToolChain().getTriple().getArch() == llvm::Triple::arm || getToolChain().getTriple().getArch() == llvm::Triple::thumb) { // Removes -fbuiltin-str{cat,cpy}; these aren't recognized by cc1 but are // used to inhibit the default -fno-builtin-str{cat,cpy}. // // FIXME: Should we grow a better way to deal with "removing" args? for (arg_iterator it = Args.filtered_begin(options::OPT_f_Group, options::OPT_fsyntax_only), ie = Args.filtered_end(); it != ie; ++it) { if (!(*it)->getOption().matches(options::OPT_fbuiltin_strcat) && !(*it)->getOption().matches(options::OPT_fbuiltin_strcpy)) { (*it)->claim(); (*it)->render(Args, CmdArgs); } } } else Args.AddAllArgs(CmdArgs, options::OPT_f_Group, options::OPT_fsyntax_only); // Claim Clang only -f options, they aren't worth warning about. Args.ClaimAllArgs(options::OPT_f_clang_Group); Args.AddAllArgs(CmdArgs, options::OPT_undef); if (Args.hasArg(options::OPT_Qn)) CmdArgs.push_back("-fno-ident"); // FIXME: This isn't correct. //Args.AddLastArg(CmdArgs, options::OPT__help) //Args.AddLastArg(CmdArgs, options::OPT__targetHelp) CmdArgs.append(OutputArgs.begin(), OutputArgs.end()); // FIXME: Still don't get what is happening here. Investigate. Args.AddAllArgs(CmdArgs, options::OPT__param); if (Args.hasArg(options::OPT_fmudflap) || Args.hasArg(options::OPT_fmudflapth)) { CmdArgs.push_back("-fno-builtin"); CmdArgs.push_back("-fno-merge-constants"); } if (Args.hasArg(options::OPT_coverage)) { CmdArgs.push_back("-fprofile-arcs"); CmdArgs.push_back("-ftest-coverage"); } if (types::isCXX(Inputs[0].getType())) CmdArgs.push_back("-D__private_extern__=extern"); } void darwin::CC1::AddCPPOptionsArgs(const ArgList &Args, ArgStringList &CmdArgs, const InputInfoList &Inputs, const ArgStringList &OutputArgs) const { // Derived from cpp_options AddCPPUniqueOptionsArgs(Args, CmdArgs, Inputs); CmdArgs.append(OutputArgs.begin(), OutputArgs.end()); AddCC1Args(Args, CmdArgs); // NOTE: The code below has some commonality with cpp_options, but // in classic gcc style ends up sending things in different // orders. This may be a good merge candidate once we drop pedantic // compatibility. Args.AddAllArgs(CmdArgs, options::OPT_m_Group); Args.AddAllArgs(CmdArgs, options::OPT_std_EQ, options::OPT_ansi, options::OPT_trigraphs); if (!Args.getLastArg(options::OPT_std_EQ, options::OPT_ansi)) { // Honor -std-default. Args.AddAllArgsTranslated(CmdArgs, options::OPT_std_default_EQ, "-std=", /*Joined=*/true); } Args.AddAllArgs(CmdArgs, options::OPT_W_Group, options::OPT_pedantic_Group); Args.AddLastArg(CmdArgs, options::OPT_w); // The driver treats -fsyntax-only specially. Args.AddAllArgs(CmdArgs, options::OPT_f_Group, options::OPT_fsyntax_only); // Claim Clang only -f options, they aren't worth warning about. Args.ClaimAllArgs(options::OPT_f_clang_Group); if (Args.hasArg(options::OPT_g_Group) && !Args.hasArg(options::OPT_g0) && !Args.hasArg(options::OPT_fno_working_directory)) CmdArgs.push_back("-fworking-directory"); Args.AddAllArgs(CmdArgs, options::OPT_O); Args.AddAllArgs(CmdArgs, options::OPT_undef); if (Args.hasArg(options::OPT_save_temps)) CmdArgs.push_back("-fpch-preprocess"); } void darwin::CC1::AddCPPUniqueOptionsArgs(const ArgList &Args, ArgStringList &CmdArgs, const InputInfoList &Inputs) const { const Driver &D = getToolChain().getDriver(); CheckPreprocessingOptions(D, Args); // Derived from cpp_unique_options. // -{C,CC} only with -E is checked in CheckPreprocessingOptions(). Args.AddLastArg(CmdArgs, options::OPT_C); Args.AddLastArg(CmdArgs, options::OPT_CC); if (!Args.hasArg(options::OPT_Q)) CmdArgs.push_back("-quiet"); Args.AddAllArgs(CmdArgs, options::OPT_nostdinc); Args.AddAllArgs(CmdArgs, options::OPT_nostdincxx); Args.AddLastArg(CmdArgs, options::OPT_v); Args.AddAllArgs(CmdArgs, options::OPT_I_Group, options::OPT_F); Args.AddLastArg(CmdArgs, options::OPT_P); // FIXME: Handle %I properly. if (getToolChain().getArch() == llvm::Triple::x86_64) { CmdArgs.push_back("-imultilib"); CmdArgs.push_back("x86_64"); } if (Args.hasArg(options::OPT_MD)) { CmdArgs.push_back("-MD"); CmdArgs.push_back(darwin::CC1::getDependencyFileName(Args, Inputs)); } if (Args.hasArg(options::OPT_MMD)) { CmdArgs.push_back("-MMD"); CmdArgs.push_back(darwin::CC1::getDependencyFileName(Args, Inputs)); } Args.AddLastArg(CmdArgs, options::OPT_M); Args.AddLastArg(CmdArgs, options::OPT_MM); Args.AddAllArgs(CmdArgs, options::OPT_MF); Args.AddLastArg(CmdArgs, options::OPT_MG); Args.AddLastArg(CmdArgs, options::OPT_MP); Args.AddAllArgs(CmdArgs, options::OPT_MQ); Args.AddAllArgs(CmdArgs, options::OPT_MT); if (!Args.hasArg(options::OPT_M) && !Args.hasArg(options::OPT_MM) && (Args.hasArg(options::OPT_MD) || Args.hasArg(options::OPT_MMD))) { if (Arg *OutputOpt = Args.getLastArg(options::OPT_o)) { CmdArgs.push_back("-MQ"); CmdArgs.push_back(OutputOpt->getValue()); } } Args.AddLastArg(CmdArgs, options::OPT_remap); if (Args.hasArg(options::OPT_g3)) CmdArgs.push_back("-dD"); Args.AddLastArg(CmdArgs, options::OPT_H); AddCPPArgs(Args, CmdArgs); Args.AddAllArgs(CmdArgs, options::OPT_D, options::OPT_U, options::OPT_A); Args.AddAllArgs(CmdArgs, options::OPT_i_Group); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back(II.getFilename()); } Args.AddAllArgValues(CmdArgs, options::OPT_Wp_COMMA, options::OPT_Xpreprocessor); if (Args.hasArg(options::OPT_fmudflap)) { CmdArgs.push_back("-D_MUDFLAP"); CmdArgs.push_back("-include"); CmdArgs.push_back("mf-runtime.h"); } if (Args.hasArg(options::OPT_fmudflapth)) { CmdArgs.push_back("-D_MUDFLAP"); CmdArgs.push_back("-D_MUDFLAPTH"); CmdArgs.push_back("-include"); CmdArgs.push_back("mf-runtime.h"); } } void darwin::CC1::AddCPPArgs(const ArgList &Args, ArgStringList &CmdArgs) const { // Derived from cpp spec. if (Args.hasArg(options::OPT_static)) { // The gcc spec is broken here, it refers to dynamic but // that has been translated. Start by being bug compatible. // if (!Args.hasArg(arglist.parser.dynamicOption)) CmdArgs.push_back("-D__STATIC__"); } else CmdArgs.push_back("-D__DYNAMIC__"); if (Args.hasArg(options::OPT_pthread)) CmdArgs.push_back("-D_REENTRANT"); } void darwin::Preprocess::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; assert(Inputs.size() == 1 && "Unexpected number of inputs!"); CmdArgs.push_back("-E"); if (Args.hasArg(options::OPT_traditional) || Args.hasArg(options::OPT_traditional_cpp)) CmdArgs.push_back("-traditional-cpp"); ArgStringList OutputArgs; assert(Output.isFilename() && "Unexpected CC1 output."); OutputArgs.push_back("-o"); OutputArgs.push_back(Output.getFilename()); if (Args.hasArg(options::OPT_E) || getToolChain().getDriver().CCCIsCPP) { AddCPPOptionsArgs(Args, CmdArgs, Inputs, OutputArgs); } else { AddCPPOptionsArgs(Args, CmdArgs, Inputs, ArgStringList()); CmdArgs.append(OutputArgs.begin(), OutputArgs.end()); } Args.AddAllArgs(CmdArgs, options::OPT_d_Group); RemoveCC1UnsupportedArgs(CmdArgs); const char *CC1Name = getCC1Name(Inputs[0].getType()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath(CC1Name)); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void darwin::Compile::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; assert(Inputs.size() == 1 && "Unexpected number of inputs!"); // Silence warning about unused --serialize-diagnostics Args.ClaimAllArgs(options::OPT__serialize_diags); types::ID InputType = Inputs[0].getType(); if (const Arg *A = Args.getLastArg(options::OPT_traditional)) D.Diag(diag::err_drv_argument_only_allowed_with) << A->getAsString(Args) << "-E"; if (JA.getType() == types::TY_LLVM_IR || JA.getType() == types::TY_LTO_IR) CmdArgs.push_back("-emit-llvm"); else if (JA.getType() == types::TY_LLVM_BC || JA.getType() == types::TY_LTO_BC) CmdArgs.push_back("-emit-llvm-bc"); else if (Output.getType() == types::TY_AST) D.Diag(diag::err_drv_no_ast_support) << getToolChain().getTripleString(); else if (JA.getType() != types::TY_PP_Asm && JA.getType() != types::TY_PCH) D.Diag(diag::err_drv_invalid_gcc_output_type) << getTypeName(JA.getType()); ArgStringList OutputArgs; if (Output.getType() != types::TY_PCH) { OutputArgs.push_back("-o"); if (Output.isNothing()) OutputArgs.push_back("/dev/null"); else OutputArgs.push_back(Output.getFilename()); } // There is no need for this level of compatibility, but it makes // diffing easier. bool OutputArgsEarly = (Args.hasArg(options::OPT_fsyntax_only) || Args.hasArg(options::OPT_S)); if (types::getPreprocessedType(InputType) != types::TY_INVALID) { AddCPPUniqueOptionsArgs(Args, CmdArgs, Inputs); if (OutputArgsEarly) { AddCC1OptionsArgs(Args, CmdArgs, Inputs, OutputArgs); } else { AddCC1OptionsArgs(Args, CmdArgs, Inputs, ArgStringList()); CmdArgs.append(OutputArgs.begin(), OutputArgs.end()); } } else { CmdArgs.push_back("-fpreprocessed"); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; // Reject AST inputs. if (II.getType() == types::TY_AST) { D.Diag(diag::err_drv_no_ast_support) << getToolChain().getTripleString(); return; } CmdArgs.push_back(II.getFilename()); } if (OutputArgsEarly) { AddCC1OptionsArgs(Args, CmdArgs, Inputs, OutputArgs); } else { AddCC1OptionsArgs(Args, CmdArgs, Inputs, ArgStringList()); CmdArgs.append(OutputArgs.begin(), OutputArgs.end()); } } if (Output.getType() == types::TY_PCH) { assert(Output.isFilename() && "Invalid PCH output."); CmdArgs.push_back("-o"); // NOTE: gcc uses a temp .s file for this, but there doesn't seem // to be a good reason. const char *TmpPath = C.getArgs().MakeArgString( D.GetTemporaryPath("cc", "s")); C.addTempFile(TmpPath); CmdArgs.push_back(TmpPath); // If we're emitting a pch file with the last 4 characters of ".pth" // and falling back to llvm-gcc we want to use ".gch" instead. std::string OutputFile(Output.getFilename()); size_t loc = OutputFile.rfind(".pth"); if (loc != std::string::npos) OutputFile.replace(loc, 4, ".gch"); const char *Tmp = C.getArgs().MakeArgString("--output-pch="+OutputFile); CmdArgs.push_back(Tmp); } RemoveCC1UnsupportedArgs(CmdArgs); const char *CC1Name = getCC1Name(Inputs[0].getType()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath(CC1Name)); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void darwin::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; assert(Inputs.size() == 1 && "Unexpected number of inputs."); const InputInfo &Input = Inputs[0]; // Determine the original source input. const Action *SourceAction = &JA; while (SourceAction->getKind() != Action::InputClass) { assert(!SourceAction->getInputs().empty() && "unexpected root action!"); SourceAction = SourceAction->getInputs()[0]; } // Forward -g, assuming we are dealing with an actual assembly file. if (SourceAction->getType() == types::TY_Asm || SourceAction->getType() == types::TY_PP_Asm) { if (Args.hasArg(options::OPT_gstabs)) CmdArgs.push_back("--gstabs"); else if (Args.hasArg(options::OPT_g_Group)) CmdArgs.push_back("-g"); } // Derived from asm spec. AddDarwinArch(Args, CmdArgs); // Use -force_cpusubtype_ALL on x86 by default. if (getToolChain().getTriple().getArch() == llvm::Triple::x86 || getToolChain().getTriple().getArch() == llvm::Triple::x86_64 || Args.hasArg(options::OPT_force__cpusubtype__ALL)) CmdArgs.push_back("-force_cpusubtype_ALL"); if (getToolChain().getTriple().getArch() != llvm::Triple::x86_64 && (((Args.hasArg(options::OPT_mkernel) || Args.hasArg(options::OPT_fapple_kext)) && (!getDarwinToolChain().isTargetIPhoneOS() || getDarwinToolChain().isIPhoneOSVersionLT(6, 0))) || Args.hasArg(options::OPT_static))) CmdArgs.push_back("-static"); Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); assert(Output.isFilename() && "Unexpected lipo output."); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); assert(Input.isFilename() && "Invalid input."); CmdArgs.push_back(Input.getFilename()); // asm_final spec is empty. const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void darwin::DarwinTool::anchor() {} void darwin::DarwinTool::AddDarwinArch(const ArgList &Args, ArgStringList &CmdArgs) const { StringRef ArchName = getDarwinToolChain().getDarwinArchName(Args); // Derived from darwin_arch spec. CmdArgs.push_back("-arch"); CmdArgs.push_back(Args.MakeArgString(ArchName)); // FIXME: Is this needed anymore? if (ArchName == "arm") CmdArgs.push_back("-force_cpusubtype_ALL"); } bool darwin::Link::NeedsTempPath(const InputInfoList &Inputs) const { // We only need to generate a temp path for LTO if we aren't compiling object // files. When compiling source files, we run 'dsymutil' after linking. We // don't run 'dsymutil' when compiling object files. for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) if (it->getType() != types::TY_Object) return true; return false; } void darwin::Link::AddLinkArgs(Compilation &C, const ArgList &Args, ArgStringList &CmdArgs, const InputInfoList &Inputs) const { const Driver &D = getToolChain().getDriver(); const toolchains::Darwin &DarwinTC = getDarwinToolChain(); unsigned Version[3] = { 0, 0, 0 }; if (Arg *A = Args.getLastArg(options::OPT_mlinker_version_EQ)) { bool HadExtra; if (!Driver::GetReleaseVersion(A->getValue(), Version[0], Version[1], Version[2], HadExtra) || HadExtra) D.Diag(diag::err_drv_invalid_version_number) << A->getAsString(Args); } // Newer linkers support -demangle, pass it if supported and not disabled by // the user. if (Version[0] >= 100 && !Args.hasArg(options::OPT_Z_Xlinker__no_demangle)) { // Don't pass -demangle to ld_classic. // // FIXME: This is a temporary workaround, ld should be handling this. bool UsesLdClassic = (getToolChain().getArch() == llvm::Triple::x86 && Args.hasArg(options::OPT_static)); if (getToolChain().getArch() == llvm::Triple::x86) { for (arg_iterator it = Args.filtered_begin(options::OPT_Xlinker, options::OPT_Wl_COMMA), ie = Args.filtered_end(); it != ie; ++it) { const Arg *A = *it; for (unsigned i = 0, e = A->getNumValues(); i != e; ++i) if (StringRef(A->getValue(i)) == "-kext") UsesLdClassic = true; } } if (!UsesLdClassic) CmdArgs.push_back("-demangle"); } // If we are using LTO, then automatically create a temporary file path for // the linker to use, so that it's lifetime will extend past a possible // dsymutil step. if (Version[0] >= 116 && D.IsUsingLTO(Args) && NeedsTempPath(Inputs)) { const char *TmpPath = C.getArgs().MakeArgString( D.GetTemporaryPath("cc", types::getTypeTempSuffix(types::TY_Object))); C.addTempFile(TmpPath); CmdArgs.push_back("-object_path_lto"); CmdArgs.push_back(TmpPath); } // Derived from the "link" spec. Args.AddAllArgs(CmdArgs, options::OPT_static); if (!Args.hasArg(options::OPT_static)) CmdArgs.push_back("-dynamic"); if (Args.hasArg(options::OPT_fgnu_runtime)) { // FIXME: gcc replaces -lobjc in forward args with -lobjc-gnu // here. How do we wish to handle such things? } if (!Args.hasArg(options::OPT_dynamiclib)) { AddDarwinArch(Args, CmdArgs); // FIXME: Why do this only on this path? Args.AddLastArg(CmdArgs, options::OPT_force__cpusubtype__ALL); Args.AddLastArg(CmdArgs, options::OPT_bundle); Args.AddAllArgs(CmdArgs, options::OPT_bundle__loader); Args.AddAllArgs(CmdArgs, options::OPT_client__name); Arg *A; if ((A = Args.getLastArg(options::OPT_compatibility__version)) || (A = Args.getLastArg(options::OPT_current__version)) || (A = Args.getLastArg(options::OPT_install__name))) D.Diag(diag::err_drv_argument_only_allowed_with) << A->getAsString(Args) << "-dynamiclib"; Args.AddLastArg(CmdArgs, options::OPT_force__flat__namespace); Args.AddLastArg(CmdArgs, options::OPT_keep__private__externs); Args.AddLastArg(CmdArgs, options::OPT_private__bundle); } else { CmdArgs.push_back("-dylib"); Arg *A; if ((A = Args.getLastArg(options::OPT_bundle)) || (A = Args.getLastArg(options::OPT_bundle__loader)) || (A = Args.getLastArg(options::OPT_client__name)) || (A = Args.getLastArg(options::OPT_force__flat__namespace)) || (A = Args.getLastArg(options::OPT_keep__private__externs)) || (A = Args.getLastArg(options::OPT_private__bundle))) D.Diag(diag::err_drv_argument_not_allowed_with) << A->getAsString(Args) << "-dynamiclib"; Args.AddAllArgsTranslated(CmdArgs, options::OPT_compatibility__version, "-dylib_compatibility_version"); Args.AddAllArgsTranslated(CmdArgs, options::OPT_current__version, "-dylib_current_version"); AddDarwinArch(Args, CmdArgs); Args.AddAllArgsTranslated(CmdArgs, options::OPT_install__name, "-dylib_install_name"); } Args.AddLastArg(CmdArgs, options::OPT_all__load); Args.AddAllArgs(CmdArgs, options::OPT_allowable__client); Args.AddLastArg(CmdArgs, options::OPT_bind__at__load); if (DarwinTC.isTargetIPhoneOS()) Args.AddLastArg(CmdArgs, options::OPT_arch__errors__fatal); Args.AddLastArg(CmdArgs, options::OPT_dead__strip); Args.AddLastArg(CmdArgs, options::OPT_no__dead__strip__inits__and__terms); Args.AddAllArgs(CmdArgs, options::OPT_dylib__file); Args.AddLastArg(CmdArgs, options::OPT_dynamic); Args.AddAllArgs(CmdArgs, options::OPT_exported__symbols__list); Args.AddLastArg(CmdArgs, options::OPT_flat__namespace); Args.AddAllArgs(CmdArgs, options::OPT_force__load); Args.AddAllArgs(CmdArgs, options::OPT_headerpad__max__install__names); Args.AddAllArgs(CmdArgs, options::OPT_image__base); Args.AddAllArgs(CmdArgs, options::OPT_init); // Add the deployment target. VersionTuple TargetVersion = DarwinTC.getTargetVersion(); // If we had an explicit -mios-simulator-version-min argument, honor that, // otherwise use the traditional deployment targets. We can't just check the // is-sim attribute because existing code follows this path, and the linker // may not handle the argument. // // FIXME: We may be able to remove this, once we can verify no one depends on // it. if (Args.hasArg(options::OPT_mios_simulator_version_min_EQ)) CmdArgs.push_back("-ios_simulator_version_min"); else if (DarwinTC.isTargetIPhoneOS()) CmdArgs.push_back("-iphoneos_version_min"); else CmdArgs.push_back("-macosx_version_min"); CmdArgs.push_back(Args.MakeArgString(TargetVersion.getAsString())); Args.AddLastArg(CmdArgs, options::OPT_nomultidefs); Args.AddLastArg(CmdArgs, options::OPT_multi__module); Args.AddLastArg(CmdArgs, options::OPT_single__module); Args.AddAllArgs(CmdArgs, options::OPT_multiply__defined); Args.AddAllArgs(CmdArgs, options::OPT_multiply__defined__unused); if (const Arg *A = Args.getLastArg(options::OPT_fpie, options::OPT_fPIE, options::OPT_fno_pie, options::OPT_fno_PIE)) { if (A->getOption().matches(options::OPT_fpie) || A->getOption().matches(options::OPT_fPIE)) CmdArgs.push_back("-pie"); else CmdArgs.push_back("-no_pie"); } Args.AddLastArg(CmdArgs, options::OPT_prebind); Args.AddLastArg(CmdArgs, options::OPT_noprebind); Args.AddLastArg(CmdArgs, options::OPT_nofixprebinding); Args.AddLastArg(CmdArgs, options::OPT_prebind__all__twolevel__modules); Args.AddLastArg(CmdArgs, options::OPT_read__only__relocs); Args.AddAllArgs(CmdArgs, options::OPT_sectcreate); Args.AddAllArgs(CmdArgs, options::OPT_sectorder); Args.AddAllArgs(CmdArgs, options::OPT_seg1addr); Args.AddAllArgs(CmdArgs, options::OPT_segprot); Args.AddAllArgs(CmdArgs, options::OPT_segaddr); Args.AddAllArgs(CmdArgs, options::OPT_segs__read__only__addr); Args.AddAllArgs(CmdArgs, options::OPT_segs__read__write__addr); Args.AddAllArgs(CmdArgs, options::OPT_seg__addr__table); Args.AddAllArgs(CmdArgs, options::OPT_seg__addr__table__filename); Args.AddAllArgs(CmdArgs, options::OPT_sub__library); Args.AddAllArgs(CmdArgs, options::OPT_sub__umbrella); // Give --sysroot= preference, over the Apple specific behavior to also use // --isysroot as the syslibroot. StringRef sysroot = C.getSysRoot(); if (sysroot != "") { CmdArgs.push_back("-syslibroot"); CmdArgs.push_back(C.getArgs().MakeArgString(sysroot)); } else if (const Arg *A = Args.getLastArg(options::OPT_isysroot)) { CmdArgs.push_back("-syslibroot"); CmdArgs.push_back(A->getValue()); } Args.AddLastArg(CmdArgs, options::OPT_twolevel__namespace); Args.AddLastArg(CmdArgs, options::OPT_twolevel__namespace__hints); Args.AddAllArgs(CmdArgs, options::OPT_umbrella); Args.AddAllArgs(CmdArgs, options::OPT_undefined); Args.AddAllArgs(CmdArgs, options::OPT_unexported__symbols__list); Args.AddAllArgs(CmdArgs, options::OPT_weak__reference__mismatches); Args.AddLastArg(CmdArgs, options::OPT_X_Flag); Args.AddAllArgs(CmdArgs, options::OPT_y); Args.AddLastArg(CmdArgs, options::OPT_w); Args.AddAllArgs(CmdArgs, options::OPT_pagezero__size); Args.AddAllArgs(CmdArgs, options::OPT_segs__read__); Args.AddLastArg(CmdArgs, options::OPT_seglinkedit); Args.AddLastArg(CmdArgs, options::OPT_noseglinkedit); Args.AddAllArgs(CmdArgs, options::OPT_sectalign); Args.AddAllArgs(CmdArgs, options::OPT_sectobjectsymbols); Args.AddAllArgs(CmdArgs, options::OPT_segcreate); Args.AddLastArg(CmdArgs, options::OPT_whyload); Args.AddLastArg(CmdArgs, options::OPT_whatsloaded); Args.AddAllArgs(CmdArgs, options::OPT_dylinker__install__name); Args.AddLastArg(CmdArgs, options::OPT_dylinker); Args.AddLastArg(CmdArgs, options::OPT_Mach); } void darwin::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { assert(Output.getType() == types::TY_Image && "Invalid linker output type."); // The logic here is derived from gcc's behavior; most of which // comes from specs (starting with link_command). Consult gcc for // more information. ArgStringList CmdArgs; /// Hack(tm) to ignore linking errors when we are doing ARC migration. if (Args.hasArg(options::OPT_ccc_arcmt_check, options::OPT_ccc_arcmt_migrate)) { for (ArgList::const_iterator I = Args.begin(), E = Args.end(); I != E; ++I) (*I)->claim(); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("touch")); CmdArgs.push_back(Output.getFilename()); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); return; } // I'm not sure why this particular decomposition exists in gcc, but // we follow suite for ease of comparison. AddLinkArgs(C, Args, CmdArgs, Inputs); Args.AddAllArgs(CmdArgs, options::OPT_d_Flag); Args.AddAllArgs(CmdArgs, options::OPT_s); Args.AddAllArgs(CmdArgs, options::OPT_t); Args.AddAllArgs(CmdArgs, options::OPT_Z_Flag); Args.AddAllArgs(CmdArgs, options::OPT_u_Group); Args.AddLastArg(CmdArgs, options::OPT_e); Args.AddAllArgs(CmdArgs, options::OPT_m_Separate); Args.AddAllArgs(CmdArgs, options::OPT_r); // Forward -ObjC when either -ObjC or -ObjC++ is used, to force loading // members of static archive libraries which implement Objective-C classes or // categories. if (Args.hasArg(options::OPT_ObjC) || Args.hasArg(options::OPT_ObjCXX)) CmdArgs.push_back("-ObjC"); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { // Derived from startfile spec. if (Args.hasArg(options::OPT_dynamiclib)) { // Derived from darwin_dylib1 spec. if (getDarwinToolChain().isTargetIOSSimulator()) { // The simulator doesn't have a versioned crt1 file. CmdArgs.push_back("-ldylib1.o"); } else if (getDarwinToolChain().isTargetIPhoneOS()) { if (getDarwinToolChain().isIPhoneOSVersionLT(3, 1)) CmdArgs.push_back("-ldylib1.o"); } else { if (getDarwinToolChain().isMacosxVersionLT(10, 5)) CmdArgs.push_back("-ldylib1.o"); else if (getDarwinToolChain().isMacosxVersionLT(10, 6)) CmdArgs.push_back("-ldylib1.10.5.o"); } } else { if (Args.hasArg(options::OPT_bundle)) { if (!Args.hasArg(options::OPT_static)) { // Derived from darwin_bundle1 spec. if (getDarwinToolChain().isTargetIOSSimulator()) { // The simulator doesn't have a versioned crt1 file. CmdArgs.push_back("-lbundle1.o"); } else if (getDarwinToolChain().isTargetIPhoneOS()) { if (getDarwinToolChain().isIPhoneOSVersionLT(3, 1)) CmdArgs.push_back("-lbundle1.o"); } else { if (getDarwinToolChain().isMacosxVersionLT(10, 6)) CmdArgs.push_back("-lbundle1.o"); } } } else { if (Args.hasArg(options::OPT_pg) && getToolChain().SupportsProfiling()) { if (Args.hasArg(options::OPT_static) || Args.hasArg(options::OPT_object) || Args.hasArg(options::OPT_preload)) { CmdArgs.push_back("-lgcrt0.o"); } else { CmdArgs.push_back("-lgcrt1.o"); // darwin_crt2 spec is empty. } // By default on OS X 10.8 and later, we don't link with a crt1.o // file and the linker knows to use _main as the entry point. But, // when compiling with -pg, we need to link with the gcrt1.o file, // so pass the -no_new_main option to tell the linker to use the // "start" symbol as the entry point. if (getDarwinToolChain().isTargetMacOS() && !getDarwinToolChain().isMacosxVersionLT(10, 8)) CmdArgs.push_back("-no_new_main"); } else { if (Args.hasArg(options::OPT_static) || Args.hasArg(options::OPT_object) || Args.hasArg(options::OPT_preload)) { CmdArgs.push_back("-lcrt0.o"); } else { // Derived from darwin_crt1 spec. if (getDarwinToolChain().isTargetIOSSimulator()) { // The simulator doesn't have a versioned crt1 file. CmdArgs.push_back("-lcrt1.o"); } else if (getDarwinToolChain().isTargetIPhoneOS()) { if (getDarwinToolChain().isIPhoneOSVersionLT(3, 1)) CmdArgs.push_back("-lcrt1.o"); else if (getDarwinToolChain().isIPhoneOSVersionLT(6, 0)) CmdArgs.push_back("-lcrt1.3.1.o"); } else { if (getDarwinToolChain().isMacosxVersionLT(10, 5)) CmdArgs.push_back("-lcrt1.o"); else if (getDarwinToolChain().isMacosxVersionLT(10, 6)) CmdArgs.push_back("-lcrt1.10.5.o"); else if (getDarwinToolChain().isMacosxVersionLT(10, 8)) CmdArgs.push_back("-lcrt1.10.6.o"); // darwin_crt2 spec is empty. } } } } } if (!getDarwinToolChain().isTargetIPhoneOS() && Args.hasArg(options::OPT_shared_libgcc) && getDarwinToolChain().isMacosxVersionLT(10, 5)) { const char *Str = Args.MakeArgString(getToolChain().GetFilePath("crt3.o")); CmdArgs.push_back(Str); } } Args.AddAllArgs(CmdArgs, options::OPT_L); SanitizerArgs Sanitize(getToolChain().getDriver(), Args); // If we're building a dynamic lib with -fsanitize=address, or // -fsanitize=undefined, unresolved symbols may appear. Mark all // of them as dynamic_lookup. Linking executables is handled in // lib/Driver/ToolChains.cpp. if (Sanitize.needsAsanRt() || Sanitize.needsUbsanRt()) { if (Args.hasArg(options::OPT_dynamiclib) || Args.hasArg(options::OPT_bundle)) { CmdArgs.push_back("-undefined"); CmdArgs.push_back("dynamic_lookup"); } } if (Args.hasArg(options::OPT_fopenmp)) // This is more complicated in gcc... CmdArgs.push_back("-lgomp"); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs); if (isObjCRuntimeLinked(Args) && !Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { // Avoid linking compatibility stubs on i386 mac. if (!getDarwinToolChain().isTargetMacOS() || getDarwinToolChain().getArch() != llvm::Triple::x86) { // If we don't have ARC or subscripting runtime support, link in the // runtime stubs. We have to do this *before* adding any of the normal // linker inputs so that its initializer gets run first. ObjCRuntime runtime = getDarwinToolChain().getDefaultObjCRuntime(/*nonfragile*/ true); // We use arclite library for both ARC and subscripting support. if ((!runtime.hasNativeARC() && isObjCAutoRefCount(Args)) || !runtime.hasSubscripting()) getDarwinToolChain().AddLinkARCArgs(Args, CmdArgs); } CmdArgs.push_back("-framework"); CmdArgs.push_back("Foundation"); // Link libobj. CmdArgs.push_back("-lobjc"); } if (LinkingOutput) { CmdArgs.push_back("-arch_multiple"); CmdArgs.push_back("-final_output"); CmdArgs.push_back(LinkingOutput); } if (Args.hasArg(options::OPT_fnested_functions)) CmdArgs.push_back("-allow_stack_execute"); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { if (getToolChain().getDriver().CCCIsCXX) getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); // link_ssp spec is empty. // Let the tool chain choose which runtime library to link. getDarwinToolChain().AddLinkRuntimeLibArgs(Args, CmdArgs); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { // endfile_spec is empty. } Args.AddAllArgs(CmdArgs, options::OPT_T_Group); Args.AddAllArgs(CmdArgs, options::OPT_F); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("ld")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void darwin::Lipo::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; CmdArgs.push_back("-create"); assert(Output.isFilename() && "Unexpected lipo output."); CmdArgs.push_back("-output"); CmdArgs.push_back(Output.getFilename()); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; assert(II.isFilename() && "Unexpected lipo input."); CmdArgs.push_back(II.getFilename()); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("lipo")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void darwin::Dsymutil::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); assert(Inputs.size() == 1 && "Unable to handle multiple inputs."); const InputInfo &Input = Inputs[0]; assert(Input.isFilename() && "Unexpected dsymutil input."); CmdArgs.push_back(Input.getFilename()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("dsymutil")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void darwin::VerifyDebug::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; CmdArgs.push_back("--verify"); CmdArgs.push_back("--debug-info"); CmdArgs.push_back("--eh-frame"); CmdArgs.push_back("--quiet"); assert(Inputs.size() == 1 && "Unable to handle multiple inputs."); const InputInfo &Input = Inputs[0]; assert(Input.isFilename() && "Unexpected verify input"); // Grabbing the output of the earlier dsymutil run. CmdArgs.push_back(Input.getFilename()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("dwarfdump")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void solaris::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back(II.getFilename()); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void solaris::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { // FIXME: Find a real GCC, don't hard-code versions here std::string GCCLibPath = "/usr/gcc/4.5/lib/gcc/"; const llvm::Triple &T = getToolChain().getTriple(); std::string LibPath = "/usr/lib/"; llvm::Triple::ArchType Arch = T.getArch(); switch (Arch) { case llvm::Triple::x86: GCCLibPath += ("i386-" + T.getVendorName() + "-" + T.getOSName()).str() + "/4.5.2/"; break; case llvm::Triple::x86_64: GCCLibPath += ("i386-" + T.getVendorName() + "-" + T.getOSName()).str(); GCCLibPath += "/4.5.2/amd64/"; LibPath += "amd64/"; break; default: assert(0 && "Unsupported architecture"); } ArgStringList CmdArgs; // Demangle C++ names in errors CmdArgs.push_back("-C"); if ((!Args.hasArg(options::OPT_nostdlib)) && (!Args.hasArg(options::OPT_shared))) { CmdArgs.push_back("-e"); CmdArgs.push_back("_start"); } if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-Bstatic"); CmdArgs.push_back("-dn"); } else { CmdArgs.push_back("-Bdynamic"); if (Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-shared"); } else { CmdArgs.push_back("--dynamic-linker"); CmdArgs.push_back(Args.MakeArgString(LibPath + "ld.so.1")); } } if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Invalid output."); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) { CmdArgs.push_back(Args.MakeArgString(LibPath + "crt1.o")); CmdArgs.push_back(Args.MakeArgString(LibPath + "crti.o")); CmdArgs.push_back(Args.MakeArgString(LibPath + "values-Xa.o")); CmdArgs.push_back(Args.MakeArgString(GCCLibPath + "crtbegin.o")); } else { CmdArgs.push_back(Args.MakeArgString(LibPath + "crti.o")); CmdArgs.push_back(Args.MakeArgString(LibPath + "values-Xa.o")); CmdArgs.push_back(Args.MakeArgString(GCCLibPath + "crtbegin.o")); } if (getToolChain().getDriver().CCCIsCXX) CmdArgs.push_back(Args.MakeArgString(LibPath + "cxa_finalize.o")); } CmdArgs.push_back(Args.MakeArgString("-L" + GCCLibPath)); Args.AddAllArgs(CmdArgs, options::OPT_L); Args.AddAllArgs(CmdArgs, options::OPT_T_Group); Args.AddAllArgs(CmdArgs, options::OPT_e); Args.AddAllArgs(CmdArgs, options::OPT_r); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { if (getToolChain().getDriver().CCCIsCXX) getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); CmdArgs.push_back("-lgcc_s"); if (!Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-lgcc"); CmdArgs.push_back("-lc"); CmdArgs.push_back("-lm"); } } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { CmdArgs.push_back(Args.MakeArgString(GCCLibPath + "crtend.o")); } CmdArgs.push_back(Args.MakeArgString(LibPath + "crtn.o")); addProfileRT(getToolChain(), Args, CmdArgs, getToolChain().getTriple()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("ld")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void auroraux::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back(II.getFilename()); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("gas")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void auroraux::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; if ((!Args.hasArg(options::OPT_nostdlib)) && (!Args.hasArg(options::OPT_shared))) { CmdArgs.push_back("-e"); CmdArgs.push_back("_start"); } if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-Bstatic"); CmdArgs.push_back("-dn"); } else { // CmdArgs.push_back("--eh-frame-hdr"); CmdArgs.push_back("-Bdynamic"); if (Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-shared"); } else { CmdArgs.push_back("--dynamic-linker"); CmdArgs.push_back("/lib/ld.so.1"); // 64Bit Path /lib/amd64/ld.so.1 } } if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Invalid output."); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) { CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crt1.o"))); CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crti.o"))); CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtbegin.o"))); } else { CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crti.o"))); } CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtn.o"))); } CmdArgs.push_back(Args.MakeArgString("-L/opt/gcc4/lib/gcc/" + getToolChain().getTripleString() + "/4.2.4")); Args.AddAllArgs(CmdArgs, options::OPT_L); Args.AddAllArgs(CmdArgs, options::OPT_T_Group); Args.AddAllArgs(CmdArgs, options::OPT_e); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { // FIXME: For some reason GCC passes -lgcc before adding // the default system libraries. Just mimic this for now. CmdArgs.push_back("-lgcc"); if (Args.hasArg(options::OPT_pthread)) CmdArgs.push_back("-pthread"); if (!Args.hasArg(options::OPT_shared)) CmdArgs.push_back("-lc"); CmdArgs.push_back("-lgcc"); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtend.o"))); } addProfileRT(getToolChain(), Args, CmdArgs, getToolChain().getTriple()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("ld")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void openbsd::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back(II.getFilename()); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void openbsd::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; if ((!Args.hasArg(options::OPT_nostdlib)) && (!Args.hasArg(options::OPT_shared))) { CmdArgs.push_back("-e"); CmdArgs.push_back("__start"); } if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-Bstatic"); } else { if (Args.hasArg(options::OPT_rdynamic)) CmdArgs.push_back("-export-dynamic"); CmdArgs.push_back("--eh-frame-hdr"); CmdArgs.push_back("-Bdynamic"); if (Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-shared"); } else { CmdArgs.push_back("-dynamic-linker"); CmdArgs.push_back("/usr/libexec/ld.so"); } } if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Invalid output."); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) { if (Args.hasArg(options::OPT_pg)) CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("gcrt0.o"))); else CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crt0.o"))); CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtbegin.o"))); } else { CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtbeginS.o"))); } } std::string Triple = getToolChain().getTripleString(); if (Triple.substr(0, 6) == "x86_64") Triple.replace(0, 6, "amd64"); CmdArgs.push_back(Args.MakeArgString("-L/usr/lib/gcc-lib/" + Triple + "/4.2.1")); Args.AddAllArgs(CmdArgs, options::OPT_L); Args.AddAllArgs(CmdArgs, options::OPT_T_Group); Args.AddAllArgs(CmdArgs, options::OPT_e); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { if (D.CCCIsCXX) { getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); if (Args.hasArg(options::OPT_pg)) CmdArgs.push_back("-lm_p"); else CmdArgs.push_back("-lm"); } // FIXME: For some reason GCC passes -lgcc before adding // the default system libraries. Just mimic this for now. CmdArgs.push_back("-lgcc"); if (Args.hasArg(options::OPT_pthread)) { if (!Args.hasArg(options::OPT_shared) && Args.hasArg(options::OPT_pg)) CmdArgs.push_back("-lpthread_p"); else CmdArgs.push_back("-lpthread"); } if (!Args.hasArg(options::OPT_shared)) { if (Args.hasArg(options::OPT_pg)) CmdArgs.push_back("-lc_p"); else CmdArgs.push_back("-lc"); } CmdArgs.push_back("-lgcc"); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtend.o"))); else CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtendS.o"))); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("ld")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void bitrig::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back(II.getFilename()); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void bitrig::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; if ((!Args.hasArg(options::OPT_nostdlib)) && (!Args.hasArg(options::OPT_shared))) { CmdArgs.push_back("-e"); CmdArgs.push_back("__start"); } if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-Bstatic"); } else { if (Args.hasArg(options::OPT_rdynamic)) CmdArgs.push_back("-export-dynamic"); CmdArgs.push_back("--eh-frame-hdr"); CmdArgs.push_back("-Bdynamic"); if (Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-shared"); } else { CmdArgs.push_back("-dynamic-linker"); CmdArgs.push_back("/usr/libexec/ld.so"); } } if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Invalid output."); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) { if (Args.hasArg(options::OPT_pg)) CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("gcrt0.o"))); else CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crt0.o"))); CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtbegin.o"))); } else { CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtbeginS.o"))); } } Args.AddAllArgs(CmdArgs, options::OPT_L); Args.AddAllArgs(CmdArgs, options::OPT_T_Group); Args.AddAllArgs(CmdArgs, options::OPT_e); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { if (D.CCCIsCXX) { getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); if (Args.hasArg(options::OPT_pg)) CmdArgs.push_back("-lm_p"); else CmdArgs.push_back("-lm"); } if (Args.hasArg(options::OPT_pthread)) { if (!Args.hasArg(options::OPT_shared) && Args.hasArg(options::OPT_pg)) CmdArgs.push_back("-lpthread_p"); else CmdArgs.push_back("-lpthread"); } if (!Args.hasArg(options::OPT_shared)) { if (Args.hasArg(options::OPT_pg)) CmdArgs.push_back("-lc_p"); else CmdArgs.push_back("-lc"); } std::string myarch = "-lclang_rt."; const llvm::Triple &T = getToolChain().getTriple(); llvm::Triple::ArchType Arch = T.getArch(); switch (Arch) { case llvm::Triple::arm: myarch += ("arm"); break; case llvm::Triple::x86: myarch += ("i386"); break; case llvm::Triple::x86_64: myarch += ("amd64"); break; default: assert(0 && "Unsupported architecture"); } CmdArgs.push_back(Args.MakeArgString(myarch)); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtend.o"))); else CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtendS.o"))); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("ld")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void freebsd::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; // When building 32-bit code on FreeBSD/amd64, we have to explicitly // instruct as in the base system to assemble 32-bit code. if (getToolChain().getArch() == llvm::Triple::x86) CmdArgs.push_back("--32"); else if (getToolChain().getArch() == llvm::Triple::ppc) CmdArgs.push_back("-a32"); else if (getToolChain().getArch() == llvm::Triple::mips || getToolChain().getArch() == llvm::Triple::mipsel || getToolChain().getArch() == llvm::Triple::mips64 || getToolChain().getArch() == llvm::Triple::mips64el) { StringRef CPUName; StringRef ABIName; getMipsCPUAndABI(Args, getToolChain(), CPUName, ABIName); CmdArgs.push_back("-march"); CmdArgs.push_back(CPUName.data()); // Convert ABI name to the GNU tools acceptable variant. if (ABIName == "o32") ABIName = "32"; else if (ABIName == "n64") ABIName = "64"; CmdArgs.push_back("-mabi"); CmdArgs.push_back(ABIName.data()); if (getToolChain().getArch() == llvm::Triple::mips || getToolChain().getArch() == llvm::Triple::mips64) CmdArgs.push_back("-EB"); else CmdArgs.push_back("-EL"); Arg *LastPICArg = Args.getLastArg(options::OPT_fPIC, options::OPT_fno_PIC, options::OPT_fpic, options::OPT_fno_pic, options::OPT_fPIE, options::OPT_fno_PIE, options::OPT_fpie, options::OPT_fno_pie); if (LastPICArg && (LastPICArg->getOption().matches(options::OPT_fPIC) || LastPICArg->getOption().matches(options::OPT_fpic) || LastPICArg->getOption().matches(options::OPT_fPIE) || LastPICArg->getOption().matches(options::OPT_fpie))) { CmdArgs.push_back("-KPIC"); } } else if (getToolChain().getArch() == llvm::Triple::arm || getToolChain().getArch() == llvm::Triple::thumb) { CmdArgs.push_back("-mfpu=softvfp"); switch(getToolChain().getTriple().getEnvironment()) { case llvm::Triple::GNUEABI: case llvm::Triple::EABI: break; default: CmdArgs.push_back("-matpcs"); } } Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back(II.getFilename()); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void freebsd::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const toolchains::FreeBSD& ToolChain = static_cast(getToolChain()); const Driver &D = ToolChain.getDriver(); ArgStringList CmdArgs; // Silence warning for "clang -g foo.o -o foo" Args.ClaimAllArgs(options::OPT_g_Group); // and "clang -emit-llvm foo.o -o foo" Args.ClaimAllArgs(options::OPT_emit_llvm); // and for "clang -w foo.o -o foo". Other warning options are already // handled somewhere else. Args.ClaimAllArgs(options::OPT_w); if (!D.SysRoot.empty()) CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot)); if (Args.hasArg(options::OPT_pie)) CmdArgs.push_back("-pie"); if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-Bstatic"); } else { if (Args.hasArg(options::OPT_rdynamic)) CmdArgs.push_back("-export-dynamic"); CmdArgs.push_back("--eh-frame-hdr"); if (Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-Bshareable"); } else { CmdArgs.push_back("-dynamic-linker"); CmdArgs.push_back("/libexec/ld-elf.so.1"); } if (ToolChain.getTriple().getOSMajorVersion() >= 9) { llvm::Triple::ArchType Arch = ToolChain.getArch(); if (Arch == llvm::Triple::arm || Arch == llvm::Triple::sparc || Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64) { CmdArgs.push_back("--hash-style=both"); } } CmdArgs.push_back("--enable-new-dtags"); } // When building 32-bit code on FreeBSD/amd64, we have to explicitly // instruct ld in the base system to link 32-bit code. if (ToolChain.getArch() == llvm::Triple::x86) { CmdArgs.push_back("-m"); CmdArgs.push_back("elf_i386_fbsd"); } if (ToolChain.getArch() == llvm::Triple::ppc) { CmdArgs.push_back("-m"); CmdArgs.push_back("elf32ppc_fbsd"); } if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Invalid output."); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { const char *crt1 = NULL; if (!Args.hasArg(options::OPT_shared)) { if (Args.hasArg(options::OPT_pg)) crt1 = "gcrt1.o"; else if (Args.hasArg(options::OPT_pie)) crt1 = "Scrt1.o"; else crt1 = "crt1.o"; } if (crt1) CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crt1))); CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crti.o"))); const char *crtbegin = NULL; if (Args.hasArg(options::OPT_static)) crtbegin = "crtbeginT.o"; else if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_pie)) crtbegin = "crtbeginS.o"; else crtbegin = "crtbegin.o"; CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtbegin))); } Args.AddAllArgs(CmdArgs, options::OPT_L); const ToolChain::path_list Paths = ToolChain.getFilePaths(); for (ToolChain::path_list::const_iterator i = Paths.begin(), e = Paths.end(); i != e; ++i) CmdArgs.push_back(Args.MakeArgString(StringRef("-L") + *i)); Args.AddAllArgs(CmdArgs, options::OPT_T_Group); Args.AddAllArgs(CmdArgs, options::OPT_e); Args.AddAllArgs(CmdArgs, options::OPT_s); Args.AddAllArgs(CmdArgs, options::OPT_t); Args.AddAllArgs(CmdArgs, options::OPT_Z_Flag); Args.AddAllArgs(CmdArgs, options::OPT_r); AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { if (D.CCCIsCXX) { ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs); if (Args.hasArg(options::OPT_pg)) CmdArgs.push_back("-lm_p"); else CmdArgs.push_back("-lm"); } // FIXME: For some reason GCC passes -lgcc and -lgcc_s before adding // the default system libraries. Just mimic this for now. if (Args.hasArg(options::OPT_pg)) CmdArgs.push_back("-lgcc_p"); else CmdArgs.push_back("-lgcc"); if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-lgcc_eh"); } else if (Args.hasArg(options::OPT_pg)) { CmdArgs.push_back("-lgcc_eh_p"); } else { CmdArgs.push_back("--as-needed"); CmdArgs.push_back("-lgcc_s"); CmdArgs.push_back("--no-as-needed"); } if (Args.hasArg(options::OPT_pthread)) { if (Args.hasArg(options::OPT_pg)) CmdArgs.push_back("-lpthread_p"); else CmdArgs.push_back("-lpthread"); } if (Args.hasArg(options::OPT_pg)) { if (Args.hasArg(options::OPT_shared)) CmdArgs.push_back("-lc"); else CmdArgs.push_back("-lc_p"); CmdArgs.push_back("-lgcc_p"); } else { CmdArgs.push_back("-lc"); CmdArgs.push_back("-lgcc"); } if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-lgcc_eh"); } else if (Args.hasArg(options::OPT_pg)) { CmdArgs.push_back("-lgcc_eh_p"); } else { CmdArgs.push_back("--as-needed"); CmdArgs.push_back("-lgcc_s"); CmdArgs.push_back("--no-as-needed"); } } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (Args.hasArg(options::OPT_shared) || Args.hasArg(options::OPT_pie)) CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtendS.o"))); else CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtend.o"))); CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtn.o"))); } addProfileRT(ToolChain, Args, CmdArgs, ToolChain.getTriple()); const char *Exec = Args.MakeArgString(ToolChain.GetProgramPath("ld")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void netbsd::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; // When building 32-bit code on NetBSD/amd64, we have to explicitly // instruct as in the base system to assemble 32-bit code. if (getToolChain().getArch() == llvm::Triple::x86) CmdArgs.push_back("--32"); // Set byte order explicitly if (getToolChain().getArch() == llvm::Triple::mips) CmdArgs.push_back("-EB"); else if (getToolChain().getArch() == llvm::Triple::mipsel) CmdArgs.push_back("-EL"); Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back(II.getFilename()); } const char *Exec = Args.MakeArgString((getToolChain().GetProgramPath("as"))); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void netbsd::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; if (!D.SysRoot.empty()) CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot)); if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-Bstatic"); } else { if (Args.hasArg(options::OPT_rdynamic)) CmdArgs.push_back("-export-dynamic"); CmdArgs.push_back("--eh-frame-hdr"); if (Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-Bshareable"); } else { CmdArgs.push_back("-dynamic-linker"); CmdArgs.push_back("/libexec/ld.elf_so"); } } // When building 32-bit code on NetBSD/amd64, we have to explicitly // instruct ld in the base system to link 32-bit code. if (getToolChain().getArch() == llvm::Triple::x86) { CmdArgs.push_back("-m"); CmdArgs.push_back("elf_i386"); } if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Invalid output."); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) { CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crt0.o"))); CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crti.o"))); CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtbegin.o"))); } else { CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crti.o"))); CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtbeginS.o"))); } } Args.AddAllArgs(CmdArgs, options::OPT_L); Args.AddAllArgs(CmdArgs, options::OPT_T_Group); Args.AddAllArgs(CmdArgs, options::OPT_e); Args.AddAllArgs(CmdArgs, options::OPT_s); Args.AddAllArgs(CmdArgs, options::OPT_t); Args.AddAllArgs(CmdArgs, options::OPT_Z_Flag); Args.AddAllArgs(CmdArgs, options::OPT_r); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { if (D.CCCIsCXX) { getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); CmdArgs.push_back("-lm"); } // FIXME: For some reason GCC passes -lgcc and -lgcc_s before adding // the default system libraries. Just mimic this for now. if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-lgcc_eh"); } else { CmdArgs.push_back("--as-needed"); CmdArgs.push_back("-lgcc_s"); CmdArgs.push_back("--no-as-needed"); } CmdArgs.push_back("-lgcc"); if (Args.hasArg(options::OPT_pthread)) CmdArgs.push_back("-lpthread"); CmdArgs.push_back("-lc"); CmdArgs.push_back("-lgcc"); if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-lgcc_eh"); } else { CmdArgs.push_back("--as-needed"); CmdArgs.push_back("-lgcc_s"); CmdArgs.push_back("--no-as-needed"); } } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath( "crtend.o"))); else CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath( "crtendS.o"))); CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath( "crtn.o"))); } addProfileRT(getToolChain(), Args, CmdArgs, getToolChain().getTriple()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("ld")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void linuxtools::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; // Add --32/--64 to make sure we get the format we want. // This is incomplete if (getToolChain().getArch() == llvm::Triple::x86) { CmdArgs.push_back("--32"); } else if (getToolChain().getArch() == llvm::Triple::x86_64) { CmdArgs.push_back("--64"); } else if (getToolChain().getArch() == llvm::Triple::ppc) { CmdArgs.push_back("-a32"); CmdArgs.push_back("-mppc"); CmdArgs.push_back("-many"); } else if (getToolChain().getArch() == llvm::Triple::ppc64) { CmdArgs.push_back("-a64"); CmdArgs.push_back("-mppc64"); CmdArgs.push_back("-many"); } else if (getToolChain().getArch() == llvm::Triple::arm) { StringRef MArch = getToolChain().getArchName(); if (MArch == "armv7" || MArch == "armv7a" || MArch == "armv7-a") CmdArgs.push_back("-mfpu=neon"); StringRef ARMFloatABI = getARMFloatABI(getToolChain().getDriver(), Args, getToolChain().getTriple()); CmdArgs.push_back(Args.MakeArgString("-mfloat-abi=" + ARMFloatABI)); Args.AddLastArg(CmdArgs, options::OPT_march_EQ); Args.AddLastArg(CmdArgs, options::OPT_mcpu_EQ); Args.AddLastArg(CmdArgs, options::OPT_mfpu_EQ); } else if (getToolChain().getArch() == llvm::Triple::mips || getToolChain().getArch() == llvm::Triple::mipsel || getToolChain().getArch() == llvm::Triple::mips64 || getToolChain().getArch() == llvm::Triple::mips64el) { StringRef CPUName; StringRef ABIName; getMipsCPUAndABI(Args, getToolChain(), CPUName, ABIName); CmdArgs.push_back("-march"); CmdArgs.push_back(CPUName.data()); // Convert ABI name to the GNU tools acceptable variant. if (ABIName == "o32") ABIName = "32"; else if (ABIName == "n64") ABIName = "64"; CmdArgs.push_back("-mabi"); CmdArgs.push_back(ABIName.data()); if (getToolChain().getArch() == llvm::Triple::mips || getToolChain().getArch() == llvm::Triple::mips64) CmdArgs.push_back("-EB"); else CmdArgs.push_back("-EL"); Arg *LastPICArg = Args.getLastArg(options::OPT_fPIC, options::OPT_fno_PIC, options::OPT_fpic, options::OPT_fno_pic, options::OPT_fPIE, options::OPT_fno_PIE, options::OPT_fpie, options::OPT_fno_pie); if (LastPICArg && (LastPICArg->getOption().matches(options::OPT_fPIC) || LastPICArg->getOption().matches(options::OPT_fpic) || LastPICArg->getOption().matches(options::OPT_fPIE) || LastPICArg->getOption().matches(options::OPT_fpie))) { CmdArgs.push_back("-KPIC"); } } Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back(II.getFilename()); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } static void AddLibgcc(llvm::Triple Triple, const Driver &D, ArgStringList &CmdArgs, const ArgList &Args) { bool isAndroid = Triple.getEnvironment() == llvm::Triple::Android; bool StaticLibgcc = isAndroid || Args.hasArg(options::OPT_static) || Args.hasArg(options::OPT_static_libgcc); if (!D.CCCIsCXX) CmdArgs.push_back("-lgcc"); if (StaticLibgcc) { if (D.CCCIsCXX) CmdArgs.push_back("-lgcc"); } else { if (!D.CCCIsCXX) CmdArgs.push_back("--as-needed"); CmdArgs.push_back("-lgcc_s"); if (!D.CCCIsCXX) CmdArgs.push_back("--no-as-needed"); } if (StaticLibgcc && !isAndroid) CmdArgs.push_back("-lgcc_eh"); else if (!Args.hasArg(options::OPT_shared) && D.CCCIsCXX) CmdArgs.push_back("-lgcc"); } static bool hasMipsN32ABIArg(const ArgList &Args) { Arg *A = Args.getLastArg(options::OPT_mabi_EQ); return A && (A->getValue() == StringRef("n32")); } void linuxtools::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const toolchains::Linux& ToolChain = static_cast(getToolChain()); const Driver &D = ToolChain.getDriver(); const bool isAndroid = ToolChain.getTriple().getEnvironment() == llvm::Triple::Android; ArgStringList CmdArgs; // Silence warning for "clang -g foo.o -o foo" Args.ClaimAllArgs(options::OPT_g_Group); // and "clang -emit-llvm foo.o -o foo" Args.ClaimAllArgs(options::OPT_emit_llvm); // and for "clang -w foo.o -o foo". Other warning options are already // handled somewhere else. Args.ClaimAllArgs(options::OPT_w); if (!D.SysRoot.empty()) CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot)); if (Args.hasArg(options::OPT_pie)) CmdArgs.push_back("-pie"); if (Args.hasArg(options::OPT_rdynamic)) CmdArgs.push_back("-export-dynamic"); if (Args.hasArg(options::OPT_s)) CmdArgs.push_back("-s"); for (std::vector::const_iterator i = ToolChain.ExtraOpts.begin(), e = ToolChain.ExtraOpts.end(); i != e; ++i) CmdArgs.push_back(i->c_str()); if (!Args.hasArg(options::OPT_static)) { CmdArgs.push_back("--eh-frame-hdr"); } CmdArgs.push_back("-m"); if (ToolChain.getArch() == llvm::Triple::x86) CmdArgs.push_back("elf_i386"); else if (ToolChain.getArch() == llvm::Triple::arm || ToolChain.getArch() == llvm::Triple::thumb) CmdArgs.push_back("armelf_linux_eabi"); else if (ToolChain.getArch() == llvm::Triple::ppc) CmdArgs.push_back("elf32ppclinux"); else if (ToolChain.getArch() == llvm::Triple::ppc64) CmdArgs.push_back("elf64ppc"); else if (ToolChain.getArch() == llvm::Triple::mips) CmdArgs.push_back("elf32btsmip"); else if (ToolChain.getArch() == llvm::Triple::mipsel) CmdArgs.push_back("elf32ltsmip"); else if (ToolChain.getArch() == llvm::Triple::mips64) { if (hasMipsN32ABIArg(Args)) CmdArgs.push_back("elf32btsmipn32"); else CmdArgs.push_back("elf64btsmip"); } else if (ToolChain.getArch() == llvm::Triple::mips64el) { if (hasMipsN32ABIArg(Args)) CmdArgs.push_back("elf32ltsmipn32"); else CmdArgs.push_back("elf64ltsmip"); } else CmdArgs.push_back("elf_x86_64"); if (Args.hasArg(options::OPT_static)) { if (ToolChain.getArch() == llvm::Triple::arm || ToolChain.getArch() == llvm::Triple::thumb) CmdArgs.push_back("-Bstatic"); else CmdArgs.push_back("-static"); } else if (Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-shared"); if (isAndroid) { CmdArgs.push_back("-Bsymbolic"); } } if (ToolChain.getArch() == llvm::Triple::arm || ToolChain.getArch() == llvm::Triple::thumb || (!Args.hasArg(options::OPT_static) && !Args.hasArg(options::OPT_shared))) { CmdArgs.push_back("-dynamic-linker"); if (isAndroid) CmdArgs.push_back("/system/bin/linker"); else if (ToolChain.getArch() == llvm::Triple::x86) CmdArgs.push_back("/lib/ld-linux.so.2"); else if (ToolChain.getArch() == llvm::Triple::arm || ToolChain.getArch() == llvm::Triple::thumb) { if (ToolChain.getTriple().getEnvironment() == llvm::Triple::GNUEABIHF) CmdArgs.push_back("/lib/ld-linux-armhf.so.3"); else CmdArgs.push_back("/lib/ld-linux.so.3"); } else if (ToolChain.getArch() == llvm::Triple::mips || ToolChain.getArch() == llvm::Triple::mipsel) CmdArgs.push_back("/lib/ld.so.1"); else if (ToolChain.getArch() == llvm::Triple::mips64 || ToolChain.getArch() == llvm::Triple::mips64el) { if (hasMipsN32ABIArg(Args)) CmdArgs.push_back("/lib32/ld.so.1"); else CmdArgs.push_back("/lib64/ld.so.1"); } else if (ToolChain.getArch() == llvm::Triple::ppc) CmdArgs.push_back("/lib/ld.so.1"); else if (ToolChain.getArch() == llvm::Triple::ppc64) CmdArgs.push_back("/lib64/ld64.so.1"); else CmdArgs.push_back("/lib64/ld-linux-x86-64.so.2"); } CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!isAndroid) { const char *crt1 = NULL; if (!Args.hasArg(options::OPT_shared)){ if (Args.hasArg(options::OPT_pie)) crt1 = "Scrt1.o"; else crt1 = "crt1.o"; } if (crt1) CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crt1))); CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crti.o"))); } const char *crtbegin; if (Args.hasArg(options::OPT_static)) crtbegin = isAndroid ? "crtbegin_static.o" : "crtbeginT.o"; else if (Args.hasArg(options::OPT_shared)) crtbegin = isAndroid ? "crtbegin_so.o" : "crtbeginS.o"; else if (Args.hasArg(options::OPT_pie)) crtbegin = isAndroid ? "crtbegin_dynamic.o" : "crtbeginS.o"; else crtbegin = isAndroid ? "crtbegin_dynamic.o" : "crtbegin.o"; CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtbegin))); // Add crtfastmath.o if available and fast math is enabled. ToolChain.AddFastMathRuntimeIfAvailable(Args, CmdArgs); } Args.AddAllArgs(CmdArgs, options::OPT_L); const ToolChain::path_list Paths = ToolChain.getFilePaths(); for (ToolChain::path_list::const_iterator i = Paths.begin(), e = Paths.end(); i != e; ++i) CmdArgs.push_back(Args.MakeArgString(StringRef("-L") + *i)); // Tell the linker to load the plugin. This has to come before AddLinkerInputs // as gold requires -plugin to come before any -plugin-opt that -Wl might // forward. if (D.IsUsingLTO(Args) || Args.hasArg(options::OPT_use_gold_plugin)) { CmdArgs.push_back("-plugin"); std::string Plugin = ToolChain.getDriver().Dir + "/../lib/LLVMgold.so"; CmdArgs.push_back(Args.MakeArgString(Plugin)); + + // Try to pass driver level flags relevant to LTO code generation down to + // the plugin. + + // Handle architecture-specific flags for selecting CPU variants. + if (ToolChain.getArch() == llvm::Triple::x86 || + ToolChain.getArch() == llvm::Triple::x86_64) + CmdArgs.push_back( + Args.MakeArgString(Twine("-plugin-opt=mcpu=") + + getX86TargetCPU(Args, ToolChain.getTriple()))); + else if (ToolChain.getArch() == llvm::Triple::arm || + ToolChain.getArch() == llvm::Triple::thumb) + CmdArgs.push_back( + Args.MakeArgString(Twine("-plugin-opt=mcpu=") + + getARMTargetCPU(Args, ToolChain.getTriple()))); + + // FIXME: Factor out logic for MIPS, PPC, and other targets to support this + // as well. } + if (Args.hasArg(options::OPT_Z_Xlinker__no_demangle)) CmdArgs.push_back("--no-demangle"); AddLinkerInputs(ToolChain, Inputs, Args, CmdArgs); SanitizerArgs Sanitize(D, Args); // Call this before we add the C++ ABI library. if (Sanitize.needsUbsanRt()) addUbsanRTLinux(getToolChain(), Args, CmdArgs); if (D.CCCIsCXX && !Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) && !Args.hasArg(options::OPT_static); if (OnlyLibstdcxxStatic) CmdArgs.push_back("-Bstatic"); ToolChain.AddCXXStdlibLibArgs(Args, CmdArgs); if (OnlyLibstdcxxStatic) CmdArgs.push_back("-Bdynamic"); CmdArgs.push_back("-lm"); } // Call this before we add the C run-time. if (Sanitize.needsAsanRt()) addAsanRTLinux(getToolChain(), Args, CmdArgs); if (Sanitize.needsTsanRt()) addTsanRTLinux(getToolChain(), Args, CmdArgs); if (!Args.hasArg(options::OPT_nostdlib)) { if (!Args.hasArg(options::OPT_nodefaultlibs)) { if (Args.hasArg(options::OPT_static)) CmdArgs.push_back("--start-group"); AddLibgcc(ToolChain.getTriple(), D, CmdArgs, Args); if (Args.hasArg(options::OPT_pthread) || Args.hasArg(options::OPT_pthreads)) CmdArgs.push_back("-lpthread"); CmdArgs.push_back("-lc"); if (Args.hasArg(options::OPT_static)) CmdArgs.push_back("--end-group"); else AddLibgcc(ToolChain.getTriple(), D, CmdArgs, Args); } if (!Args.hasArg(options::OPT_nostartfiles)) { const char *crtend; if (Args.hasArg(options::OPT_shared)) crtend = isAndroid ? "crtend_so.o" : "crtendS.o"; else if (Args.hasArg(options::OPT_pie)) crtend = isAndroid ? "crtend_android.o" : "crtendS.o"; else crtend = isAndroid ? "crtend_android.o" : "crtend.o"; CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath(crtend))); if (!isAndroid) CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtn.o"))); } } addProfileRT(getToolChain(), Args, CmdArgs, getToolChain().getTriple()); C.addCommand(new Command(JA, *this, ToolChain.Linker.c_str(), CmdArgs)); } void minix::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back(II.getFilename()); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void minix::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Invalid output."); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath("crt1.o"))); CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath("crti.o"))); CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath("crtbegin.o"))); CmdArgs.push_back(Args.MakeArgString(getToolChain().GetFilePath("crtn.o"))); } Args.AddAllArgs(CmdArgs, options::OPT_L); Args.AddAllArgs(CmdArgs, options::OPT_T_Group); Args.AddAllArgs(CmdArgs, options::OPT_e); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs); addProfileRT(getToolChain(), Args, CmdArgs, getToolChain().getTriple()); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { if (D.CCCIsCXX) { getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); CmdArgs.push_back("-lm"); } } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (Args.hasArg(options::OPT_pthread)) CmdArgs.push_back("-lpthread"); CmdArgs.push_back("-lc"); CmdArgs.push_back("-lCompilerRT-Generic"); CmdArgs.push_back("-L/usr/pkg/compiler-rt/lib"); CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath("crtend.o"))); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("ld")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } /// DragonFly Tools // For now, DragonFly Assemble does just about the same as for // FreeBSD, but this may change soon. void dragonfly::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; // When building 32-bit code on DragonFly/pc64, we have to explicitly // instruct as in the base system to assemble 32-bit code. if (getToolChain().getArch() == llvm::Triple::x86) CmdArgs.push_back("--32"); Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { const InputInfo &II = *it; CmdArgs.push_back(II.getFilename()); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void dragonfly::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { const Driver &D = getToolChain().getDriver(); ArgStringList CmdArgs; if (!D.SysRoot.empty()) CmdArgs.push_back(Args.MakeArgString("--sysroot=" + D.SysRoot)); if (Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-Bstatic"); } else { if (Args.hasArg(options::OPT_shared)) CmdArgs.push_back("-Bshareable"); else { CmdArgs.push_back("-dynamic-linker"); CmdArgs.push_back("/usr/libexec/ld-elf.so.2"); } } // When building 32-bit code on DragonFly/pc64, we have to explicitly // instruct ld in the base system to link 32-bit code. if (getToolChain().getArch() == llvm::Triple::x86) { CmdArgs.push_back("-m"); CmdArgs.push_back("elf_i386"); } if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); } else { assert(Output.isNothing() && "Invalid output."); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) { CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath("crt1.o"))); CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath("crti.o"))); CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath("crtbegin.o"))); } else { CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath("crti.o"))); CmdArgs.push_back( Args.MakeArgString(getToolChain().GetFilePath("crtbeginS.o"))); } } Args.AddAllArgs(CmdArgs, options::OPT_L); Args.AddAllArgs(CmdArgs, options::OPT_T_Group); Args.AddAllArgs(CmdArgs, options::OPT_e); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs); if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nodefaultlibs)) { // FIXME: GCC passes on -lgcc, -lgcc_pic and a whole lot of // rpaths CmdArgs.push_back("-L/usr/lib/gcc41"); if (!Args.hasArg(options::OPT_static)) { CmdArgs.push_back("-rpath"); CmdArgs.push_back("/usr/lib/gcc41"); CmdArgs.push_back("-rpath-link"); CmdArgs.push_back("/usr/lib/gcc41"); CmdArgs.push_back("-rpath"); CmdArgs.push_back("/usr/lib"); CmdArgs.push_back("-rpath-link"); CmdArgs.push_back("/usr/lib"); } if (D.CCCIsCXX) { getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs); CmdArgs.push_back("-lm"); } if (Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-lgcc_pic"); } else { CmdArgs.push_back("-lgcc"); } if (Args.hasArg(options::OPT_pthread)) CmdArgs.push_back("-lpthread"); if (!Args.hasArg(options::OPT_nolibc)) { CmdArgs.push_back("-lc"); } if (Args.hasArg(options::OPT_shared)) { CmdArgs.push_back("-lgcc_pic"); } else { CmdArgs.push_back("-lgcc"); } } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { if (!Args.hasArg(options::OPT_shared)) CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtend.o"))); else CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtendS.o"))); CmdArgs.push_back(Args.MakeArgString( getToolChain().GetFilePath("crtn.o"))); } addProfileRT(getToolChain(), Args, CmdArgs, getToolChain().getTriple()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("ld")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } void visualstudio::Link::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, const char *LinkingOutput) const { ArgStringList CmdArgs; if (Output.isFilename()) { CmdArgs.push_back(Args.MakeArgString(std::string("-out:") + Output.getFilename())); } else { assert(Output.isNothing() && "Invalid output."); } if (!Args.hasArg(options::OPT_nostdlib) && !Args.hasArg(options::OPT_nostartfiles)) { CmdArgs.push_back("-defaultlib:libcmt"); } CmdArgs.push_back("-nologo"); Args.AddAllArgValues(CmdArgs, options::OPT_l); // Add filenames immediately. for (InputInfoList::const_iterator it = Inputs.begin(), ie = Inputs.end(); it != ie; ++it) { if (it->isFilename()) CmdArgs.push_back(it->getFilename()); } const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("link.exe")); C.addCommand(new Command(JA, *this, Exec, CmdArgs)); } Index: user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.h =================================================================== --- user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.h (revision 247191) +++ user/attilio/vmobj-rwlock/contrib/llvm/tools/clang/lib/Driver/Tools.h (revision 247192) @@ -1,646 +1,647 @@ //===--- Tools.h - Tool Implementations -------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #ifndef CLANG_LIB_DRIVER_TOOLS_H_ #define CLANG_LIB_DRIVER_TOOLS_H_ #include "clang/Driver/Tool.h" #include "clang/Driver/Types.h" #include "clang/Driver/Util.h" #include "llvm/ADT/Triple.h" #include "llvm/Support/Compiler.h" namespace clang { class ObjCRuntime; namespace driver { class Driver; namespace toolchains { class Darwin; } namespace tools { /// \brief Clang compiler tool. class LLVM_LIBRARY_VISIBILITY Clang : public Tool { void AddPreprocessingOptions(Compilation &C, const Driver &D, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, const InputInfoList &Inputs) const; void AddARMTargetArgs(const ArgList &Args, ArgStringList &CmdArgs, bool KernelOrKext) const; void AddMIPSTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const; void AddPPCTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const; void AddSparcTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const; void AddX86TargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const; void AddHexagonTargetArgs (const ArgList &Args, ArgStringList &CmdArgs) const; enum RewriteKind { RK_None, RK_Fragile, RK_NonFragile }; ObjCRuntime AddObjCRuntimeArgs(const ArgList &args, ArgStringList &cmdArgs, RewriteKind rewrite) const; public: Clang(const ToolChain &TC) : Tool("clang", "clang frontend", TC) {} virtual bool hasGoodDiagnostics() const { return true; } virtual bool hasIntegratedAssembler() const { return true; } virtual bool hasIntegratedCPP() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; /// \brief Clang integrated assembler tool. class LLVM_LIBRARY_VISIBILITY ClangAs : public Tool { void AddARMTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const; + void AddX86TargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const; public: ClangAs(const ToolChain &TC) : Tool("clang::as", "clang integrated assembler", TC) {} virtual bool hasGoodDiagnostics() const { return true; } virtual bool hasIntegratedAssembler() const { return false; } virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; /// gcc - Generic GCC tool implementations. namespace gcc { class LLVM_LIBRARY_VISIBILITY Common : public Tool { public: Common(const char *Name, const char *ShortName, const ToolChain &TC) : Tool(Name, ShortName, TC) {} virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; /// RenderExtraToolArgs - Render any arguments necessary to force /// the particular tool mode. virtual void RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const = 0; }; class LLVM_LIBRARY_VISIBILITY Preprocess : public Common { public: Preprocess(const ToolChain &TC) : Common("gcc::Preprocess", "gcc preprocessor", TC) {} virtual bool hasGoodDiagnostics() const { return true; } virtual bool hasIntegratedCPP() const { return false; } virtual void RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const; }; class LLVM_LIBRARY_VISIBILITY Precompile : public Common { public: Precompile(const ToolChain &TC) : Common("gcc::Precompile", "gcc precompile", TC) {} virtual bool hasGoodDiagnostics() const { return true; } virtual bool hasIntegratedCPP() const { return true; } virtual void RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const; }; class LLVM_LIBRARY_VISIBILITY Compile : public Common { public: Compile(const ToolChain &TC) : Common("gcc::Compile", "gcc frontend", TC) {} virtual bool hasGoodDiagnostics() const { return true; } virtual bool hasIntegratedCPP() const { return true; } virtual void RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const; }; class LLVM_LIBRARY_VISIBILITY Assemble : public Common { public: Assemble(const ToolChain &TC) : Common("gcc::Assemble", "assembler (via gcc)", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Common { public: Link(const ToolChain &TC) : Common("gcc::Link", "linker (via gcc)", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const; }; } // end namespace gcc namespace hexagon { // For Hexagon, we do not need to instantiate tools for PreProcess, PreCompile and Compile. // We simply use "clang -cc1" for those actions. class LLVM_LIBRARY_VISIBILITY Assemble : public Tool { public: Assemble(const ToolChain &TC) : Tool("hexagon::Assemble", "hexagon-as", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const; virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("hexagon::Link", "hexagon-ld", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void RenderExtraToolArgs(const JobAction &JA, ArgStringList &CmdArgs) const; virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } // end namespace hexagon. namespace darwin { llvm::Triple::ArchType getArchTypeForDarwinArchName(StringRef Str); class LLVM_LIBRARY_VISIBILITY DarwinTool : public Tool { virtual void anchor(); protected: void AddDarwinArch(const ArgList &Args, ArgStringList &CmdArgs) const; const toolchains::Darwin &getDarwinToolChain() const { return reinterpret_cast(getToolChain()); } public: DarwinTool(const char *Name, const char *ShortName, const ToolChain &TC) : Tool(Name, ShortName, TC) {} }; class LLVM_LIBRARY_VISIBILITY CC1 : public DarwinTool { virtual void anchor(); public: static const char *getBaseInputName(const ArgList &Args, const InputInfoList &Input); static const char *getBaseInputStem(const ArgList &Args, const InputInfoList &Input); static const char *getDependencyFileName(const ArgList &Args, const InputInfoList &Inputs); protected: const char *getCC1Name(types::ID Type) const; void AddCC1Args(const ArgList &Args, ArgStringList &CmdArgs) const; void RemoveCC1UnsupportedArgs(ArgStringList &CmdArgs) const; void AddCC1OptionsArgs(const ArgList &Args, ArgStringList &CmdArgs, const InputInfoList &Inputs, const ArgStringList &OutputArgs) const; void AddCPPOptionsArgs(const ArgList &Args, ArgStringList &CmdArgs, const InputInfoList &Inputs, const ArgStringList &OutputArgs) const; void AddCPPUniqueOptionsArgs(const ArgList &Args, ArgStringList &CmdArgs, const InputInfoList &Inputs) const; void AddCPPArgs(const ArgList &Args, ArgStringList &CmdArgs) const; public: CC1(const char *Name, const char *ShortName, const ToolChain &TC) : DarwinTool(Name, ShortName, TC) {} virtual bool hasGoodDiagnostics() const { return true; } virtual bool hasIntegratedCPP() const { return true; } }; class LLVM_LIBRARY_VISIBILITY Preprocess : public CC1 { public: Preprocess(const ToolChain &TC) : CC1("darwin::Preprocess", "gcc preprocessor", TC) {} virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Compile : public CC1 { public: Compile(const ToolChain &TC) : CC1("darwin::Compile", "gcc frontend", TC) {} virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Assemble : public DarwinTool { public: Assemble(const ToolChain &TC) : DarwinTool("darwin::Assemble", "assembler", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public DarwinTool { bool NeedsTempPath(const InputInfoList &Inputs) const; void AddLinkArgs(Compilation &C, const ArgList &Args, ArgStringList &CmdArgs, const InputInfoList &Inputs) const; public: Link(const ToolChain &TC) : DarwinTool("darwin::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Lipo : public DarwinTool { public: Lipo(const ToolChain &TC) : DarwinTool("darwin::Lipo", "lipo", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Dsymutil : public DarwinTool { public: Dsymutil(const ToolChain &TC) : DarwinTool("darwin::Dsymutil", "dsymutil", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY VerifyDebug : public DarwinTool { public: VerifyDebug(const ToolChain &TC) : DarwinTool("darwin::VerifyDebug", "dwarfdump", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } /// openbsd -- Directly call GNU Binutils assembler and linker namespace openbsd { class LLVM_LIBRARY_VISIBILITY Assemble : public Tool { public: Assemble(const ToolChain &TC) : Tool("openbsd::Assemble", "assembler", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("openbsd::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } // end namespace openbsd /// bitrig -- Directly call GNU Binutils assembler and linker namespace bitrig { class LLVM_LIBRARY_VISIBILITY Assemble : public Tool { public: Assemble(const ToolChain &TC) : Tool("bitrig::Assemble", "assembler", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("bitrig::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } // end namespace bitrig /// freebsd -- Directly call GNU Binutils assembler and linker namespace freebsd { class LLVM_LIBRARY_VISIBILITY Assemble : public Tool { public: Assemble(const ToolChain &TC) : Tool("freebsd::Assemble", "assembler", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("freebsd::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } // end namespace freebsd /// netbsd -- Directly call GNU Binutils assembler and linker namespace netbsd { class LLVM_LIBRARY_VISIBILITY Assemble : public Tool { public: Assemble(const ToolChain &TC) : Tool("netbsd::Assemble", "assembler", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("netbsd::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } // end namespace netbsd /// linux -- Directly call GNU Binutils assembler and linker namespace linuxtools { class LLVM_LIBRARY_VISIBILITY Assemble : public Tool { public: Assemble(const ToolChain &TC) : Tool("linux::Assemble", "assembler", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("linux::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } /// minix -- Directly call GNU Binutils assembler and linker namespace minix { class LLVM_LIBRARY_VISIBILITY Assemble : public Tool { public: Assemble(const ToolChain &TC) : Tool("minix::Assemble", "assembler", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("minix::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } // end namespace minix /// solaris -- Directly call Solaris assembler and linker namespace solaris { class LLVM_LIBRARY_VISIBILITY Assemble : public Tool { public: Assemble(const ToolChain &TC) : Tool("solaris::Assemble", "assembler", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("solaris::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } // end namespace solaris /// auroraux -- Directly call GNU Binutils assembler and linker namespace auroraux { class LLVM_LIBRARY_VISIBILITY Assemble : public Tool { public: Assemble(const ToolChain &TC) : Tool("auroraux::Assemble", "assembler", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("auroraux::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } // end namespace auroraux /// dragonfly -- Directly call GNU Binutils assembler and linker namespace dragonfly { class LLVM_LIBRARY_VISIBILITY Assemble : public Tool { public: Assemble(const ToolChain &TC) : Tool("dragonfly::Assemble", "assembler", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("dragonfly::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } // end namespace dragonfly /// Visual studio tools. namespace visualstudio { class LLVM_LIBRARY_VISIBILITY Link : public Tool { public: Link(const ToolChain &TC) : Tool("visualstudio::Link", "linker", TC) {} virtual bool hasIntegratedCPP() const { return false; } virtual bool isLinkJob() const { return true; } virtual void ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &TCArgs, const char *LinkingOutput) const; }; } // end namespace visualstudio } // end namespace toolchains } // end namespace driver } // end namespace clang #endif // CLANG_LIB_DRIVER_TOOLS_H_ Index: user/attilio/vmobj-rwlock/contrib/llvm/tools/clang =================================================================== --- user/attilio/vmobj-rwlock/contrib/llvm/tools/clang (revision 247191) +++ user/attilio/vmobj-rwlock/contrib/llvm/tools/clang (revision 247192) Property changes on: user/attilio/vmobj-rwlock/contrib/llvm/tools/clang ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/llvm/tools/clang:r247016-247191 Index: user/attilio/vmobj-rwlock/contrib/llvm =================================================================== --- user/attilio/vmobj-rwlock/contrib/llvm (revision 247191) +++ user/attilio/vmobj-rwlock/contrib/llvm (revision 247192) Property changes on: user/attilio/vmobj-rwlock/contrib/llvm ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/contrib/llvm:r247016-247191 Index: user/attilio/vmobj-rwlock/etc/Makefile =================================================================== --- user/attilio/vmobj-rwlock/etc/Makefile (revision 247191) +++ user/attilio/vmobj-rwlock/etc/Makefile (revision 247192) @@ -1,390 +1,390 @@ # from: @(#)Makefile 5.11 (Berkeley) 5/21/91 # $FreeBSD$ .include .if ${MK_SENDMAIL} != "no" SUBDIR= sendmail .endif BIN1= crontab \ devd.conf \ devfs.conf \ ddb.conf \ dhclient.conf \ disktab \ fbtab \ ftpusers \ gettytab \ group \ hosts \ hosts.allow \ hosts.equiv \ inetd.conf \ libalias.conf \ login.access \ login.conf \ mac.conf \ motd \ netconfig \ network.subr \ networks \ newsyslog.conf \ nsswitch.conf \ phones \ profile \ protocols \ rc \ rc.bsdextended \ rc.firewall \ rc.initdiskless \ rc.sendmail \ rc.shutdown \ rc.subr \ remote \ rpc \ services \ shells \ sysctl.conf \ syslog.conf \ termcap.small .if exists(${.CURDIR}/etc.${MACHINE}/ttys) BIN1+= etc.${MACHINE}/ttys .elif exists(${.CURDIR}/etc.${MACHINE_ARCH}/ttys) BIN1+= etc.${MACHINE_ARCH}/ttys .elif exists(${.CURDIR}/etc.${MACHINE_CPUARCH}/ttys) BIN1+= etc.${MACHINE_CPUARCH}/ttys .else .error etc.MACHINE/ttys missing .endif OPENBSMDIR= ${.CURDIR}/../contrib/openbsm BSM_ETC_OPEN_FILES= ${OPENBSMDIR}/etc/audit_class \ ${OPENBSMDIR}/etc/audit_event BSM_ETC_RESTRICTED_FILES= ${OPENBSMDIR}/etc/audit_control \ ${OPENBSMDIR}/etc/audit_user BSM_ETC_EXEC_FILES= ${OPENBSMDIR}/etc/audit_warn BSM_ETC_DIR= ${DESTDIR}/etc/security # NB: keep these sorted by MK_* knobs .if ${MK_AMD} != "no" BIN1+= amd.map .endif .if ${MK_APM} != "no" BIN1+= apmd.conf .endif .if ${MK_BSNMP} != "no" BIN1+= snmpd.config .endif .if ${MK_FREEBSD_UPDATE} != "no" BIN1+= freebsd-update.conf .endif .if ${MK_LOCATE} != "no" BIN1+= ${.CURDIR}/../usr.bin/locate/locate/locate.rc .endif .if ${MK_LPR} != "no" BIN1+= hosts.lpd printcap .endif .if ${MK_MAIL} != "no" BIN1+= ${.CURDIR}/../usr.bin/mail/misc/mail.rc .endif .if ${MK_NTP} != "no" BIN1+= ntp.conf .endif .if ${MK_OPENSSH} != "no" SSH= ${.CURDIR}/../crypto/openssh/ssh_config \ ${.CURDIR}/../crypto/openssh/sshd_config \ ${.CURDIR}/../crypto/openssh/moduli .endif .if ${MK_OPENSSL} != "no" SSL= ${.CURDIR}/../crypto/openssl/apps/openssl.cnf .endif .if ${MK_NS_CACHING} != "no" BIN1+= nscd.conf .endif .if ${MK_PORTSNAP} != "no" BIN1+= portsnap.conf .endif .if ${MK_PF} != "no" BIN1+= pf.os .endif .if ${MK_TCSH} != "no" BIN1+= csh.cshrc csh.login csh.logout .endif .if ${MK_WIRELESS} != "no" BIN1+= regdomain.xml .endif # -rwxr-xr-x root:wheel, for the new cron root:wheel BIN2= netstart pccard_ether rc.suspend rc.resume MTREE= BSD.include.dist BSD.root.dist BSD.usr.dist BSD.var.dist .if ${MK_SENDMAIL} != "no" MTREE+= BSD.sendmail.dist .endif .if ${MK_BIND} != "no" MTREE+= BIND.chroot.dist .if ${MK_BIND_LIBS} != "no" MTREE+= BIND.include.dist .endif .endif PPPCNF= ppp.conf .if ${MK_SENDMAIL} == "no" ETCMAIL=mailer.conf aliases .else ETCMAIL=Makefile README mailer.conf access.sample virtusertable.sample \ mailertable.sample aliases .endif # Special top level files for FreeBSD FREEBSD=COPYRIGHT # Sanitize DESTDIR DESTDIR:= ${DESTDIR:C://*:/:g} afterinstall: .if ${MK_MAN} != "no" ${_+_}cd ${.CURDIR}/../share/man; ${MAKE} makedb .endif distribute: ${_+_}cd ${.CURDIR} ; ${MAKE} install DESTDIR=${DISTDIR}/${DISTRIBUTION} ${_+_}cd ${.CURDIR} ; ${MAKE} distribution DESTDIR=${DISTDIR}/${DISTRIBUTION} .include .if ${TARGET_ENDIANNESS} == "1234" CAP_MKDB_ENDIAN?= -l PWD_MKDB_ENDIAN?= -L .elif ${TARGET_ENDIANNESS} == "4321" CAP_MKDB_ENDIAN?= -b PWD_MKDB_ENDIAN?= -B .else CAP_MKDB_ENDIAN?= PWD_MKDB_ENDIAN?= .endif .if defined(NO_ROOT) METALOG.add?= cat -l >> ${METALOG} .endif distribution: .if !defined(DESTDIR) @echo "set DESTDIR before running \"make ${.TARGET}\"" @false .endif cd ${.CURDIR}; \ ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \ ${BIN1} ${DESTDIR}/etc; \ cap_mkdb ${CAP_MKDB_ENDIAN} ${DESTDIR}/etc/login.conf; \ ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 755 \ ${BIN2} ${DESTDIR}/etc; \ ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 600 \ master.passwd nsmb.conf opieaccess ${DESTDIR}/etc; .if ${MK_AT} == "no" sed -i "" -e 's;.*/usr/libexec/atrun;#&;' ${DESTDIR}/etc/crontab .endif .if ${MK_TCSH} == "no" sed -i "" -e 's;/bin/csh;/bin/sh;' ${DESTDIR}/etc/master.passwd .endif pwd_mkdb ${PWD_MKDB_ENDIAN} -i -p -d ${DESTDIR}/etc \ ${DESTDIR}/etc/master.passwd .if defined(NO_ROOT) ( \ echo "./etc/login.conf.db type=file mode=0644 uname=root gname=wheel"; \ echo "./etc/passwd type=file mode=0644 uname=root gname=wheel"; \ echo "./etc/pwd.db type=file mode=0644 uname=root gname=wheel"; \ echo "./etc/spwd.db type=file mode=0600 uname=root gname=wheel"; \ ) | ${METALOG.add} .endif .if ${MK_ATF} != "no" ${_+_}cd ${.CURDIR}/atf; ${MAKE} install .endif .if ${MK_BLUETOOTH} != "no" ${_+_}cd ${.CURDIR}/bluetooth; ${MAKE} install .endif ${_+_}cd ${.CURDIR}/defaults; ${MAKE} install ${_+_}cd ${.CURDIR}/devd; ${MAKE} install ${_+_}cd ${.CURDIR}/gss; ${MAKE} install ${_+_}cd ${.CURDIR}/periodic; ${MAKE} install ${_+_}cd ${.CURDIR}/rc.d; ${MAKE} install ${_+_}cd ${.CURDIR}/../gnu/usr.bin/send-pr; ${MAKE} etc-gnats-freefall ${_+_}cd ${.CURDIR}/../share/termcap; ${MAKE} etc-termcap ${_+_}cd ${.CURDIR}/../usr.sbin/rmt; ${MAKE} etc-rmt ${_+_}cd ${.CURDIR}/pam.d; ${MAKE} install cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 0444 \ ${BSM_ETC_OPEN_FILES} ${BSM_ETC_DIR} cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 0600 \ ${BSM_ETC_RESTRICTED_FILES} ${BSM_ETC_DIR} cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 0500 \ ${BSM_ETC_EXEC_FILES} ${BSM_ETC_DIR} .if ${MK_BIND_MTREE} != "no" if [ ! -e ${DESTDIR}/etc/namedb ]; then \ ln -s ../var/named/etc/namedb ${DESTDIR}/etc/namedb; \ fi .endif .if ${MK_BIND_ETC} != "no" ${_+_}cd ${.CURDIR}/namedb; ${MAKE} install .endif .if ${MK_SENDMAIL} != "no" ${_+_}cd ${.CURDIR}/sendmail; ${MAKE} distribution .endif .if ${MK_OPENSSH} != "no" cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \ ${SSH} ${DESTDIR}/etc/ssh .endif .if ${MK_OPENSSL} != "no" cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \ ${SSL} ${DESTDIR}/etc/ssl .endif .if ${MK_KERBEROS} != "no" cd ${.CURDIR}/root; \ ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \ dot.k5login ${DESTDIR}/root/.k5login; .endif cd ${.CURDIR}/root; \ ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \ dot.profile ${DESTDIR}/root/.profile; \ rm -f ${DESTDIR}/.profile; \ ln ${DESTDIR}/root/.profile ${DESTDIR}/.profile .if ${MK_TCSH} != "no" cd ${.CURDIR}/root; \ ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \ dot.cshrc ${DESTDIR}/root/.cshrc; \ ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \ dot.login ${DESTDIR}/root/.login; \ rm -f ${DESTDIR}/.cshrc; \ ln ${DESTDIR}/root/.cshrc ${DESTDIR}/.cshrc .endif cd ${.CURDIR}/mtree; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 444 \ ${MTREE} ${DESTDIR}/etc/mtree .if ${MK_PPP} != "no" cd ${.CURDIR}/ppp; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 600 \ ${PPPCNF} ${DESTDIR}/etc/ppp .endif .if ${MK_MAIL} != "no" cd ${.CURDIR}/mail; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 \ ${ETCMAIL} ${DESTDIR}/etc/mail if [ -d ${DESTDIR}/etc/mail -a -f ${DESTDIR}/etc/mail/aliases -a \ ! -f ${DESTDIR}/etc/aliases ]; then \ ln -s mail/aliases ${DESTDIR}/etc/aliases; \ fi .endif ${INSTALL} -o ${BINOWN} -g operator -m 664 /dev/null \ ${DESTDIR}/etc/dumpdates ${INSTALL} -o nobody -g ${BINGRP} -m 644 /dev/null \ ${DESTDIR}/var/db/locate.database ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 644 ${.CURDIR}/minfree \ ${DESTDIR}/var/crash cd ${.CURDIR}/..; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 444 \ ${FREEBSD} ${DESTDIR}/ .if ${MK_BOOT} != "no" .if exists(${.CURDIR}/../sys/${MACHINE}/conf/GENERIC.hints) ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 444 \ ${.CURDIR}/../sys/${MACHINE}/conf/GENERIC.hints \ ${DESTDIR}/boot/device.hints .endif .endif .if ${MK_NIS} == "no" sed -i "" -e 's/.*_compat:/# &/' -e 's/compat$$/files/' \ ${DESTDIR}/etc/nsswitch.conf .endif MTREE_CMD?= mtree MTREES= mtree/BSD.root.dist / \ mtree/BSD.var.dist /var \ mtree/BSD.usr.dist /usr \ mtree/BSD.include.dist /usr/include .if ${MK_BIND_LIBS} != "no" MTREES+= mtree/BIND.include.dist /usr/include .endif .if ${MK_BIND_MTREE} != "no" MTREES+= mtree/BIND.chroot.dist /var/named .endif .if ${MK_GROFF} != "no" MTREES+= mtree/BSD.groff.dist /usr .endif .if ${MK_SENDMAIL} != "no" MTREES+= mtree/BSD.sendmail.dist / .endif .for mtree in ${LOCAL_MTREE} MTREES+= ../${mtree} / .endfor distrib-dirs: @set ${MTREES}; \ while test $$# -ge 2; do \ m=${.CURDIR}/$$1; \ shift; \ d=${DESTDIR}$$1; \ shift; \ ${ECHO} ${MTREE_CMD} -deU ${MTREE_FOLLOWS_SYMLINKS} \ -f $$m -p $$d; \ ${MTREE_CMD} -deU ${MTREE_FOLLOWS_SYMLINKS} -f $$m -p $$d; \ done; true .if defined(NO_ROOT) @set ${MTREES}; \ while test $$# -ge 2; do \ m=${.CURDIR}/$$1; \ shift; \ d=$$1; \ test "$$d" == "/" && d=""; \ d=${DISTBASE}$$d; \ shift; \ ${ECHO} "${MTREE_CMD:N-W} -C -f $$m -K uname,gname | " \ "sed s#^\.#.$$d# | ${METALOG.add}" ; \ ${MTREE_CMD:N-W} -C -f $$m -K uname,gname | sed s#^\.#.$$d# | \ ${METALOG.add} ; \ done; true .endif - ${INSTALL_SYMLINK} usr/src/sys ${DESTDIR}/ + ${INSTALL_SYMLINK} usr/src/sys ${DESTDIR}/sys cd ${DESTDIR}/usr/share/man; \ for mandir in man*; do \ ${INSTALL_SYMLINK} ../$$mandir \ ${DESTDIR}/usr/share/man/en.ISO8859-1/; \ ${INSTALL_SYMLINK} ../$$mandir \ ${DESTDIR}/usr/share/man/en.UTF-8/; \ done cd ${DESTDIR}/usr/share/openssl/man; \ for mandir in man*; do \ ${INSTALL_SYMLINK} ../$$mandir \ ${DESTDIR}/usr/share/openssl/man/en.ISO8859-1/; \ done set - `grep "^[a-zA-Z]" ${.CURDIR}/man.alias`; \ while [ $$# -gt 0 ] ; do \ ${INSTALL_SYMLINK} "$$2" "${DESTDIR}/usr/share/man/$$1"; \ ${INSTALL_SYMLINK} "$$2" \ "${DESTDIR}/usr/share/openssl/man/$$1"; \ shift; shift; \ done set - `grep "^[a-zA-Z]" ${.CURDIR}/nls.alias`; \ while [ $$# -gt 0 ] ; do \ ${INSTALL_SYMLINK} "$$2" "${DESTDIR}/usr/share/nls/$$1"; \ shift; shift; \ done etc-examples: cd ${.CURDIR}; ${INSTALL} -o ${BINOWN} -g ${BINGRP} -m 444 \ ${BIN1} ${BIN2} nsmb.conf opieaccess \ ${DESTDIR}/usr/share/examples/etc ${_+_}cd ${.CURDIR}/defaults; ${MAKE} install \ DESTDIR=${DESTDIR}/usr/share/examples .include Index: user/attilio/vmobj-rwlock/lib/libc/stdlib/bsearch.3 =================================================================== --- user/attilio/vmobj-rwlock/lib/libc/stdlib/bsearch.3 (revision 247191) +++ user/attilio/vmobj-rwlock/lib/libc/stdlib/bsearch.3 (revision 247192) @@ -1,89 +1,95 @@ .\" Copyright (c) 1990, 1991, 1993, 1994 .\" The Regents of the University of California. All rights reserved. .\" .\" This code is derived from software contributed to Berkeley by .\" the American National Standards Committee X3, on Information .\" Processing Systems. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 4. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)bsearch.3 8.3 (Berkeley) 4/19/94 .\" $FreeBSD$ .\" -.Dd April 19, 1994 +.Dd February 22, 2013 .Dt BSEARCH 3 .Os .Sh NAME .Nm bsearch .Nd binary search of a sorted table .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In stdlib.h .Ft void * .Fn bsearch "const void *key" "const void *base" "size_t nmemb" "size_t size" "int (*compar) (const void *, const void *)" .Sh DESCRIPTION The .Fn bsearch function searches an array of .Fa nmemb objects, the initial member of which is pointed to by .Fa base , for a member that matches the object pointed to by .Fa key . The size of each member of the array is specified by .Fa size . .Pp The contents of the array should be in ascending sorted order according to the comparison function referenced by .Fa compar . The .Fa compar routine is expected to have two arguments which point to the .Fa key object and to an array member, in that order, and should return an integer less than, equal to, or greater than zero if the .Fa key object is found, respectively, to be less than, to match, or be greater than the array member. +See the +.Fa int_compare +sample function in +.Xr qsort 3 +for a comparison function that is also compatible with +.Fn bsearch . .Sh RETURN VALUES The .Fn bsearch function returns a pointer to a matching member of the array, or a null pointer if no match is found. If two members compare as equal, which member is matched is unspecified. .Sh SEE ALSO .Xr db 3 , .Xr lsearch 3 , .Xr qsort 3 .\" .Xr tsearch 3 .Sh STANDARDS The .Fn bsearch function conforms to .St -isoC . Index: user/attilio/vmobj-rwlock/lib/libc =================================================================== --- user/attilio/vmobj-rwlock/lib/libc (revision 247191) +++ user/attilio/vmobj-rwlock/lib/libc (revision 247192) Property changes on: user/attilio/vmobj-rwlock/lib/libc ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/lib/libc:r247097-247191 Index: user/attilio/vmobj-rwlock/share/man/man9/VFS_SET.9 =================================================================== --- user/attilio/vmobj-rwlock/share/man/man9/VFS_SET.9 (revision 247191) +++ user/attilio/vmobj-rwlock/share/man/man9/VFS_SET.9 (revision 247192) @@ -1,108 +1,111 @@ .\" .\" Copyright (C) 2001 Chad David . All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice(s), this list of conditions and the following disclaimer as .\" the first lines of this file unmodified other than the possible .\" addition of one or more copyright notices. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice(s), this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED .\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE .\" DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY .\" DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES .\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR .\" SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER .\" CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH .\" DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd January 28, 2013 +.Dd February 21, 2013 .Dt VFS_SET 9 .Os .Sh NAME .Nm VFS_SET .Nd set up loadable file system .Vt vfsconf .Sh SYNOPSIS .In sys/param.h .In sys/kernel.h .In sys/module.h .In sys/mount.h .Ft void .Fn VFS_SET "struct vfsops *vfsops" "fsname" "int flags" .Sh DESCRIPTION .Fn VFS_SET creates a .Vt vfsconf structure for the loadable module with the given .Fa vfsops , fsname and .Fa flags , and declares it by calling .Xr DECLARE_MODULE 9 using .Fn vfs_modevent as the event handler. .Pp Possible values for the .Fa flags argument are: .Bl -hang -width ".Dv VFCF_DELEGADMIN" .It Dv VFCF_STATIC File system should be statically available in the kernel. .It Dv VFCF_NETWORK Network exportable file system. .It Dv VFCF_READONLY Does not support write operations. .It Dv VFCF_SYNTHETIC Pseudo file system, data does not represent on-disk files. .It Dv VFCF_LOOPBACK Loopback file system layer. .It Dv VFCF_UNICODE File names are stored as Unicode. .It Dv VFCF_JAIL Can be mounted from within a jail if .Va security.jail.mount_allowed sysctl is set to .Dv 1 . .It Dv VFCF_DELEGADMIN Supports delegated administration if .Va vfs.usermount sysctl is set to .Dv 1 . +.It Dv VFCF_SBDRY +When in VFS method, the thread suspension is deferred to the user +boundary upon arrival of stop action. .El .Sh PSEUDOCODE .Bd -literal /* * Fill in the fields for which we have special methods. * The others are initially null. This tells vfs to change them to * pointers to vfs_std* functions during file system registration. */ static struct vfsops myfs_vfsops = { .vfs_mount = myfs_mount, .vfs_root = myfs_root, .vfs_statfs = myfs_statfs, .vfs_unmount = myfs_unmount, }; VFS_SET(myfs_vfsops, myfs, 0); .Ed .Sh SEE ALSO .Xr jail 2 , .Xr jail 8 , .Xr DECLARE_MODULE 9 , .Xr vfsconf 9 , .Xr vfs_modevent 9 .Sh AUTHORS This manual page was written by .An Chad David Aq davidc@acns.ab.ca . Index: user/attilio/vmobj-rwlock/sys/cam/ata/ata_pmp.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cam/ata/ata_pmp.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cam/ata/ata_pmp.c (revision 247192) @@ -1,785 +1,787 @@ /*- * Copyright (c) 2009 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #ifndef _KERNEL #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #ifdef _KERNEL typedef enum { PMP_STATE_NORMAL, PMP_STATE_PORTS, PMP_STATE_PRECONFIG, PMP_STATE_RESET, PMP_STATE_CONNECT, PMP_STATE_CHECK, PMP_STATE_CLEAR, PMP_STATE_CONFIG, PMP_STATE_SCAN } pmp_state; typedef enum { PMP_FLAG_SCTX_INIT = 0x200 } pmp_flags; typedef enum { PMP_CCB_PROBE = 0x01, } pmp_ccb_state; /* Offsets into our private area for storing information */ #define ccb_state ppriv_field0 #define ccb_bp ppriv_ptr1 struct pmp_softc { SLIST_ENTRY(pmp_softc) links; pmp_state state; pmp_flags flags; uint32_t pm_pid; uint32_t pm_prv; int pm_ports; int pm_step; int pm_try; int found; int reset; int frozen; int restart; int events; #define PMP_EV_RESET 1 #define PMP_EV_RESCAN 2 u_int caps; struct task sysctl_task; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; }; static periph_init_t pmpinit; static void pmpasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); static void pmpsysctlinit(void *context, int pending); static periph_ctor_t pmpregister; static periph_dtor_t pmpcleanup; static periph_start_t pmpstart; static periph_oninv_t pmponinvalidate; static void pmpdone(struct cam_periph *periph, union ccb *done_ccb); #ifndef PMP_DEFAULT_TIMEOUT #define PMP_DEFAULT_TIMEOUT 30 /* Timeout in seconds */ #endif #ifndef PMP_DEFAULT_RETRY #define PMP_DEFAULT_RETRY 1 #endif #ifndef PMP_DEFAULT_HIDE_SPECIAL #define PMP_DEFAULT_HIDE_SPECIAL 1 #endif static int pmp_retry_count = PMP_DEFAULT_RETRY; static int pmp_default_timeout = PMP_DEFAULT_TIMEOUT; static int pmp_hide_special = PMP_DEFAULT_HIDE_SPECIAL; static SYSCTL_NODE(_kern_cam, OID_AUTO, pmp, CTLFLAG_RD, 0, "CAM Direct Access Disk driver"); SYSCTL_INT(_kern_cam_pmp, OID_AUTO, retry_count, CTLFLAG_RW, &pmp_retry_count, 0, "Normal I/O retry count"); TUNABLE_INT("kern.cam.pmp.retry_count", &pmp_retry_count); SYSCTL_INT(_kern_cam_pmp, OID_AUTO, default_timeout, CTLFLAG_RW, &pmp_default_timeout, 0, "Normal I/O timeout (in seconds)"); TUNABLE_INT("kern.cam.pmp.default_timeout", &pmp_default_timeout); SYSCTL_INT(_kern_cam_pmp, OID_AUTO, hide_special, CTLFLAG_RW, &pmp_hide_special, 0, "Hide extra ports"); TUNABLE_INT("kern.cam.pmp.hide_special", &pmp_hide_special); static struct periph_driver pmpdriver = { pmpinit, "pmp", TAILQ_HEAD_INITIALIZER(pmpdriver.units), /* generation */ 0, CAM_PERIPH_DRV_EARLY }; PERIPHDRIVER_DECLARE(pmp, pmpdriver); static MALLOC_DEFINE(M_ATPMP, "ata_pmp", "ata_pmp buffers"); static void pmpinit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". */ status = xpt_register_async(AC_FOUND_DEVICE, pmpasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("pmp: Failed to attach master async callback " "due to status 0x%x!\n", status); } } static void pmpfreeze(struct cam_periph *periph, int mask) { struct pmp_softc *softc = (struct pmp_softc *)periph->softc; struct cam_path *dpath; int i; mask &= ~softc->frozen; for (i = 0; i < 15; i++) { if ((mask & (1 << i)) == 0) continue; if (xpt_create_path(&dpath, periph, xpt_path_path_id(periph->path), i, 0) == CAM_REQ_CMP) { softc->frozen |= (1 << i); xpt_acquire_device(dpath->device); cam_freeze_devq_arg(dpath, RELSIM_RELEASE_RUNLEVEL, CAM_RL_BUS + 1); xpt_free_path(dpath); } } } static void pmprelease(struct cam_periph *periph, int mask) { struct pmp_softc *softc = (struct pmp_softc *)periph->softc; struct cam_path *dpath; int i; mask &= softc->frozen; for (i = 0; i < 15; i++) { if ((mask & (1 << i)) == 0) continue; if (xpt_create_path(&dpath, periph, xpt_path_path_id(periph->path), i, 0) == CAM_REQ_CMP) { softc->frozen &= ~(1 << i); cam_release_devq(dpath, RELSIM_RELEASE_RUNLEVEL, 0, CAM_RL_BUS + 1, FALSE); xpt_release_device(dpath->device); xpt_free_path(dpath); } } } static void pmponinvalidate(struct cam_periph *periph) { struct cam_path *dpath; int i; /* * De-register any async callbacks. */ xpt_register_async(0, pmpasync, periph, periph->path); for (i = 0; i < 15; i++) { if (xpt_create_path(&dpath, periph, xpt_path_path_id(periph->path), i, 0) == CAM_REQ_CMP) { xpt_async(AC_LOST_DEVICE, dpath, NULL); xpt_free_path(dpath); } } pmprelease(periph, -1); xpt_print(periph->path, "lost device\n"); } static void pmpcleanup(struct cam_periph *periph) { struct pmp_softc *softc; softc = (struct pmp_softc *)periph->softc; xpt_print(periph->path, "removing device entry\n"); cam_periph_unlock(periph); /* * If we can't free the sysctl tree, oh well... */ if ((softc->flags & PMP_FLAG_SCTX_INIT) != 0 && sysctl_ctx_free(&softc->sysctl_ctx) != 0) { xpt_print(periph->path, "can't remove sysctl context\n"); } free(softc, M_DEVBUF); cam_periph_lock(periph); } static void pmpasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg) { struct cam_periph *periph; struct pmp_softc *softc; periph = (struct cam_periph *)callback_arg; switch (code) { case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cam_status status; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) break; if (cgd->protocol != PROTO_SATAPM) break; /* * Allocate a peripheral instance for * this device and start the probe * process. */ status = cam_periph_alloc(pmpregister, pmponinvalidate, pmpcleanup, pmpstart, "pmp", CAM_PERIPH_BIO, cgd->ccb_h.path, pmpasync, AC_FOUND_DEVICE, cgd); if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) printf("pmpasync: Unable to attach to new device " "due to status 0x%x\n", status); break; } case AC_SCSI_AEN: case AC_SENT_BDR: case AC_BUS_RESET: softc = (struct pmp_softc *)periph->softc; cam_periph_async(periph, code, path, arg); if (code == AC_SCSI_AEN) softc->events |= PMP_EV_RESCAN; else softc->events |= PMP_EV_RESET; if (code == AC_SCSI_AEN && softc->state != PMP_STATE_NORMAL) break; xpt_hold_boot(); pmpfreeze(periph, softc->found); if (code == AC_SENT_BDR || code == AC_BUS_RESET) softc->found = 0; /* We have to reset everything. */ if (softc->state == PMP_STATE_NORMAL) { softc->state = PMP_STATE_PRECONFIG; cam_periph_acquire(periph); xpt_schedule(periph, CAM_PRIORITY_DEV); } else softc->restart = 1; break; default: cam_periph_async(periph, code, path, arg); break; } } static void pmpsysctlinit(void *context, int pending) { struct cam_periph *periph; struct pmp_softc *softc; char tmpstr[80], tmpstr2[80]; periph = (struct cam_periph *)context; if (cam_periph_acquire(periph) != CAM_REQ_CMP) return; softc = (struct pmp_softc *)periph->softc; snprintf(tmpstr, sizeof(tmpstr), "CAM PMP unit %d", periph->unit_number); snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number); sysctl_ctx_init(&softc->sysctl_ctx); softc->flags |= PMP_FLAG_SCTX_INIT; softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam_pmp), OID_AUTO, tmpstr2, CTLFLAG_RD, 0, tmpstr); if (softc->sysctl_tree == NULL) { printf("pmpsysctlinit: unable to allocate sysctl tree\n"); cam_periph_release(periph); return; } cam_periph_release(periph); } static cam_status pmpregister(struct cam_periph *periph, void *arg) { struct pmp_softc *softc; struct ccb_getdev *cgd; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) { printf("pmpregister: no getdev CCB, can't register device\n"); return(CAM_REQ_CMP_ERR); } softc = (struct pmp_softc *)malloc(sizeof(*softc), M_DEVBUF, M_NOWAIT|M_ZERO); if (softc == NULL) { printf("pmpregister: Unable to probe new device. " "Unable to allocate softc\n"); return(CAM_REQ_CMP_ERR); } periph->softc = softc; softc->pm_pid = ((uint32_t *)&cgd->ident_data)[0]; softc->pm_prv = ((uint32_t *)&cgd->ident_data)[1]; TASK_INIT(&softc->sysctl_task, 0, pmpsysctlinit, periph); xpt_announce_periph(periph, NULL); /* * Add async callbacks for bus reset and * bus device reset calls. I don't bother * checking if this fails as, in most cases, * the system will function just fine without * them and the only alternative would be to * not attach the device on failure. */ xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE | AC_SCSI_AEN, pmpasync, periph, periph->path); /* * Take an exclusive refcount on the periph while pmpstart is called * to finish the probe. The reference will be dropped in pmpdone at * the end of probe. */ (void)cam_periph_acquire(periph); xpt_hold_boot(); softc->state = PMP_STATE_PORTS; softc->events = PMP_EV_RESCAN; xpt_schedule(periph, CAM_PRIORITY_DEV); return(CAM_REQ_CMP); } static void pmpstart(struct cam_periph *periph, union ccb *start_ccb) { struct ccb_trans_settings cts; struct ccb_ataio *ataio; struct pmp_softc *softc; struct cam_path *dpath; int revision = 0; softc = (struct pmp_softc *)periph->softc; ataio = &start_ccb->ataio; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("pmpstart\n")); if (softc->restart) { softc->restart = 0; softc->state = min(softc->state, PMP_STATE_PRECONFIG); } /* Fetch user wanted device speed. */ if (softc->state == PMP_STATE_RESET || softc->state == PMP_STATE_CONNECT) { if (xpt_create_path(&dpath, periph, xpt_path_path_id(periph->path), softc->pm_step, 0) == CAM_REQ_CMP) { bzero(&cts, sizeof(cts)); xpt_setup_ccb(&cts.ccb_h, dpath, CAM_PRIORITY_NONE); cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS; cts.type = CTS_TYPE_USER_SETTINGS; xpt_action((union ccb *)&cts); if (cts.xport_specific.sata.valid & CTS_SATA_VALID_REVISION) revision = cts.xport_specific.sata.revision; xpt_free_path(dpath); } } switch (softc->state) { case PMP_STATE_PORTS: cam_fill_ataio(ataio, pmp_retry_count, pmpdone, /*flags*/CAM_DIR_NONE, 0, /*data_ptr*/NULL, /*dxfer_len*/0, pmp_default_timeout * 1000); ata_pm_read_cmd(ataio, 2, 15); break; case PMP_STATE_PRECONFIG: /* Get/update host SATA capabilities. */ bzero(&cts, sizeof(cts)); xpt_setup_ccb(&cts.ccb_h, periph->path, CAM_PRIORITY_NONE); cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS; cts.type = CTS_TYPE_CURRENT_SETTINGS; xpt_action((union ccb *)&cts); if (cts.xport_specific.sata.valid & CTS_SATA_VALID_CAPS) softc->caps = cts.xport_specific.sata.caps; cam_fill_ataio(ataio, pmp_retry_count, pmpdone, /*flags*/CAM_DIR_NONE, 0, /*data_ptr*/NULL, /*dxfer_len*/0, pmp_default_timeout * 1000); ata_pm_write_cmd(ataio, 0x60, 15, 0x0); break; case PMP_STATE_RESET: cam_fill_ataio(ataio, pmp_retry_count, pmpdone, /*flags*/CAM_DIR_NONE, 0, /*data_ptr*/NULL, /*dxfer_len*/0, pmp_default_timeout * 1000); ata_pm_write_cmd(ataio, 2, softc->pm_step, (revision << 4) | ((softc->found & (1 << softc->pm_step)) ? 0 : 1)); break; case PMP_STATE_CONNECT: cam_fill_ataio(ataio, pmp_retry_count, pmpdone, /*flags*/CAM_DIR_NONE, 0, /*data_ptr*/NULL, /*dxfer_len*/0, pmp_default_timeout * 1000); ata_pm_write_cmd(ataio, 2, softc->pm_step, (revision << 4)); break; case PMP_STATE_CHECK: cam_fill_ataio(ataio, pmp_retry_count, pmpdone, /*flags*/CAM_DIR_NONE, 0, /*data_ptr*/NULL, /*dxfer_len*/0, pmp_default_timeout * 1000); ata_pm_read_cmd(ataio, 0, softc->pm_step); break; case PMP_STATE_CLEAR: softc->reset = 0; cam_fill_ataio(ataio, pmp_retry_count, pmpdone, /*flags*/CAM_DIR_NONE, 0, /*data_ptr*/NULL, /*dxfer_len*/0, pmp_default_timeout * 1000); ata_pm_write_cmd(ataio, 1, softc->pm_step, 0xFFFFFFFF); break; case PMP_STATE_CONFIG: cam_fill_ataio(ataio, pmp_retry_count, pmpdone, /*flags*/CAM_DIR_NONE, 0, /*data_ptr*/NULL, /*dxfer_len*/0, pmp_default_timeout * 1000); ata_pm_write_cmd(ataio, 0x60, 15, 0x07 | ((softc->caps & CTS_SATA_CAPS_H_AN) ? 0x08 : 0)); break; default: break; } xpt_action(start_ccb); } static void pmpdone(struct cam_periph *periph, union ccb *done_ccb) { struct ccb_trans_settings cts; struct pmp_softc *softc; struct ccb_ataio *ataio; struct cam_path *dpath; u_int32_t priority, res; int i; softc = (struct pmp_softc *)periph->softc; ataio = &done_ccb->ataio; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("pmpdone\n")); priority = done_ccb->ccb_h.pinfo.priority; if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (cam_periph_error(done_ccb, 0, 0, NULL) == ERESTART) { return; } else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } goto done; } if (softc->restart) { softc->restart = 0; xpt_release_ccb(done_ccb); softc->state = min(softc->state, PMP_STATE_PRECONFIG); xpt_schedule(periph, priority); return; } switch (softc->state) { case PMP_STATE_PORTS: softc->pm_ports = (ataio->res.lba_high << 24) + (ataio->res.lba_mid << 16) + (ataio->res.lba_low << 8) + ataio->res.sector_count; if (pmp_hide_special) { /* * This PMP declares 6 ports, while only 5 of them * are real. Port 5 is a SEMB port, probing which * causes timeouts if external SEP is not connected * to PMP over I2C. */ - if (softc->pm_pid == 0x37261095 && softc->pm_ports == 6) + if ((softc->pm_pid == 0x37261095 || + softc->pm_pid == 0x38261095) && + softc->pm_ports == 6) softc->pm_ports = 5; /* * This PMP declares 7 ports, while only 5 of them * are real. Port 5 is a fake "Config Disk" with * 640 sectors size. Port 6 is a SEMB port. */ if (softc->pm_pid == 0x47261095 && softc->pm_ports == 7) softc->pm_ports = 5; /* * These PMPs have extra configuration port. */ if (softc->pm_pid == 0x57231095 || softc->pm_pid == 0x57331095 || softc->pm_pid == 0x57341095 || softc->pm_pid == 0x57441095) softc->pm_ports--; } printf("%s%d: %d fan-out ports\n", periph->periph_name, periph->unit_number, softc->pm_ports); softc->state = PMP_STATE_PRECONFIG; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; case PMP_STATE_PRECONFIG: softc->pm_step = 0; softc->state = PMP_STATE_RESET; softc->reset |= ~softc->found; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; case PMP_STATE_RESET: softc->pm_step++; if (softc->pm_step >= softc->pm_ports) { softc->pm_step = 0; cam_freeze_devq(periph->path); cam_release_devq(periph->path, RELSIM_RELEASE_AFTER_TIMEOUT, /*reduction*/0, /*timeout*/5, /*getcount_only*/0); softc->state = PMP_STATE_CONNECT; } xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; case PMP_STATE_CONNECT: softc->pm_step++; if (softc->pm_step >= softc->pm_ports) { softc->pm_step = 0; softc->pm_try = 0; cam_freeze_devq(periph->path); cam_release_devq(periph->path, RELSIM_RELEASE_AFTER_TIMEOUT, /*reduction*/0, /*timeout*/10, /*getcount_only*/0); softc->state = PMP_STATE_CHECK; } xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; case PMP_STATE_CHECK: res = (ataio->res.lba_high << 24) + (ataio->res.lba_mid << 16) + (ataio->res.lba_low << 8) + ataio->res.sector_count; if (((res & 0xf0f) == 0x103 && (res & 0x0f0) != 0) || (res & 0x600) != 0) { if (bootverbose) { printf("%s%d: port %d status: %08x\n", periph->periph_name, periph->unit_number, softc->pm_step, res); } /* Report device speed if it is online. */ if ((res & 0xf0f) == 0x103 && xpt_create_path(&dpath, periph, xpt_path_path_id(periph->path), softc->pm_step, 0) == CAM_REQ_CMP) { bzero(&cts, sizeof(cts)); xpt_setup_ccb(&cts.ccb_h, dpath, CAM_PRIORITY_NONE); cts.ccb_h.func_code = XPT_SET_TRAN_SETTINGS; cts.type = CTS_TYPE_CURRENT_SETTINGS; cts.xport_specific.sata.revision = (res & 0x0f0) >> 4; cts.xport_specific.sata.valid = CTS_SATA_VALID_REVISION; cts.xport_specific.sata.caps = softc->caps & (CTS_SATA_CAPS_H_PMREQ | CTS_SATA_CAPS_H_DMAAA | CTS_SATA_CAPS_H_AN); cts.xport_specific.sata.valid |= CTS_SATA_VALID_CAPS; xpt_action((union ccb *)&cts); xpt_free_path(dpath); } softc->found |= (1 << softc->pm_step); softc->pm_step++; } else { if (softc->pm_try < 10) { cam_freeze_devq(periph->path); cam_release_devq(periph->path, RELSIM_RELEASE_AFTER_TIMEOUT, /*reduction*/0, /*timeout*/10, /*getcount_only*/0); softc->pm_try++; } else { if (bootverbose) { printf("%s%d: port %d status: %08x\n", periph->periph_name, periph->unit_number, softc->pm_step, res); } softc->found &= ~(1 << softc->pm_step); if (xpt_create_path(&dpath, periph, done_ccb->ccb_h.path_id, softc->pm_step, 0) == CAM_REQ_CMP) { xpt_async(AC_LOST_DEVICE, dpath, NULL); xpt_free_path(dpath); } softc->pm_step++; } } if (softc->pm_step >= softc->pm_ports) { if (softc->reset & softc->found) { cam_freeze_devq(periph->path); cam_release_devq(periph->path, RELSIM_RELEASE_AFTER_TIMEOUT, /*reduction*/0, /*timeout*/1000, /*getcount_only*/0); } softc->state = PMP_STATE_CLEAR; softc->pm_step = 0; } xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; case PMP_STATE_CLEAR: softc->pm_step++; if (softc->pm_step >= softc->pm_ports) { softc->state = PMP_STATE_CONFIG; softc->pm_step = 0; } xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; case PMP_STATE_CONFIG: for (i = 0; i < softc->pm_ports; i++) { union ccb *ccb; if ((softc->found & (1 << i)) == 0) continue; if (xpt_create_path(&dpath, periph, xpt_path_path_id(periph->path), i, 0) != CAM_REQ_CMP) { printf("pmpdone: xpt_create_path failed\n"); continue; } /* If we did hard reset to this device, inform XPT. */ if ((softc->reset & softc->found & (1 << i)) != 0) xpt_async(AC_SENT_BDR, dpath, NULL); /* If rescan requested, scan this device. */ if (softc->events & PMP_EV_RESCAN) { ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { xpt_free_path(dpath); goto done; } xpt_setup_ccb(&ccb->ccb_h, dpath, CAM_PRIORITY_XPT); xpt_rescan(ccb); } else xpt_free_path(dpath); } break; default: break; } done: xpt_release_ccb(done_ccb); softc->state = PMP_STATE_NORMAL; softc->events = 0; xpt_release_boot(); pmprelease(periph, -1); cam_periph_release_locked(periph); } #endif /* _KERNEL */ Index: user/attilio/vmobj-rwlock/sys/cam/scsi/scsi_da.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cam/scsi/scsi_da.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cam/scsi/scsi_da.c (revision 247192) @@ -1,2915 +1,2919 @@ /*- * Implementation of SCSI Direct Access Peripheral driver for CAM. * * Copyright (c) 1997 Justin T. Gibbs. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #ifndef _KERNEL #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #ifndef _KERNEL #include #endif /* !_KERNEL */ #ifdef _KERNEL typedef enum { DA_STATE_PROBE, DA_STATE_PROBE2, DA_STATE_NORMAL } da_state; typedef enum { DA_FLAG_PACK_INVALID = 0x001, DA_FLAG_NEW_PACK = 0x002, DA_FLAG_PACK_LOCKED = 0x004, DA_FLAG_PACK_REMOVABLE = 0x008, DA_FLAG_SAW_MEDIA = 0x010, DA_FLAG_NEED_OTAG = 0x020, DA_FLAG_WENT_IDLE = 0x040, DA_FLAG_RETRY_UA = 0x080, DA_FLAG_OPEN = 0x100, DA_FLAG_SCTX_INIT = 0x200, DA_FLAG_CAN_RC16 = 0x400, DA_FLAG_PROBED = 0x800 } da_flags; typedef enum { DA_Q_NONE = 0x00, DA_Q_NO_SYNC_CACHE = 0x01, DA_Q_NO_6_BYTE = 0x02, DA_Q_NO_PREVENT = 0x04, DA_Q_4K = 0x08 } da_quirks; typedef enum { DA_CCB_PROBE = 0x01, DA_CCB_PROBE2 = 0x02, DA_CCB_BUFFER_IO = 0x03, DA_CCB_WAITING = 0x04, DA_CCB_DUMP = 0x05, DA_CCB_DELETE = 0x06, DA_CCB_TUR = 0x07, DA_CCB_TYPE_MASK = 0x0F, DA_CCB_RETRY_UA = 0x10 } da_ccb_state; typedef enum { DA_DELETE_NONE, DA_DELETE_DISABLE, DA_DELETE_ZERO, DA_DELETE_WS10, DA_DELETE_WS16, DA_DELETE_UNMAP, DA_DELETE_MAX = DA_DELETE_UNMAP } da_delete_methods; static const char *da_delete_method_names[] = { "NONE", "DISABLE", "ZERO", "WS10", "WS16", "UNMAP" }; /* Offsets into our private area for storing information */ #define ccb_state ppriv_field0 #define ccb_bp ppriv_ptr1 struct disk_params { u_int8_t heads; u_int32_t cylinders; u_int8_t secs_per_track; u_int32_t secsize; /* Number of bytes/sector */ u_int64_t sectors; /* total number sectors */ u_int stripesize; u_int stripeoffset; }; #define UNMAP_MAX_RANGES 512 struct da_softc { struct bio_queue_head bio_queue; struct bio_queue_head delete_queue; struct bio_queue_head delete_run_queue; SLIST_ENTRY(da_softc) links; LIST_HEAD(, ccb_hdr) pending_ccbs; da_state state; da_flags flags; da_quirks quirks; int minimum_cmd_size; int error_inject; int ordered_tag_count; int outstanding_cmds; int unmap_max_ranges; int unmap_max_lba; int delete_running; int tur; da_delete_methods delete_method; struct disk_params params; struct disk *disk; union ccb saved_ccb; struct task sysctl_task; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; struct callout sendordered_c; uint64_t wwpn; uint8_t unmap_buf[UNMAP_MAX_RANGES * 16 + 8]; struct scsi_read_capacity_data_long rcaplong; struct callout mediapoll_c; }; struct da_quirk_entry { struct scsi_inquiry_pattern inq_pat; da_quirks quirks; }; static const char quantum[] = "QUANTUM"; static const char microp[] = "MICROP"; static struct da_quirk_entry da_quirk_table[] = { /* SPI, FC devices */ { /* * Fujitsu M2513A MO drives. * Tested devices: M2513A2 firmware versions 1200 & 1300. * (dip switch selects whether T_DIRECT or T_OPTICAL device) * Reported by: W.Scholten */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* See above. */ {T_OPTICAL, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * This particular Fujitsu drive doesn't like the * synchronize cache command. * Reported by: Tom Jackson */ {T_DIRECT, SIP_MEDIA_FIXED, "FUJITSU", "M2954*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * This drive doesn't like the synchronize cache command * either. Reported by: Matthew Jacob * in NetBSD PR kern/6027, August 24, 1998. */ {T_DIRECT, SIP_MEDIA_FIXED, microp, "2217*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * This drive doesn't like the synchronize cache command * either. Reported by: Hellmuth Michaelis (hm@kts.org) * (PR 8882). */ {T_DIRECT, SIP_MEDIA_FIXED, microp, "2112*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. * Reported by: Blaz Zupan */ {T_DIRECT, SIP_MEDIA_FIXED, "NEC", "D3847*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. * Reported by: Blaz Zupan */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "MAVERICK 540S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS525S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. * Reported by: walter@pelissero.de */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS540S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't work correctly with 6 byte reads/writes. * Returns illegal request, and points to byte 9 of the * 6-byte CDB. * Reported by: Adam McDougall */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 4*", "*"}, /*quirks*/ DA_Q_NO_6_BYTE }, { /* See above. */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 2*", "*"}, /*quirks*/ DA_Q_NO_6_BYTE }, { /* * Doesn't like the synchronize cache command. * Reported by: walter@pelissero.de */ {T_DIRECT, SIP_MEDIA_FIXED, "CONNER", "CP3500*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * The CISS RAID controllers do not support SYNC_CACHE */ {T_DIRECT, SIP_MEDIA_FIXED, "COMPAQ", "RAID*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, /* USB mass storage devices supported by umass(4) */ { /* * EXATELECOM (Sigmatel) i-Bead 100/105 USB Flash MP3 Player * PR: kern/51675 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "EXATEL", "i-BEAD10*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Power Quotient Int. (PQI) USB flash key * PR: kern/53067 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "USB Flash Disk*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Creative Nomad MUVO mp3 player (USB) * PR: kern/53094 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "NOMAD_MUVO", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * Jungsoft NEXDISK USB flash key * PR: kern/54737 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "JUNGSOFT", "NEXDISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * FreeDik USB Mini Data Drive * PR: kern/54786 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "FreeDik*", "Mini Data Drive", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Sigmatel USB Flash MP3 Player * PR: kern/57046 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SigmaTel", "MSCN", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * Neuros USB Digital Audio Computer * PR: kern/63645 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "NEUROS", "dig. audio comp.", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * SEAGRAND NP-900 MP3 Player * PR: kern/64563 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SEAGRAND", "NP-900*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * iRiver iFP MP3 player (with UMS Firmware) * PR: kern/54881, i386/63941, kern/66124 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "iRiver", "iFP*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Frontier Labs NEX IA+ Digital Audio Player, rev 1.10/0.01 * PR: kern/70158 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "FL" , "Nex*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * ZICPlay USB MP3 Player with FM * PR: kern/75057 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "ACTIONS*" , "USB DISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * TEAC USB floppy mechanisms */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "TEAC" , "FD-05*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Kingston DataTraveler II+ USB Pen-Drive. * Reported by: Pawel Jakub Dawidek */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston" , "DataTraveler II+", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * USB DISK Pro PMAP * Reported by: jhs * PR: usb/96381 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, " ", "USB DISK Pro", "PMAP"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Motorola E398 Mobile Phone (TransFlash memory card). * Reported by: Wojciech A. Koszek * PR: usb/89889 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Motorola" , "Motorola Phone", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Qware BeatZkey! Pro * PR: usb/79164 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "GENERIC", "USB DISK DEVICE", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Time DPA20B 1GB MP3 Player * PR: usb/81846 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB2.0*", "(FS) FLASH DISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Samsung USB key 128Mb * PR: usb/90081 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB-DISK", "FreeDik-FlashUsb", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Kingston DataTraveler 2.0 USB Flash memory. * PR: usb/89196 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler 2.0", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Creative MUVO Slim mp3 player (USB) * PR: usb/86131 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "MuVo Slim", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * United MP5512 Portable MP3 Player (2-in-1 USB DISK/MP3) * PR: usb/80487 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "MUSIC DISK", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * SanDisk Micro Cruzer 128MB * PR: usb/75970 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SanDisk" , "Micro Cruzer", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * TOSHIBA TransMemory USB sticks * PR: kern/94660 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "TOSHIBA", "TransMemory", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * PNY USB Flash keys * PR: usb/75578, usb/72344, usb/65436 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "*" , "USB DISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Genesys 6-in-1 Card Reader * PR: usb/94647 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Rekam Digital CAMERA * PR: usb/98713 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "CAMERA*", "4MP-9J6*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * iRiver H10 MP3 player * PR: usb/102547 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "H10*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * iRiver U10 MP3 player * PR: usb/92306 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "U10*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * X-Micro Flash Disk * PR: usb/96901 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "X-Micro", "Flash Disk", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * EasyMP3 EM732X USB 2.0 Flash MP3 Player * PR: usb/96546 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "EM732X", "MP3 Player*", "1.00"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Denver MP3 player * PR: usb/107101 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "DENVER", "MP3 PLAYER", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Philips USB Key Audio KEY013 * PR: usb/68412 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "PHILIPS", "Key*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT }, { /* * JNC MP3 Player * PR: usb/94439 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "JNC*" , "MP3 Player*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * SAMSUNG MP0402H * PR: usb/108427 */ {T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "MP0402H", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * I/O Magic USB flash - Giga Bank * PR: usb/108810 */ {T_DIRECT, SIP_MEDIA_FIXED, "GS-Magic", "stor*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * JoyFly 128mb USB Flash Drive * PR: 96133 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "Flash Disk*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * ChipsBnk usb stick * PR: 103702 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "ChipsBnk", "USB*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Storcase (Kingston) InfoStation IFS FC2/SATA-R 201A * PR: 129858 */ {T_DIRECT, SIP_MEDIA_FIXED, "IFS", "FC2/SATA-R*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Samsung YP-U3 mp3-player * PR: 125398 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Samsung", "YP-U3", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { {T_DIRECT, SIP_MEDIA_REMOVABLE, "Netac", "OnlyDisk*", "2000"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Sony Cyber-Shot DSC cameras * PR: usb/137035 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Sony", "Sony DSC", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT }, + { + {T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler G3", + "1.00"}, /*quirks*/ DA_Q_NO_PREVENT + }, /* ATA/SATA devices over SAS/USB/... */ { /* Hitachi Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "Hitachi", "H??????????E3*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD155UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD155UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD204UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD204UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DL*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST????DL", "*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???DM*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST???DM*", "*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DM*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST????DM", "*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500423AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "3AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500424AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "4AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640423AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "3AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640424AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "4AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750420AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "0AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750422AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "2AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750423AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "3AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Thin Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???LT*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Thin Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST???LT*", "*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* * Olympus FE-210 camera */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "FE210*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * LG UP3S MP3 player */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "LG", "UP3S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Laser MP3-2GA13 MP3 player */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "(HS) Flash Disk", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * LaCie external 250GB Hard drive des by Porsche * Submitted by: Ben Stuyts * PR: 121474 */ {T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HM250JI", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, }; static disk_strategy_t dastrategy; static dumper_t dadump; static periph_init_t dainit; static void daasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); static void dasysctlinit(void *context, int pending); static int dacmdsizesysctl(SYSCTL_HANDLER_ARGS); static int dadeletemethodsysctl(SYSCTL_HANDLER_ARGS); static int dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method); static periph_ctor_t daregister; static periph_dtor_t dacleanup; static periph_start_t dastart; static periph_oninv_t daoninvalidate; static void dadone(struct cam_periph *periph, union ccb *done_ccb); static int daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags); static void daprevent(struct cam_periph *periph, int action); static void dareprobe(struct cam_periph *periph); static void dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector, struct scsi_read_capacity_data_long *rcaplong, size_t rcap_size); static timeout_t dasendorderedtag; static void dashutdown(void *arg, int howto); static timeout_t damediapoll; #ifndef DA_DEFAULT_POLL_PERIOD #define DA_DEFAULT_POLL_PERIOD 3 #endif #ifndef DA_DEFAULT_TIMEOUT #define DA_DEFAULT_TIMEOUT 60 /* Timeout in seconds */ #endif #ifndef DA_DEFAULT_RETRY #define DA_DEFAULT_RETRY 4 #endif #ifndef DA_DEFAULT_SEND_ORDERED #define DA_DEFAULT_SEND_ORDERED 1 #endif static int da_poll_period = DA_DEFAULT_POLL_PERIOD; static int da_retry_count = DA_DEFAULT_RETRY; static int da_default_timeout = DA_DEFAULT_TIMEOUT; static int da_send_ordered = DA_DEFAULT_SEND_ORDERED; static SYSCTL_NODE(_kern_cam, OID_AUTO, da, CTLFLAG_RD, 0, "CAM Direct Access Disk driver"); SYSCTL_INT(_kern_cam_da, OID_AUTO, poll_period, CTLFLAG_RW, &da_poll_period, 0, "Media polling period in seconds"); TUNABLE_INT("kern.cam.da.poll_period", &da_poll_period); SYSCTL_INT(_kern_cam_da, OID_AUTO, retry_count, CTLFLAG_RW, &da_retry_count, 0, "Normal I/O retry count"); TUNABLE_INT("kern.cam.da.retry_count", &da_retry_count); SYSCTL_INT(_kern_cam_da, OID_AUTO, default_timeout, CTLFLAG_RW, &da_default_timeout, 0, "Normal I/O timeout (in seconds)"); TUNABLE_INT("kern.cam.da.default_timeout", &da_default_timeout); SYSCTL_INT(_kern_cam_da, OID_AUTO, send_ordered, CTLFLAG_RW, &da_send_ordered, 0, "Send Ordered Tags"); TUNABLE_INT("kern.cam.da.send_ordered", &da_send_ordered); /* * DA_ORDEREDTAG_INTERVAL determines how often, relative * to the default timeout, we check to see whether an ordered * tagged transaction is appropriate to prevent simple tag * starvation. Since we'd like to ensure that there is at least * 1/2 of the timeout length left for a starved transaction to * complete after we've sent an ordered tag, we must poll at least * four times in every timeout period. This takes care of the worst * case where a starved transaction starts during an interval that * meets the requirement "don't send an ordered tag" test so it takes * us two intervals to determine that a tag must be sent. */ #ifndef DA_ORDEREDTAG_INTERVAL #define DA_ORDEREDTAG_INTERVAL 4 #endif static struct periph_driver dadriver = { dainit, "da", TAILQ_HEAD_INITIALIZER(dadriver.units), /* generation */ 0 }; PERIPHDRIVER_DECLARE(da, dadriver); static MALLOC_DEFINE(M_SCSIDA, "scsi_da", "scsi_da buffers"); static int daopen(struct disk *dp) { struct cam_periph *periph; struct da_softc *softc; int unit; int error; periph = (struct cam_periph *)dp->d_drv1; if (periph == NULL) { return (ENXIO); } if (cam_periph_acquire(periph) != CAM_REQ_CMP) { return (ENXIO); } cam_periph_lock(periph); if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) { cam_periph_unlock(periph); cam_periph_release(periph); return (error); } unit = periph->unit_number; softc = (struct da_softc *)periph->softc; softc->flags |= DA_FLAG_OPEN; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("daopen\n")); if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) { /* Invalidate our pack information. */ softc->flags &= ~DA_FLAG_PACK_INVALID; } dareprobe(periph); /* Wait for the disk size update. */ error = msleep(&softc->disk->d_mediasize, periph->sim->mtx, PRIBIO, "dareprobe", 0); if (error != 0) xpt_print(periph->path, "unable to retrieve capacity data"); if (periph->flags & CAM_PERIPH_INVALID || softc->disk->d_sectorsize == 0 || softc->disk->d_mediasize == 0) error = ENXIO; if (error == 0 && (softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 && (softc->quirks & DA_Q_NO_PREVENT) == 0) daprevent(periph, PR_PREVENT); if (error == 0) softc->flags |= DA_FLAG_SAW_MEDIA; cam_periph_unhold(periph); cam_periph_unlock(periph); if (error != 0) { softc->flags &= ~DA_FLAG_OPEN; cam_periph_release(periph); } return (error); } static int daclose(struct disk *dp) { struct cam_periph *periph; struct da_softc *softc; periph = (struct cam_periph *)dp->d_drv1; if (periph == NULL) return (0); cam_periph_lock(periph); if (cam_periph_hold(periph, PRIBIO) != 0) { cam_periph_unlock(periph); cam_periph_release(periph); return (0); } softc = (struct da_softc *)periph->softc; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("daclose\n")); if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0 && (softc->flags & DA_FLAG_PACK_INVALID) == 0) { union ccb *ccb; ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_synchronize_cache(&ccb->csio, /*retries*/1, /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG, /*begin_lba*/0,/* Cover the whole disk */ /*lb_count*/0, SSD_FULL_SIZE, 5 * 60 * 1000); cam_periph_runccb(ccb, daerror, /*cam_flags*/0, /*sense_flags*/SF_RETRY_UA | SF_QUIET_IR, softc->disk->d_devstat); xpt_release_ccb(ccb); } if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0) { if ((softc->quirks & DA_Q_NO_PREVENT) == 0) daprevent(periph, PR_ALLOW); /* * If we've got removeable media, mark the blocksize as * unavailable, since it could change when new media is * inserted. */ softc->disk->d_devstat->flags |= DEVSTAT_BS_UNAVAILABLE; } softc->flags &= ~DA_FLAG_OPEN; cam_periph_unhold(periph); cam_periph_unlock(periph); cam_periph_release(periph); return (0); } static void daschedule(struct cam_periph *periph) { struct da_softc *softc = (struct da_softc *)periph->softc; uint32_t prio; if (softc->state != DA_STATE_NORMAL) return; /* Check if cam_periph_getccb() was called. */ prio = periph->immediate_priority; /* Check if we have more work to do. */ if (bioq_first(&softc->bio_queue) || (!softc->delete_running && bioq_first(&softc->delete_queue)) || softc->tur) { prio = CAM_PRIORITY_NORMAL; } /* Schedule CCB if any of above is true. */ if (prio != CAM_PRIORITY_NONE) xpt_schedule(periph, prio); } /* * Actually translate the requested transfer into one the physical driver * can understand. The transfer is described by a buf and will include * only one physical transfer. */ static void dastrategy(struct bio *bp) { struct cam_periph *periph; struct da_softc *softc; periph = (struct cam_periph *)bp->bio_disk->d_drv1; if (periph == NULL) { biofinish(bp, NULL, ENXIO); return; } softc = (struct da_softc *)periph->softc; cam_periph_lock(periph); /* * If the device has been made invalid, error out */ if ((softc->flags & DA_FLAG_PACK_INVALID)) { cam_periph_unlock(periph); biofinish(bp, NULL, ENXIO); return; } CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastrategy(%p)\n", bp)); /* * Place it in the queue of disk activities for this disk */ if (bp->bio_cmd == BIO_DELETE) { if (bp->bio_bcount == 0) biodone(bp); else bioq_disksort(&softc->delete_queue, bp); } else bioq_disksort(&softc->bio_queue, bp); /* * Schedule ourselves for performing the work. */ daschedule(periph); cam_periph_unlock(periph); return; } static int dadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct cam_periph *periph; struct da_softc *softc; u_int secsize; struct ccb_scsiio csio; struct disk *dp; int error = 0; dp = arg; periph = dp->d_drv1; if (periph == NULL) return (ENXIO); softc = (struct da_softc *)periph->softc; cam_periph_lock(periph); secsize = softc->params.secsize; if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) { cam_periph_unlock(periph); return (ENXIO); } if (length > 0) { xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); csio.ccb_h.ccb_state = DA_CCB_DUMP; scsi_read_write(&csio, /*retries*/0, dadone, MSG_ORDERED_Q_TAG, /*read*/FALSE, /*byte2*/0, /*minimum_cmd_size*/ softc->minimum_cmd_size, offset / secsize, length / secsize, /*data_ptr*/(u_int8_t *) virtual, /*dxfer_len*/length, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); xpt_polled_action((union ccb *)&csio); error = cam_periph_error((union ccb *)&csio, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL); if ((csio.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(csio.ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); if (error != 0) printf("Aborting dump due to I/O error.\n"); cam_periph_unlock(periph); return (error); } /* * Sync the disk cache contents to the physical media. */ if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) { xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); csio.ccb_h.ccb_state = DA_CCB_DUMP; scsi_synchronize_cache(&csio, /*retries*/0, /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG, /*begin_lba*/0,/* Cover the whole disk */ /*lb_count*/0, SSD_FULL_SIZE, 5 * 60 * 1000); xpt_polled_action((union ccb *)&csio); error = cam_periph_error((union ccb *)&csio, 0, SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR, NULL); if ((csio.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(csio.ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); } cam_periph_unlock(periph); return (error); } static int dagetattr(struct bio *bp) { int ret; struct cam_periph *periph; periph = (struct cam_periph *)bp->bio_disk->d_drv1; if (periph == NULL) return (ENXIO); cam_periph_lock(periph); ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute, periph->path); cam_periph_unlock(periph); if (ret == 0) bp->bio_completed = bp->bio_length; return ret; } static void dainit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". */ status = xpt_register_async(AC_FOUND_DEVICE, daasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("da: Failed to attach master async callback " "due to status 0x%x!\n", status); } else if (da_send_ordered) { /* Register our shutdown event handler */ if ((EVENTHANDLER_REGISTER(shutdown_post_sync, dashutdown, NULL, SHUTDOWN_PRI_DEFAULT)) == NULL) printf("dainit: shutdown event registration failed!\n"); } } /* * Callback from GEOM, called when it has finished cleaning up its * resources. */ static void dadiskgonecb(struct disk *dp) { struct cam_periph *periph; periph = (struct cam_periph *)dp->d_drv1; cam_periph_release(periph); } static void daoninvalidate(struct cam_periph *periph) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; /* * De-register any async callbacks. */ xpt_register_async(0, daasync, periph, periph->path); softc->flags |= DA_FLAG_PACK_INVALID; /* * Return all queued I/O with ENXIO. * XXX Handle any transactions queued to the card * with XPT_ABORT_CCB. */ bioq_flush(&softc->bio_queue, NULL, ENXIO); bioq_flush(&softc->delete_queue, NULL, ENXIO); /* * Tell GEOM that we've gone away, we'll get a callback when it is * done cleaning up its resources. */ disk_gone(softc->disk); xpt_print(periph->path, "lost device - %d outstanding, %d refs\n", softc->outstanding_cmds, periph->refcount); } static void dacleanup(struct cam_periph *periph) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; xpt_print(periph->path, "removing device entry\n"); cam_periph_unlock(periph); /* * If we can't free the sysctl tree, oh well... */ if ((softc->flags & DA_FLAG_SCTX_INIT) != 0 && sysctl_ctx_free(&softc->sysctl_ctx) != 0) { xpt_print(periph->path, "can't remove sysctl context\n"); } callout_drain(&softc->mediapoll_c); disk_destroy(softc->disk); callout_drain(&softc->sendordered_c); free(softc, M_DEVBUF); cam_periph_lock(periph); } static void daasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg) { struct cam_periph *periph; struct da_softc *softc; periph = (struct cam_periph *)callback_arg; switch (code) { case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cam_status status; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) break; if (cgd->protocol != PROTO_SCSI) break; if (SID_TYPE(&cgd->inq_data) != T_DIRECT && SID_TYPE(&cgd->inq_data) != T_RBC && SID_TYPE(&cgd->inq_data) != T_OPTICAL) break; /* * Allocate a peripheral instance for * this device and start the probe * process. */ status = cam_periph_alloc(daregister, daoninvalidate, dacleanup, dastart, "da", CAM_PERIPH_BIO, cgd->ccb_h.path, daasync, AC_FOUND_DEVICE, cgd); if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) printf("daasync: Unable to attach to new device " "due to status 0x%x\n", status); return; } case AC_ADVINFO_CHANGED: { uintptr_t buftype; buftype = (uintptr_t)arg; if (buftype == CDAI_TYPE_PHYS_PATH) { struct da_softc *softc; softc = periph->softc; disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT); } break; } case AC_UNIT_ATTENTION: { union ccb *ccb; int error_code, sense_key, asc, ascq; softc = (struct da_softc *)periph->softc; ccb = (union ccb *)arg; /* * Handle all UNIT ATTENTIONs except our own, * as they will be handled by daerror(). */ if (xpt_path_periph(ccb->ccb_h.path) != periph && scsi_extract_sense_ccb(ccb, &error_code, &sense_key, &asc, &ascq)) { if (asc == 0x2A && ascq == 0x09) { xpt_print(ccb->ccb_h.path, "capacity data has changed\n"); dareprobe(periph); } else if (asc == 0x28 && ascq == 0x00) disk_media_changed(softc->disk, M_NOWAIT); } cam_periph_async(periph, code, path, arg); break; } case AC_SCSI_AEN: softc = (struct da_softc *)periph->softc; if (!softc->tur) { if (cam_periph_acquire(periph) == CAM_REQ_CMP) { softc->tur = 1; daschedule(periph); } } /* FALLTHROUGH */ case AC_SENT_BDR: case AC_BUS_RESET: { struct ccb_hdr *ccbh; softc = (struct da_softc *)periph->softc; /* * Don't fail on the expected unit attention * that will occur. */ softc->flags |= DA_FLAG_RETRY_UA; LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le) ccbh->ccb_state |= DA_CCB_RETRY_UA; break; } default: break; } cam_periph_async(periph, code, path, arg); } static void dasysctlinit(void *context, int pending) { struct cam_periph *periph; struct da_softc *softc; char tmpstr[80], tmpstr2[80]; struct ccb_trans_settings cts; periph = (struct cam_periph *)context; /* * periph was held for us when this task was enqueued */ if (periph->flags & CAM_PERIPH_INVALID) { cam_periph_release(periph); return; } softc = (struct da_softc *)periph->softc; snprintf(tmpstr, sizeof(tmpstr), "CAM DA unit %d", periph->unit_number); snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number); sysctl_ctx_init(&softc->sysctl_ctx); softc->flags |= DA_FLAG_SCTX_INIT; softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam_da), OID_AUTO, tmpstr2, CTLFLAG_RD, 0, tmpstr); if (softc->sysctl_tree == NULL) { printf("dasysctlinit: unable to allocate sysctl tree\n"); cam_periph_release(periph); return; } /* * Now register the sysctl handler, so the user can change the value on * the fly. */ SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "delete_method", CTLTYPE_STRING | CTLFLAG_RW, softc, 0, dadeletemethodsysctl, "A", "BIO_DELETE execution method"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "minimum_cmd_size", CTLTYPE_INT | CTLFLAG_RW, &softc->minimum_cmd_size, 0, dacmdsizesysctl, "I", "Minimum CDB size"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "error_inject", CTLFLAG_RW, &softc->error_inject, 0, "error_inject leaf"); /* * Add some addressing info. */ memset(&cts, 0, sizeof (cts)); xpt_setup_ccb(&cts.ccb_h, periph->path, CAM_PRIORITY_NONE); cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS; cts.type = CTS_TYPE_CURRENT_SETTINGS; cam_periph_lock(periph); xpt_action((union ccb *)&cts); cam_periph_unlock(periph); if (cts.ccb_h.status != CAM_REQ_CMP) { cam_periph_release(periph); return; } if (cts.protocol == PROTO_SCSI && cts.transport == XPORT_FC) { struct ccb_trans_settings_fc *fc = &cts.xport_specific.fc; if (fc->valid & CTS_FC_VALID_WWPN) { softc->wwpn = fc->wwpn; SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "wwpn", CTLFLAG_RD, &softc->wwpn, "World Wide Port Name"); } } cam_periph_release(periph); } static int dacmdsizesysctl(SYSCTL_HANDLER_ARGS) { int error, value; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); /* * Acceptable values here are 6, 10, 12 or 16. */ if (value < 6) value = 6; else if ((value > 6) && (value <= 10)) value = 10; else if ((value > 10) && (value <= 12)) value = 12; else if (value > 12) value = 16; *(int *)arg1 = value; return (0); } static int dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method) { if (delete_method < 0 || delete_method > DA_DELETE_MAX) return (EINVAL); softc->delete_method = delete_method; if (softc->delete_method > DA_DELETE_DISABLE) softc->disk->d_flags |= DISKFLAG_CANDELETE; else softc->disk->d_flags &= ~DISKFLAG_CANDELETE; return (0); } static int dadeletemethodsysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; const char *p; struct da_softc *softc; int i, error, value; softc = (struct da_softc *)arg1; value = softc->delete_method; if (value < 0 || value > DA_DELETE_MAX) p = "UNKNOWN"; else p = da_delete_method_names[value]; strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); for (i = 0; i <= DA_DELETE_MAX; i++) { if (strcmp(buf, da_delete_method_names[i]) != 0) continue; return dadeletemethodset(softc, i); } return (EINVAL); } static cam_status daregister(struct cam_periph *periph, void *arg) { struct da_softc *softc; struct ccb_pathinq cpi; struct ccb_getdev *cgd; char tmpstr[80]; caddr_t match; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) { printf("daregister: no getdev CCB, can't register device\n"); return(CAM_REQ_CMP_ERR); } softc = (struct da_softc *)malloc(sizeof(*softc), M_DEVBUF, M_NOWAIT|M_ZERO); if (softc == NULL) { printf("daregister: Unable to probe new device. " "Unable to allocate softc\n"); return(CAM_REQ_CMP_ERR); } LIST_INIT(&softc->pending_ccbs); softc->state = DA_STATE_PROBE; bioq_init(&softc->bio_queue); bioq_init(&softc->delete_queue); bioq_init(&softc->delete_run_queue); if (SID_IS_REMOVABLE(&cgd->inq_data)) softc->flags |= DA_FLAG_PACK_REMOVABLE; softc->unmap_max_ranges = UNMAP_MAX_RANGES; softc->unmap_max_lba = 1024*1024*2; periph->softc = softc; /* * See if this device has any quirks. */ match = cam_quirkmatch((caddr_t)&cgd->inq_data, (caddr_t)da_quirk_table, sizeof(da_quirk_table)/sizeof(*da_quirk_table), sizeof(*da_quirk_table), scsi_inquiry_match); if (match != NULL) softc->quirks = ((struct da_quirk_entry *)match)->quirks; else softc->quirks = DA_Q_NONE; /* Check if the SIM does not want 6 byte commands */ bzero(&cpi, sizeof(cpi)); xpt_setup_ccb(&cpi.ccb_h, periph->path, CAM_PRIORITY_NORMAL); cpi.ccb_h.func_code = XPT_PATH_INQ; xpt_action((union ccb *)&cpi); if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE)) softc->quirks |= DA_Q_NO_6_BYTE; TASK_INIT(&softc->sysctl_task, 0, dasysctlinit, periph); /* * Take an exclusive refcount on the periph while dastart is called * to finish the probe. The reference will be dropped in dadone at * the end of probe. */ (void)cam_periph_hold(periph, PRIBIO); /* * Schedule a periodic event to occasionally send an * ordered tag to a device. */ callout_init_mtx(&softc->sendordered_c, periph->sim->mtx, 0); callout_reset(&softc->sendordered_c, (da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL, dasendorderedtag, softc); mtx_unlock(periph->sim->mtx); /* * RBC devices don't have to support READ(6), only READ(10). */ if (softc->quirks & DA_Q_NO_6_BYTE || SID_TYPE(&cgd->inq_data) == T_RBC) softc->minimum_cmd_size = 10; else softc->minimum_cmd_size = 6; /* * Load the user's default, if any. */ snprintf(tmpstr, sizeof(tmpstr), "kern.cam.da.%d.minimum_cmd_size", periph->unit_number); TUNABLE_INT_FETCH(tmpstr, &softc->minimum_cmd_size); /* * 6, 10, 12 and 16 are the currently permissible values. */ if (softc->minimum_cmd_size < 6) softc->minimum_cmd_size = 6; else if ((softc->minimum_cmd_size > 6) && (softc->minimum_cmd_size <= 10)) softc->minimum_cmd_size = 10; else if ((softc->minimum_cmd_size > 10) && (softc->minimum_cmd_size <= 12)) softc->minimum_cmd_size = 12; else if (softc->minimum_cmd_size > 12) softc->minimum_cmd_size = 16; /* Predict whether device may support READ CAPACITY(16). */ if (SID_ANSI_REV(&cgd->inq_data) >= SCSI_REV_SPC3) { softc->flags |= DA_FLAG_CAN_RC16; softc->state = DA_STATE_PROBE2; } /* * Register this media as a disk. */ softc->disk = disk_alloc(); softc->disk->d_devstat = devstat_new_entry(periph->periph_name, periph->unit_number, 0, DEVSTAT_BS_UNAVAILABLE, SID_TYPE(&cgd->inq_data) | XPORT_DEVSTAT_TYPE(cpi.transport), DEVSTAT_PRIORITY_DISK); softc->disk->d_open = daopen; softc->disk->d_close = daclose; softc->disk->d_strategy = dastrategy; softc->disk->d_dump = dadump; softc->disk->d_getattr = dagetattr; softc->disk->d_gone = dadiskgonecb; softc->disk->d_name = "da"; softc->disk->d_drv1 = periph; if (cpi.maxio == 0) softc->disk->d_maxsize = DFLTPHYS; /* traditional default */ else if (cpi.maxio > MAXPHYS) softc->disk->d_maxsize = MAXPHYS; /* for safety */ else softc->disk->d_maxsize = cpi.maxio; softc->disk->d_unit = periph->unit_number; softc->disk->d_flags = 0; if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE; cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor, sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr)); strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr)); cam_strvis(&softc->disk->d_descr[strlen(softc->disk->d_descr)], cgd->inq_data.product, sizeof(cgd->inq_data.product), sizeof(softc->disk->d_descr) - strlen(softc->disk->d_descr)); softc->disk->d_hba_vendor = cpi.hba_vendor; softc->disk->d_hba_device = cpi.hba_device; softc->disk->d_hba_subvendor = cpi.hba_subvendor; softc->disk->d_hba_subdevice = cpi.hba_subdevice; /* * Acquire a reference to the periph before we register with GEOM. * We'll release this reference once GEOM calls us back (via * dadiskgonecb()) telling us that our provider has been freed. */ if (cam_periph_acquire(periph) != CAM_REQ_CMP) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); mtx_lock(periph->sim->mtx); return (CAM_REQ_CMP_ERR); } disk_create(softc->disk, DISK_VERSION); mtx_lock(periph->sim->mtx); /* * Add async callbacks for events of interest. * I don't bother checking if this fails as, * in most cases, the system will function just * fine without them and the only alternative * would be to not attach the device on failure. */ xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE | AC_ADVINFO_CHANGED | AC_SCSI_AEN | AC_UNIT_ATTENTION, daasync, periph, periph->path); /* * Emit an attribute changed notification just in case * physical path information arrived before our async * event handler was registered, but after anyone attaching * to our disk device polled it. */ disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT); /* * Schedule a periodic media polling events. */ callout_init_mtx(&softc->mediapoll_c, periph->sim->mtx, 0); if ((softc->flags & DA_FLAG_PACK_REMOVABLE) && (cgd->inq_flags & SID_AEN) == 0 && da_poll_period != 0) callout_reset(&softc->mediapoll_c, da_poll_period * hz, damediapoll, periph); xpt_schedule(periph, CAM_PRIORITY_DEV); return(CAM_REQ_CMP); } static void dastart(struct cam_periph *periph, union ccb *start_ccb) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastart\n")); switch (softc->state) { case DA_STATE_NORMAL: { struct bio *bp, *bp1; uint8_t tag_code; /* Execute immediate CCB if waiting. */ if (periph->immediate_priority <= periph->pinfo.priority) { CAM_DEBUG(periph->path, CAM_DEBUG_SUBTRACE, ("queuing for immediate ccb\n")); start_ccb->ccb_h.ccb_state = DA_CCB_WAITING; SLIST_INSERT_HEAD(&periph->ccb_list, &start_ccb->ccb_h, periph_links.sle); periph->immediate_priority = CAM_PRIORITY_NONE; wakeup(&periph->ccb_list); /* May have more work to do, so ensure we stay scheduled */ daschedule(periph); break; } /* Run BIO_DELETE if not running yet. */ if (!softc->delete_running && (bp = bioq_first(&softc->delete_queue)) != NULL) { uint64_t lba; u_int count; if (softc->delete_method == DA_DELETE_UNMAP) { uint8_t *buf = softc->unmap_buf; uint64_t lastlba = (uint64_t)-1; uint32_t lastcount = 0; int blocks = 0, off, ranges = 0; softc->delete_running = 1; bzero(softc->unmap_buf, sizeof(softc->unmap_buf)); bp1 = bp; do { bioq_remove(&softc->delete_queue, bp1); if (bp1 != bp) bioq_insert_tail(&softc->delete_run_queue, bp1); lba = bp1->bio_pblkno; count = bp1->bio_bcount / softc->params.secsize; /* Try to extend the previous range. */ if (lba == lastlba) { lastcount += count; off = (ranges - 1) * 16 + 8; scsi_ulto4b(lastcount, &buf[off + 8]); } else if (count > 0) { off = ranges * 16 + 8; scsi_u64to8b(lba, &buf[off + 0]); scsi_ulto4b(count, &buf[off + 8]); lastcount = count; ranges++; } blocks += count; lastlba = lba + count; bp1 = bioq_first(&softc->delete_queue); if (bp1 == NULL || ranges >= softc->unmap_max_ranges || blocks + bp1->bio_bcount / softc->params.secsize > softc->unmap_max_lba) break; } while (1); scsi_ulto2b(ranges * 16 + 6, &buf[0]); scsi_ulto2b(ranges * 16, &buf[2]); scsi_unmap(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*byte2*/0, /*data_ptr*/ buf, /*dxfer_len*/ ranges * 16 + 8, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); start_ccb->ccb_h.ccb_state = DA_CCB_DELETE; goto out; } else if (softc->delete_method == DA_DELETE_ZERO || softc->delete_method == DA_DELETE_WS10 || softc->delete_method == DA_DELETE_WS16) { softc->delete_running = 1; lba = bp->bio_pblkno; count = 0; bp1 = bp; do { bioq_remove(&softc->delete_queue, bp1); if (bp1 != bp) bioq_insert_tail(&softc->delete_run_queue, bp1); count += bp1->bio_bcount / softc->params.secsize; bp1 = bioq_first(&softc->delete_queue); if (bp1 == NULL || lba + count != bp1->bio_pblkno || count + bp1->bio_bcount / softc->params.secsize > 0xffff) break; } while (1); scsi_write_same(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*byte2*/softc->delete_method == DA_DELETE_ZERO ? 0 : SWS_UNMAP, softc->delete_method == DA_DELETE_WS16 ? 16 : 10, /*lba*/lba, /*block_count*/count, /*data_ptr*/ __DECONST(void *, zero_region), /*dxfer_len*/ softc->params.secsize, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); start_ccb->ccb_h.ccb_state = DA_CCB_DELETE; goto out; } else { bioq_flush(&softc->delete_queue, NULL, 0); /* FALLTHROUGH */ } } /* Run regular command. */ bp = bioq_takefirst(&softc->bio_queue); if (bp == NULL) { if (softc->tur) { softc->tur = 0; scsi_test_unit_ready(&start_ccb->csio, /*retries*/ da_retry_count, dadone, MSG_SIMPLE_Q_TAG, SSD_FULL_SIZE, da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_TUR; xpt_action(start_ccb); } else xpt_release_ccb(start_ccb); break; } if (softc->tur) { softc->tur = 0; cam_periph_release_locked(periph); } if ((bp->bio_flags & BIO_ORDERED) != 0 || (softc->flags & DA_FLAG_NEED_OTAG) != 0) { softc->flags &= ~DA_FLAG_NEED_OTAG; softc->ordered_tag_count++; tag_code = MSG_ORDERED_Q_TAG; } else { tag_code = MSG_SIMPLE_Q_TAG; } switch (bp->bio_cmd) { case BIO_READ: case BIO_WRITE: scsi_read_write(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/tag_code, /*read_op*/bp->bio_cmd == BIO_READ, /*byte2*/0, softc->minimum_cmd_size, /*lba*/bp->bio_pblkno, /*block_count*/bp->bio_bcount / softc->params.secsize, /*data_ptr*/ bp->bio_data, /*dxfer_len*/ bp->bio_bcount, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); break; case BIO_FLUSH: /* * BIO_FLUSH doesn't currently communicate * range data, so we synchronize the cache * over the whole disk. We also force * ordered tag semantics the flush applies * to all previously queued I/O. */ scsi_synchronize_cache(&start_ccb->csio, /*retries*/1, /*cbfcnp*/dadone, MSG_ORDERED_Q_TAG, /*begin_lba*/0, /*lb_count*/0, SSD_FULL_SIZE, da_default_timeout*1000); break; } start_ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO; out: /* * Block out any asyncronous callbacks * while we touch the pending ccb list. */ LIST_INSERT_HEAD(&softc->pending_ccbs, &start_ccb->ccb_h, periph_links.le); softc->outstanding_cmds++; /* We expect a unit attention from this device */ if ((softc->flags & DA_FLAG_RETRY_UA) != 0) { start_ccb->ccb_h.ccb_state |= DA_CCB_RETRY_UA; softc->flags &= ~DA_FLAG_RETRY_UA; } start_ccb->ccb_h.ccb_bp = bp; xpt_action(start_ccb); /* May have more work to do, so ensure we stay scheduled */ daschedule(periph); break; } case DA_STATE_PROBE: { struct scsi_read_capacity_data *rcap; rcap = (struct scsi_read_capacity_data *) malloc(sizeof(*rcap), M_SCSIDA, M_NOWAIT|M_ZERO); if (rcap == NULL) { printf("dastart: Couldn't malloc read_capacity data\n"); /* da_free_periph??? */ break; } scsi_read_capacity(&start_ccb->csio, /*retries*/da_retry_count, dadone, MSG_SIMPLE_Q_TAG, rcap, SSD_FULL_SIZE, /*timeout*/5000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE; xpt_action(start_ccb); break; } case DA_STATE_PROBE2: { struct scsi_read_capacity_data_long *rcaplong; rcaplong = (struct scsi_read_capacity_data_long *) malloc(sizeof(*rcaplong), M_SCSIDA, M_NOWAIT|M_ZERO); if (rcaplong == NULL) { printf("dastart: Couldn't malloc read_capacity data\n"); /* da_free_periph??? */ break; } scsi_read_capacity_16(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*lba*/ 0, /*reladr*/ 0, /*pmi*/ 0, /*rcap_buf*/ (uint8_t *)rcaplong, /*rcap_buf_len*/ sizeof(*rcaplong), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE2; xpt_action(start_ccb); break; } } } static int cmd6workaround(union ccb *ccb) { struct scsi_rw_6 cmd6; struct scsi_rw_10 *cmd10; struct da_softc *softc; u_int8_t *cdb; struct bio *bp; int frozen; cdb = ccb->csio.cdb_io.cdb_bytes; softc = (struct da_softc *)xpt_path_periph(ccb->ccb_h.path)->softc; if (ccb->ccb_h.ccb_state == DA_CCB_DELETE) { if (softc->delete_method == DA_DELETE_UNMAP) { xpt_print(ccb->ccb_h.path, "UNMAP is not supported, " "switching to WRITE SAME(16) with UNMAP.\n"); dadeletemethodset(softc, DA_DELETE_WS16); } else if (softc->delete_method == DA_DELETE_WS16) { xpt_print(ccb->ccb_h.path, "WRITE SAME(16) with UNMAP is not supported, " "disabling BIO_DELETE.\n"); dadeletemethodset(softc, DA_DELETE_DISABLE); } else if (softc->delete_method == DA_DELETE_WS10) { xpt_print(ccb->ccb_h.path, "WRITE SAME(10) with UNMAP is not supported, " "disabling BIO_DELETE.\n"); dadeletemethodset(softc, DA_DELETE_DISABLE); } else if (softc->delete_method == DA_DELETE_ZERO) { xpt_print(ccb->ccb_h.path, "WRITE SAME(10) is not supported, " "disabling BIO_DELETE.\n"); dadeletemethodset(softc, DA_DELETE_DISABLE); } else dadeletemethodset(softc, DA_DELETE_DISABLE); while ((bp = bioq_takefirst(&softc->delete_run_queue)) != NULL) bioq_disksort(&softc->delete_queue, bp); bioq_insert_tail(&softc->delete_queue, (struct bio *)ccb->ccb_h.ccb_bp); ccb->ccb_h.ccb_bp = NULL; return (0); } /* Translation only possible if CDB is an array and cmd is R/W6 */ if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0 || (*cdb != READ_6 && *cdb != WRITE_6)) return 0; xpt_print(ccb->ccb_h.path, "READ(6)/WRITE(6) not supported, " "increasing minimum_cmd_size to 10.\n"); softc->minimum_cmd_size = 10; bcopy(cdb, &cmd6, sizeof(struct scsi_rw_6)); cmd10 = (struct scsi_rw_10 *)cdb; cmd10->opcode = (cmd6.opcode == READ_6) ? READ_10 : WRITE_10; cmd10->byte2 = 0; scsi_ulto4b(scsi_3btoul(cmd6.addr), cmd10->addr); cmd10->reserved = 0; scsi_ulto2b(cmd6.length, cmd10->length); cmd10->control = cmd6.control; ccb->csio.cdb_len = sizeof(*cmd10); /* Requeue request, unfreezing queue if necessary */ frozen = (ccb->ccb_h.status & CAM_DEV_QFRZN) != 0; ccb->ccb_h.status = CAM_REQUEUE_REQ; xpt_action(ccb); if (frozen) { cam_release_devq(ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } return (ERESTART); } static void dadone(struct cam_periph *periph, union ccb *done_ccb) { struct da_softc *softc; struct ccb_scsiio *csio; u_int32_t priority; da_ccb_state state; softc = (struct da_softc *)periph->softc; priority = done_ccb->ccb_h.pinfo.priority; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone\n")); csio = &done_ccb->csio; state = csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK; switch (state) { case DA_CCB_BUFFER_IO: case DA_CCB_DELETE: { struct bio *bp, *bp1; bp = (struct bio *)done_ccb->ccb_h.ccb_bp; if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { int error; int sf; if ((csio->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0) sf = SF_RETRY_UA; else sf = 0; error = daerror(done_ccb, CAM_RETRY_SELTO, sf); if (error == ERESTART) { /* * A retry was scheuled, so * just return. */ return; } bp = (struct bio *)done_ccb->ccb_h.ccb_bp; if (error != 0) { int queued_error; /* * return all queued I/O with EIO, so that * the client can retry these I/Os in the * proper order should it attempt to recover. */ queued_error = EIO; if (error == ENXIO && (softc->flags & DA_FLAG_PACK_INVALID)== 0) { /* * Catastrophic error. Mark our pack as * invalid. */ /* * XXX See if this is really a media * XXX change first? */ xpt_print(periph->path, "Invalidating pack\n"); softc->flags |= DA_FLAG_PACK_INVALID; queued_error = ENXIO; } bioq_flush(&softc->bio_queue, NULL, queued_error); if (bp != NULL) { bp->bio_error = error; bp->bio_resid = bp->bio_bcount; bp->bio_flags |= BIO_ERROR; } } else if (bp != NULL) { bp->bio_resid = csio->resid; bp->bio_error = 0; if (bp->bio_resid != 0) bp->bio_flags |= BIO_ERROR; } if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } else if (bp != NULL) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) panic("REQ_CMP with QFRZN"); bp->bio_resid = csio->resid; if (csio->resid > 0) bp->bio_flags |= BIO_ERROR; if (softc->error_inject != 0) { bp->bio_error = softc->error_inject; bp->bio_resid = bp->bio_bcount; bp->bio_flags |= BIO_ERROR; softc->error_inject = 0; } } /* * Block out any asyncronous callbacks * while we touch the pending ccb list. */ LIST_REMOVE(&done_ccb->ccb_h, periph_links.le); softc->outstanding_cmds--; if (softc->outstanding_cmds == 0) softc->flags |= DA_FLAG_WENT_IDLE; if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) { xpt_print(periph->path, "oustanding %d\n", softc->outstanding_cmds); } if (state == DA_CCB_DELETE) { while ((bp1 = bioq_takefirst(&softc->delete_run_queue)) != NULL) { bp1->bio_resid = bp->bio_resid; bp1->bio_error = bp->bio_error; if (bp->bio_flags & BIO_ERROR) bp1->bio_flags |= BIO_ERROR; biodone(bp1); } softc->delete_running = 0; if (bp != NULL) biodone(bp); daschedule(periph); } else if (bp != NULL) biodone(bp); break; } case DA_CCB_PROBE: case DA_CCB_PROBE2: { struct scsi_read_capacity_data *rdcap; struct scsi_read_capacity_data_long *rcaplong; char announce_buf[80]; rdcap = NULL; rcaplong = NULL; if (state == DA_CCB_PROBE) rdcap =(struct scsi_read_capacity_data *)csio->data_ptr; else rcaplong = (struct scsi_read_capacity_data_long *) csio->data_ptr; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { struct disk_params *dp; uint32_t block_size; uint64_t maxsector; u_int lbppbe; /* LB per physical block exponent. */ u_int lalba; /* Lowest aligned LBA. */ if (state == DA_CCB_PROBE) { block_size = scsi_4btoul(rdcap->length); maxsector = scsi_4btoul(rdcap->addr); lbppbe = 0; lalba = 0; /* * According to SBC-2, if the standard 10 * byte READ CAPACITY command returns 2^32, * we should issue the 16 byte version of * the command, since the device in question * has more sectors than can be represented * with the short version of the command. */ if (maxsector == 0xffffffff) { softc->state = DA_STATE_PROBE2; free(rdcap, M_SCSIDA); xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; } } else { block_size = scsi_4btoul(rcaplong->length); maxsector = scsi_8btou64(rcaplong->addr); lbppbe = rcaplong->prot_lbppbe & SRC16_LBPPBE; lalba = scsi_2btoul(rcaplong->lalba_lbp); } /* * Because GEOM code just will panic us if we * give them an 'illegal' value we'll avoid that * here. */ if (block_size == 0 && maxsector == 0) { snprintf(announce_buf, sizeof(announce_buf), "0MB (no media?)"); } else if (block_size >= MAXPHYS || block_size == 0) { xpt_print(periph->path, "unsupportable block size %ju\n", (uintmax_t) block_size); announce_buf[0] = '\0'; cam_periph_invalidate(periph); } else { /* * We pass rcaplong into dasetgeom(), * because it will only use it if it is * non-NULL. */ dasetgeom(periph, block_size, maxsector, rcaplong, sizeof(*rcaplong)); if ((lalba & SRC16_LBPME_A) && softc->delete_method == DA_DELETE_NONE) dadeletemethodset(softc, DA_DELETE_UNMAP); dp = &softc->params; snprintf(announce_buf, sizeof(announce_buf), "%juMB (%ju %u byte sectors: %dH %dS/T " "%dC)", (uintmax_t) (((uintmax_t)dp->secsize * dp->sectors) / (1024*1024)), (uintmax_t)dp->sectors, dp->secsize, dp->heads, dp->secs_per_track, dp->cylinders); } } else { int error; announce_buf[0] = '\0'; /* * Retry any UNIT ATTENTION type errors. They * are expected at boot. */ error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) { /* * A retry was scheuled, so * just return. */ return; } else if (error != 0) { int asc, ascq; int sense_key, error_code; int have_sense; cam_status status; struct ccb_getdev cgd; /* Don't wedge this device's queue */ status = done_ccb->ccb_h.status; if ((status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); xpt_setup_ccb(&cgd.ccb_h, done_ccb->ccb_h.path, CAM_PRIORITY_NORMAL); cgd.ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)&cgd); if (scsi_extract_sense_ccb(done_ccb, &error_code, &sense_key, &asc, &ascq)) have_sense = TRUE; else have_sense = FALSE; /* * If we tried READ CAPACITY(16) and failed, * fallback to READ CAPACITY(10). */ if ((state == DA_CCB_PROBE2) && (softc->flags & DA_FLAG_CAN_RC16) && (((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) || ((have_sense) && (error_code == SSD_CURRENT_ERROR) && (sense_key == SSD_KEY_ILLEGAL_REQUEST)))) { softc->flags &= ~DA_FLAG_CAN_RC16; softc->state = DA_STATE_PROBE; free(rdcap, M_SCSIDA); xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; } else /* * Attach to anything that claims to be a * direct access or optical disk device, * as long as it doesn't return a "Logical * unit not supported" (0x25) error. */ if ((have_sense) && (asc != 0x25) && (error_code == SSD_CURRENT_ERROR)) { const char *sense_key_desc; const char *asc_desc; scsi_sense_desc(sense_key, asc, ascq, &cgd.inq_data, &sense_key_desc, &asc_desc); snprintf(announce_buf, sizeof(announce_buf), "Attempt to query device " "size failed: %s, %s", sense_key_desc, asc_desc); } else { if (have_sense) scsi_sense_print( &done_ccb->csio); else { xpt_print(periph->path, "got CAM status %#x\n", done_ccb->ccb_h.status); } xpt_print(periph->path, "fatal error, " "failed to attach to device\n"); /* * Free up resources. */ cam_periph_invalidate(periph); } } } free(csio->data_ptr, M_SCSIDA); if (announce_buf[0] != '\0' && ((softc->flags & DA_FLAG_PROBED) == 0)) { /* * Create our sysctl variables, now that we know * we have successfully attached. */ /* increase the refcount */ if (cam_periph_acquire(periph) == CAM_REQ_CMP) { taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task); xpt_announce_periph(periph, announce_buf); } else { xpt_print(periph->path, "fatal error, " "could not acquire reference count\n"); } } /* * Since our peripheral may be invalidated by an error * above or an external event, we must release our CCB * before releasing the probe lock on the peripheral. * The peripheral will only go away once the last lock * is removed, and we need it around for the CCB release * operation. */ xpt_release_ccb(done_ccb); softc->state = DA_STATE_NORMAL; daschedule(periph); wakeup(&softc->disk->d_mediasize); if ((softc->flags & DA_FLAG_PROBED) == 0) { softc->flags |= DA_FLAG_PROBED; cam_periph_unhold(periph); } else cam_periph_release_locked(periph); return; } case DA_CCB_WAITING: { /* Caller will release the CCB */ wakeup(&done_ccb->ccb_h.cbfcnp); return; } case DA_CCB_DUMP: /* No-op. We're polling */ return; case DA_CCB_TUR: { if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA | SF_NO_RECOVERY | SF_NO_PRINT) == ERESTART) return; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } xpt_release_ccb(done_ccb); cam_periph_release_locked(periph); return; } default: break; } xpt_release_ccb(done_ccb); } static void dareprobe(struct cam_periph *periph) { struct da_softc *softc; cam_status status; softc = (struct da_softc *)periph->softc; /* Probe in progress; don't interfere. */ if ((softc->flags & DA_FLAG_PROBED) == 0) return; status = cam_periph_acquire(periph); KASSERT(status == CAM_REQ_CMP, ("dareprobe: cam_periph_acquire failed")); if (softc->flags & DA_FLAG_CAN_RC16) softc->state = DA_STATE_PROBE2; else softc->state = DA_STATE_PROBE; xpt_schedule(periph, CAM_PRIORITY_DEV); } static int daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags) { struct da_softc *softc; struct cam_periph *periph; int error, error_code, sense_key, asc, ascq; periph = xpt_path_periph(ccb->ccb_h.path); softc = (struct da_softc *)periph->softc; /* * Automatically detect devices that do not support * READ(6)/WRITE(6) and upgrade to using 10 byte cdbs. */ error = 0; if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) { error = cmd6workaround(ccb); } else if (scsi_extract_sense_ccb(ccb, &error_code, &sense_key, &asc, &ascq)) { if (sense_key == SSD_KEY_ILLEGAL_REQUEST) error = cmd6workaround(ccb); /* * If the target replied with CAPACITY DATA HAS CHANGED UA, * query the capacity and notify upper layers. */ else if (sense_key == SSD_KEY_UNIT_ATTENTION && asc == 0x2A && ascq == 0x09) { xpt_print(periph->path, "capacity data has changed\n"); dareprobe(periph); sense_flags |= SF_NO_PRINT; } else if (sense_key == SSD_KEY_UNIT_ATTENTION && asc == 0x28 && ascq == 0x00) disk_media_changed(softc->disk, M_NOWAIT); else if (sense_key == SSD_KEY_NOT_READY && asc == 0x3a && (softc->flags & DA_FLAG_SAW_MEDIA)) { softc->flags &= ~DA_FLAG_SAW_MEDIA; disk_media_gone(softc->disk, M_NOWAIT); } } if (error == ERESTART) return (ERESTART); /* * XXX * Until we have a better way of doing pack validation, * don't treat UAs as errors. */ sense_flags |= SF_RETRY_UA; return(cam_periph_error(ccb, cam_flags, sense_flags, &softc->saved_ccb)); } static void damediapoll(void *arg) { struct cam_periph *periph = arg; struct da_softc *softc = periph->softc; if (!softc->tur && softc->outstanding_cmds == 0) { if (cam_periph_acquire(periph) == CAM_REQ_CMP) { softc->tur = 1; daschedule(periph); } } /* Queue us up again */ if (da_poll_period != 0) callout_schedule(&softc->mediapoll_c, da_poll_period * hz); } static void daprevent(struct cam_periph *periph, int action) { struct da_softc *softc; union ccb *ccb; int error; softc = (struct da_softc *)periph->softc; if (((action == PR_ALLOW) && (softc->flags & DA_FLAG_PACK_LOCKED) == 0) || ((action == PR_PREVENT) && (softc->flags & DA_FLAG_PACK_LOCKED) != 0)) { return; } ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_prevent(&ccb->csio, /*retries*/1, /*cbcfp*/dadone, MSG_SIMPLE_Q_TAG, action, SSD_FULL_SIZE, 5000); error = cam_periph_runccb(ccb, daerror, CAM_RETRY_SELTO, SF_RETRY_UA | SF_QUIET_IR, softc->disk->d_devstat); if (error == 0) { if (action == PR_ALLOW) softc->flags &= ~DA_FLAG_PACK_LOCKED; else softc->flags |= DA_FLAG_PACK_LOCKED; } xpt_release_ccb(ccb); } static void dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector, struct scsi_read_capacity_data_long *rcaplong, size_t rcap_len) { struct ccb_calc_geometry ccg; struct da_softc *softc; struct disk_params *dp; u_int lbppbe, lalba; int error; softc = (struct da_softc *)periph->softc; dp = &softc->params; dp->secsize = block_len; dp->sectors = maxsector + 1; if (rcaplong != NULL) { lbppbe = rcaplong->prot_lbppbe & SRC16_LBPPBE; lalba = scsi_2btoul(rcaplong->lalba_lbp); lalba &= SRC16_LALBA_A; } else { lbppbe = 0; lalba = 0; } if (lbppbe > 0) { dp->stripesize = block_len << lbppbe; dp->stripeoffset = (dp->stripesize - block_len * lalba) % dp->stripesize; } else if (softc->quirks & DA_Q_4K) { dp->stripesize = 4096; dp->stripeoffset = 0; } else { dp->stripesize = 0; dp->stripeoffset = 0; } /* * Have the controller provide us with a geometry * for this disk. The only time the geometry * matters is when we boot and the controller * is the only one knowledgeable enough to come * up with something that will make this a bootable * device. */ xpt_setup_ccb(&ccg.ccb_h, periph->path, CAM_PRIORITY_NORMAL); ccg.ccb_h.func_code = XPT_CALC_GEOMETRY; ccg.block_size = dp->secsize; ccg.volume_size = dp->sectors; ccg.heads = 0; ccg.secs_per_track = 0; ccg.cylinders = 0; xpt_action((union ccb*)&ccg); if ((ccg.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { /* * We don't know what went wrong here- but just pick * a geometry so we don't have nasty things like divide * by zero. */ dp->heads = 255; dp->secs_per_track = 255; dp->cylinders = dp->sectors / (255 * 255); if (dp->cylinders == 0) { dp->cylinders = 1; } } else { dp->heads = ccg.heads; dp->secs_per_track = ccg.secs_per_track; dp->cylinders = ccg.cylinders; } /* * If the user supplied a read capacity buffer, and if it is * different than the previous buffer, update the data in the EDT. * If it's the same, we don't bother. This avoids sending an * update every time someone opens this device. */ if ((rcaplong != NULL) && (bcmp(rcaplong, &softc->rcaplong, min(sizeof(softc->rcaplong), rcap_len)) != 0)) { struct ccb_dev_advinfo cdai; xpt_setup_ccb(&cdai.ccb_h, periph->path, CAM_PRIORITY_NORMAL); cdai.ccb_h.func_code = XPT_DEV_ADVINFO; cdai.buftype = CDAI_TYPE_RCAPLONG; cdai.flags |= CDAI_FLAG_STORE; cdai.bufsiz = rcap_len; cdai.buf = (uint8_t *)rcaplong; xpt_action((union ccb *)&cdai); if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE); if (cdai.ccb_h.status != CAM_REQ_CMP) { xpt_print(periph->path, "%s: failed to set read " "capacity advinfo\n", __func__); /* Use cam_error_print() to decode the status */ cam_error_print((union ccb *)&cdai, CAM_ESF_CAM_STATUS, CAM_EPF_ALL); } else { bcopy(rcaplong, &softc->rcaplong, min(sizeof(softc->rcaplong), rcap_len)); } } softc->disk->d_sectorsize = softc->params.secsize; softc->disk->d_mediasize = softc->params.secsize * (off_t)softc->params.sectors; softc->disk->d_stripesize = softc->params.stripesize; softc->disk->d_stripeoffset = softc->params.stripeoffset; /* XXX: these are not actually "firmware" values, so they may be wrong */ softc->disk->d_fwsectors = softc->params.secs_per_track; softc->disk->d_fwheads = softc->params.heads; softc->disk->d_devstat->block_size = softc->params.secsize; softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE; if (softc->delete_method > DA_DELETE_DISABLE) softc->disk->d_flags |= DISKFLAG_CANDELETE; else softc->disk->d_flags &= ~DISKFLAG_CANDELETE; error = disk_resize(softc->disk, M_NOWAIT); if (error != 0) xpt_print(periph->path, "disk_resize(9) failed, error = %d\n", error); } static void dasendorderedtag(void *arg) { struct da_softc *softc = arg; if (da_send_ordered) { if ((softc->ordered_tag_count == 0) && ((softc->flags & DA_FLAG_WENT_IDLE) == 0)) { softc->flags |= DA_FLAG_NEED_OTAG; } if (softc->outstanding_cmds > 0) softc->flags &= ~DA_FLAG_WENT_IDLE; softc->ordered_tag_count = 0; } /* Queue us up again */ callout_reset(&softc->sendordered_c, (da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL, dasendorderedtag, softc); } /* * Step through all DA peripheral drivers, and if the device is still open, * sync the disk cache to physical media. */ static void dashutdown(void * arg, int howto) { struct cam_periph *periph; struct da_softc *softc; int error; TAILQ_FOREACH(periph, &dadriver.units, unit_links) { union ccb ccb; cam_periph_lock(periph); softc = (struct da_softc *)periph->softc; /* * We only sync the cache if the drive is still open, and * if the drive is capable of it.. */ if (((softc->flags & DA_FLAG_OPEN) == 0) || (softc->quirks & DA_Q_NO_SYNC_CACHE)) { cam_periph_unlock(periph); continue; } xpt_setup_ccb(&ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL); ccb.ccb_h.ccb_state = DA_CCB_DUMP; scsi_synchronize_cache(&ccb.csio, /*retries*/0, /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG, /*begin_lba*/0, /* whole disk */ /*lb_count*/0, SSD_FULL_SIZE, 60 * 60 * 1000); xpt_polled_action(&ccb); error = cam_periph_error(&ccb, 0, SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR, NULL); if ((ccb.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(ccb.ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); cam_periph_unlock(periph); } } #else /* !_KERNEL */ /* * XXX This is only left out of the kernel build to silence warnings. If, * for some reason this function is used in the kernel, the ifdefs should * be moved so it is included both in the kernel and userland. */ void scsi_format_unit(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, u_int16_t ileave, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_format_unit *scsi_cmd; scsi_cmd = (struct scsi_format_unit *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = FORMAT_UNIT; scsi_cmd->byte2 = byte2; scsi_ulto2b(ileave, scsi_cmd->interleave); cam_fill_csio(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } #endif /* _KERNEL */ Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 247192) @@ -1,5156 +1,5160 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefor exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (rangeing from 512 bytes to * 128K bytes). We therefor choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for * adjusting the cache use method 2. We therefor provide two * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * * Buffers do not have their own mutexs, rather they rely on the * hash table mutexs for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexs). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each arc state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an arc list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * Arc buffers may have an associated eviction callback function. * This function will be invoked prior to removing the buffer (e.g. * in arc_do_user_evicts()). Note however that the data associated * with the buffer may be evicted prior to the callback. The callback * must be made with *no locks held* (to prevent deadlock). Additionally, * the users of callbacks must ensure that their private data is * protected from simultaneous callbacks from arc_buf_evict() * and arc_do_user_evicts(). * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif #include #include #include #include #include #ifdef illumos #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; int arc_procfd; #endif #endif /* illumos */ static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ static uint8_t arc_thread_exit; extern int zfs_write_limit_shift; extern uint64_t zfs_write_limit_max; extern kmutex_t zfs_write_limit_lock; #define ARC_REDUCE_DNLC_PERCENT 3 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; typedef enum arc_reclaim_strategy { ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ ARC_RECLAIM_CONS /* Conservative reclaim strategy */ } arc_reclaim_strategy_t; /* number of seconds before growing cache again */ static int arc_grow_retry = 60; /* shift of arc_c for calculating both min and max arc_p */ static int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ static int arc_shrink_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static int arc_min_prefetch_lifespan; static int arc_dead; extern int zfs_prefetch_disable; /* * The arc has filled available memory and has now warmed up. */ static boolean_t arc_warm; /* * These tunables are for performance analysis. */ uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_disable_dup_eviction = 0; TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); SYSCTL_DECL(_vfs_zfs); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, "Minimum ARC size"); /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they are * are linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will aquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second * level ARC but no longer in any of the ARC_m* lists. The second * level ARC itself may also contain buffers that are in any of * the ARC_m* states - meaning that a buffer can exist in two * places. The reason for the ARC_l2c_only state is to keep the * buffer header in the hash table, so that reads that hit the * second level ARC benefit from these fast lookups. */ #define ARCS_LOCK_PAD CACHE_LINE_SIZE struct arcs_lock { kmutex_t arcs_lock; #ifdef _KERNEL unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; #endif }; /* * must be power of two for mask use to work * */ #define ARC_BUFC_NUMDATALISTS 16 #define ARC_BUFC_NUMMETADATALISTS 16 #define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS) typedef struct arc_state { uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ uint64_t arcs_size; /* total amount of data in this state */ list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */ struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE); } arc_state_t; #define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock)) /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; kstat_named_t arcstat_misses; kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_misses; kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_misses; kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_misses; kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_allocated; kstat_named_t arcstat_deleted; kstat_named_t arcstat_stolen; kstat_named_t arcstat_recycle_miss; kstat_named_t arcstat_mutex_miss; kstat_named_t arcstat_evict_skip; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_p; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; kstat_named_t arcstat_hdr_size; kstat_named_t arcstat_data_size; kstat_named_t arcstat_other_size; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_hdr_miss; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_l2_write_trylock_fail; kstat_named_t arcstat_l2_write_passed_headroom; kstat_named_t arcstat_l2_write_spa_mismatch; kstat_named_t arcstat_l2_write_in_l2; kstat_named_t arcstat_l2_write_hdr_io_in_progress; kstat_named_t arcstat_l2_write_not_cacheable; kstat_named_t arcstat_l2_write_full; kstat_named_t arcstat_l2_write_buffer_iter; kstat_named_t arcstat_l2_write_pios; kstat_named_t arcstat_l2_write_buffer_bytes_scanned; kstat_named_t arcstat_l2_write_buffer_list_iter; kstat_named_t arcstat_l2_write_buffer_list_null_iter; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_duplicate_buffers; kstat_named_t arcstat_duplicate_buffers_size; kstat_named_t arcstat_duplicate_reads; } arc_stats_t; static arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "allocated", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "stolen", KSTAT_DATA_UINT64 }, { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "other_size", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, { "l2_write_in_l2", KSTAT_DATA_UINT64 }, { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, { "l2_write_full", KSTAT_DATA_UINT64 }, { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, { "l2_write_pios", KSTAT_DATA_UINT64 }, { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "duplicate_buffers", KSTAT_DATA_UINT64 }, { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, { "duplicate_reads", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)); #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } #define ARCSTAT_MAXSTAT(stat) \ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; static uint64_t arc_meta_used; static uint64_t arc_meta_limit; static uint64_t arc_meta_max = 0; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0, "ARC metadata used"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0, "ARC metadata limit"); typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_done_func_t *acb_done; arc_buf_t *acb_buf; zio_t *acb_zio_dummy; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_done_func_t *awcb_ready; arc_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; uint64_t b_cksum0; kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; void *b_thawed; arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; uint32_t b_flags; uint32_t b_datacnt; arc_callback_t *b_acb; kcondvar_t b_cv; /* immutable */ arc_buf_contents_t b_type; uint64_t b_size; uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; list_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; /* self protecting */ refcount_t b_refcnt; l2arc_buf_hdr_t *b_l2hdr; list_node_t b_l2node; }; static arc_buf_t *arc_eviction_list; static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); #ifdef illumos static void arc_buf_watch(arc_buf_t *buf); #endif /* illumos */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) /* * Private ARC flags. These flags are private ARC only flags that will show up * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can * be passed in as arc_flags in things like arc_read. However, these flags * should never be passed and should only be set by ARC code. When adding new * public flags, make sure not to smash the private ones. */ #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ (hdr)->b_l2hdr != NULL) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) /* * Other sizes */ #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) /* * Hash table routines */ #define HT_LOCK_PAD CACHE_LINE_SIZE struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; #endif }; #define BUF_LOCKS 256 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) /* * L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, &l2arc_write_max, 0, "max write size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, &l2arc_write_boost, 0, "extra write during warmup"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, &l2arc_headroom, 0, "number of dev writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, &l2arc_feed_secs, 0, "interval seconds"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, &l2arc_feed_min_ms, 0, "min interval milliseconds"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, &l2arc_noprefetch, 0, "don't cache prefetch bufs"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, &l2arc_feed_again, 0, "turbo warmup"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, &l2arc_norw, 0, "no reads during writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, &ARC_anon.arcs_size, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, &ARC_mru.arcs_size, 0, "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, &ARC_mfu.arcs_size, 0, "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, &ARC_l2c_only.arcs_size, 0, "size of mru state"); /* * L2ARC Internals */ typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_write; /* desired write size, bytes */ uint64_t l2ad_boost; /* warmup write boost, bytes */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ list_t *l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_t *l2rcb_buf; /* read buffer */ spa_t *l2rcb_spa; /* spa */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ } l2arc_read_callback_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ }; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; void (*l2df_func)(void *, size_t); list_node_t l2df_list_node; } l2arc_data_free_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static void l2arc_read_done(zio_t *zio); static void l2arc_hdr_stat_add(void); static void l2arc_hdr_stat_remove(void); static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; crc ^= (spa>>8) ^ birth; return (crc); } #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ (buf)->b_dva.dva_word[1] == 0 && \ (buf)->b_birth == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; hdr->b_cksum0 = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *buf; mutex_enter(hash_lock); for (buf = buf_hash_table.ht_table[idx]; buf != NULL; buf = buf->b_hash_next) { if (BUF_EQUAL(spa, dva, birth, buf)) { *lockp = hash_lock; return (buf); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fbuf; uint32_t i; ASSERT(!HDR_IN_HASH_TABLE(buf)); *lockp = hash_lock; mutex_enter(hash_lock); for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; fbuf = fbuf->b_hash_next, i++) { if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) return (fbuf); } buf->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = buf; buf->b_flags |= ARC_IN_HASH_TABLE; /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } ARCSTAT_BUMP(arcstat_hash_elements); ARCSTAT_MAXSTAT(arcstat_hash_elements); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *buf) { arc_buf_hdr_t *fbuf, **bufp; uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(buf)); bufp = &buf_hash_table.ht_table[idx]; while ((fbuf = *bufp) != buf) { ASSERT(fbuf != NULL); bufp = &fbuf->b_hash_next; } *bufp = buf->b_hash_next; buf->b_hash_next = NULL; buf->b_flags &= ~ARC_IN_HASH_TABLE; /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { int i; kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ /* ARGSUSED */ static int hdr_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *buf = vbuf; bzero(buf, sizeof (arc_buf_hdr_t)); refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ /* ARGSUSED */ static void hdr_dest(void *vbuf, void *unused) { arc_buf_hdr_t *buf = vbuf; ASSERT(BUF_EMPTY(buf)); refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); mutex_destroy(&buf->b_freeze_lock); arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); } /* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } /* * Reclaim callback -- invoked when memory is low. */ /* ARGSUSED */ static void hdr_recl(void *unused) { dprintf("hdr_recl called\n"); /* * umem calls the reclaim func when we destroy the buf cache, * which is after we do arc_fini(). */ if (!arc_dead) cv_signal(&arc_reclaim_thr_cv); } static void buf_init(void) { uint64_t *ct; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average 64K block size. The table will take up * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). */ while (hsize * 65536 < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) { mutex_init(&buf_hash_table.ht_locks[i].ht_lock, NULL, MUTEX_DEFAULT, NULL); } } #define ARC_MINTIME (hz>>4) /* 62 ms */ static void arc_cksum_verify(arc_buf_t *buf) { zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum == NULL || (buf->b_hdr->b_flags & ARC_IO_ERROR)) { mutex_exit(&buf->b_hdr->b_freeze_lock); return; } fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&buf->b_hdr->b_freeze_lock); } static int arc_cksum_equal(arc_buf_t *buf) { zio_cksum_t zc; int equal; mutex_enter(&buf->b_hdr->b_freeze_lock); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); mutex_exit(&buf->b_hdr->b_freeze_lock); return (equal); } static void arc_cksum_compute(arc_buf_t *buf, boolean_t force) { if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { mutex_exit(&buf->b_hdr->b_freeze_lock); return; } buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); mutex_exit(&buf->b_hdr->b_freeze_lock); #ifdef illumos arc_buf_watch(buf); #endif /* illumos */ } #ifdef illumos #ifndef _KERNEL typedef struct procctl { long cmd; prwatch_t prwatch; } procctl_t; #endif /* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { int result; procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; ctl.prwatch.pr_size = 0; ctl.prwatch.pr_wflags = 0; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); } #endif } /* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { int result; procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; ctl.prwatch.pr_size = buf->b_hdr->b_size; ctl.prwatch.pr_wflags = WA_WRITE; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); } #endif } #endif /* illumos */ void arc_buf_thaw(arc_buf_t *buf) { if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_state != arc_anon) panic("modifying non-anon buffer!"); if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_thawed) kmem_free(buf->b_hdr->b_thawed, 1); buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); } mutex_exit(&buf->b_hdr->b_freeze_lock); #ifdef illumos arc_buf_unwatch(buf); #endif /* illumos */ } void arc_buf_freeze(arc_buf_t *buf) { kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); } static void get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock) { uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth); if (ab->b_type == ARC_BUFC_METADATA) buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); else { buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); buf_hashid += ARC_BUFC_NUMMETADATALISTS; } *list = &state->arcs_lists[buf_hashid]; *lock = ARCS_LOCK(state, buf_hashid); } static void add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) { ASSERT(MUTEX_HELD(hash_lock)); if ((refcount_add(&ab->b_refcnt, tag) == 1) && (ab->b_state != arc_anon)) { uint64_t delta = ab->b_size * ab->b_datacnt; uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; list_t *list; kmutex_t *lock; get_buf_info(ab, ab->b_state, &list, &lock); ASSERT(!MUTEX_HELD(lock)); mutex_enter(lock); ASSERT(list_link_active(&ab->b_arc_node)); list_remove(list, ab); if (GHOST_STATE(ab->b_state)) { ASSERT0(ab->b_datacnt); ASSERT3P(ab->b_buf, ==, NULL); delta = ab->b_size; } ASSERT(delta > 0); ASSERT3U(*size, >=, delta); atomic_add_64(size, -delta); mutex_exit(lock); /* remove the prefetch flag if we get a reference */ if (ab->b_flags & ARC_PREFETCH) ab->b_flags &= ~ARC_PREFETCH; } } static int remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) { int cnt; arc_state_t *state = ab->b_state; ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && (state != arc_anon)) { uint64_t *size = &state->arcs_lsize[ab->b_type]; list_t *list; kmutex_t *lock; get_buf_info(ab, state, &list, &lock); ASSERT(!MUTEX_HELD(lock)); mutex_enter(lock); ASSERT(!list_link_active(&ab->b_arc_node)); list_insert_head(list, ab); ASSERT(ab->b_datacnt > 0); atomic_add_64(size, ab->b_size * ab->b_datacnt); mutex_exit(lock); } return (cnt); } /* * Move the supplied buffer to the indicated state. The mutex * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) { arc_state_t *old_state = ab->b_state; int64_t refcnt = refcount_count(&ab->b_refcnt); uint64_t from_delta, to_delta; list_t *list; kmutex_t *lock; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(new_state != old_state); ASSERT(refcnt == 0 || ab->b_datacnt > 0); ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); from_delta = to_delta = ab->b_datacnt * ab->b_size; /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon) { int use_mutex; uint64_t *size = &old_state->arcs_lsize[ab->b_type]; get_buf_info(ab, old_state, &list, &lock); use_mutex = !MUTEX_HELD(lock); if (use_mutex) mutex_enter(lock); ASSERT(list_link_active(&ab->b_arc_node)); list_remove(list, ab); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. */ if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { /* ghost elements have a ghost size */ ASSERT(ab->b_buf == NULL); from_delta = ab->b_size; } ASSERT3U(*size, >=, from_delta); atomic_add_64(size, -from_delta); if (use_mutex) mutex_exit(lock); } if (new_state != arc_anon) { int use_mutex; uint64_t *size = &new_state->arcs_lsize[ab->b_type]; get_buf_info(ab, new_state, &list, &lock); use_mutex = !MUTEX_HELD(lock); if (use_mutex) mutex_enter(lock); list_insert_head(list, ab); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { ASSERT(ab->b_datacnt == 0); ASSERT(ab->b_buf == NULL); to_delta = ab->b_size; } atomic_add_64(size, to_delta); if (use_mutex) mutex_exit(lock); } } ASSERT(!BUF_EMPTY(ab)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) buf_hash_remove(ab); /* adjust state sizes */ if (to_delta) atomic_add_64(&new_state->arcs_size, to_delta); if (from_delta) { ASSERT3U(old_state->arcs_size, >=, from_delta); atomic_add_64(&old_state->arcs_size, -from_delta); } ab->b_state = new_state; /* adjust l2arc hdr stats */ if (new_state == arc_l2c_only) l2arc_hdr_stat_add(); else if (old_state == arc_l2c_only) l2arc_hdr_stat_remove(); } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_OTHER: ARCSTAT_INCR(arcstat_other_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: ARCSTAT_INCR(arcstat_l2_hdr_size, space); break; } atomic_add_64(&arc_meta_used, space); atomic_add_64(&arc_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_OTHER: ARCSTAT_INCR(arcstat_other_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: ARCSTAT_INCR(arcstat_l2_hdr_size, -space); break; } ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = arc_meta_used; atomic_add_64(&arc_meta_used, -space); ASSERT(arc_size >= space); atomic_add_64(&arc_size, -space); } void * arc_data_buf_alloc(uint64_t size) { if (arc_evict_needed(ARC_BUFC_DATA)) cv_signal(&arc_reclaim_thr_cv); atomic_add_64(&arc_size, size); return (zio_data_buf_alloc(size)); } void arc_data_buf_free(void *buf, uint64_t size) { zio_data_buf_free(buf, size); ASSERT(arc_size >= size); atomic_add_64(&arc_size, -size); } arc_buf_t * arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; arc_buf_t *buf; ASSERT3U(size, >, 0); hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; hdr->b_spa = spa_load_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; hdr->b_buf = buf; arc_get_data_buf(buf); hdr->b_datacnt = 1; hdr->b_flags = 0; ASSERT(refcount_is_zero(&hdr->b_refcnt)); (void) refcount_add(&hdr->b_refcnt, tag); return (buf); } static char *arc_onloan_tag = "onloan"; /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, int size) { arc_buf_t *buf; buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); (void) refcount_add(&hdr->b_refcnt, tag); (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr; ASSERT(buf->b_data != NULL); hdr = buf->b_hdr; (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_refcnt, tag); buf->b_efunc = NULL; buf->b_private = NULL; atomic_add_64(&arc_loaned_bytes, hdr->b_size); } static arc_buf_t * arc_buf_clone(arc_buf_t *from) { arc_buf_t *buf; arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; ASSERT(hdr->b_state != arc_anon); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = hdr->b_buf; hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); /* * This buffer already exists in the arc so create a duplicate * copy for the caller. If the buffer is associated with user data * then track the size and number of duplicates. These stats will be * updated as duplicate buffers are created and destroyed. */ if (hdr->b_type == ARC_BUFC_DATA) { ARCSTAT_BUMP(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); } hdr->b_datacnt += 1; return (buf); } void arc_buf_add_ref(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; /* * Check to see if this buffer is evicted. Callers * must verify b_data != NULL to know if the add_ref * was successful. */ mutex_enter(&buf->b_evict_lock); if (buf->b_data == NULL) { mutex_exit(&buf->b_evict_lock); return; } hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits); } /* * Free the arc data buffer. If it is an l2arc write in progress, * the buffer is placed on l2arc_free_on_write to be freed later. */ static void arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) { arc_buf_hdr_t *hdr = buf->b_hdr; if (HDR_L2_WRITING(hdr)) { l2arc_data_free_t *df; df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); df->l2df_data = buf->b_data; df->l2df_size = hdr->b_size; df->l2df_func = free_func; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { free_func(buf->b_data, hdr->b_size); } } static void arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) { arc_buf_t **bufp; /* free up data associated with the buf */ if (buf->b_data) { arc_state_t *state = buf->b_hdr->b_state; uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = buf->b_hdr->b_type; arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif /* illumos */ if (!recycle) { if (type == ARC_BUFC_METADATA) { arc_buf_data_free(buf, zio_buf_free); arc_space_return(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); arc_buf_data_free(buf, zio_data_buf_free); ARCSTAT_INCR(arcstat_data_size, -size); atomic_add_64(&arc_size, -size); } } if (list_link_active(&buf->b_hdr->b_arc_node)) { uint64_t *cnt = &state->arcs_lsize[type]; ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); ASSERT(state != arc_anon); ASSERT3U(*cnt, >=, size); atomic_add_64(cnt, -size); } ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; /* * If we're destroying a duplicate buffer make sure * that the appropriate statistics are updated. */ if (buf->b_hdr->b_datacnt > 1 && buf->b_hdr->b_type == ARC_BUFC_DATA) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } /* only remove the buf if requested */ if (!all) return; /* remove the buf from the hdr list */ for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; buf->b_next = NULL; ASSERT(buf->b_efunc == NULL); /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; if (l2hdr != NULL) { boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); /* * To prevent arc_free() and l2arc_evict() from * attempting to free the same buffer at the same time, * a FREE_IN_PROGRESS flag is given to arc_free() to * give it priority. l2arc_evict() can't destroy this * header while we are waiting on l2arc_buflist_mtx. * * The hdr may be removed from l2ad_buflist before we * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. */ if (!buflist_held) { mutex_enter(&l2arc_buflist_mtx); l2hdr = hdr->b_l2hdr; } if (l2hdr != NULL) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); if (hdr->b_state == arc_l2c_only) l2arc_hdr_stat_remove(); hdr->b_l2hdr = NULL; } if (!buflist_held) mutex_exit(&l2arc_buflist_mtx); } if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); buf_discard_identity(hdr); } while (hdr->b_buf) { arc_buf_t *buf = hdr->b_buf; if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); mutex_enter(&buf->b_evict_lock); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_buf, FALSE, FALSE); hdr->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&buf->b_evict_lock); mutex_exit(&arc_eviction_mtx); } else { arc_buf_destroy(hdr->b_buf, FALSE, TRUE); } } if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } if (hdr->b_thawed) { kmem_free(hdr->b_thawed, 1); hdr->b_thawed = NULL; } ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); ASSERT3P(hdr->b_acb, ==, NULL); kmem_cache_free(hdr_cache, hdr); } void arc_buf_free(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; int hashed = hdr->b_state != arc_anon; ASSERT(buf->b_efunc == NULL); ASSERT(buf->b_data != NULL); if (hashed) { kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); } else { ASSERT(buf == hdr->b_buf); ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_BUF_AVAILABLE; } mutex_exit(hash_lock); } else if (HDR_IO_IN_PROGRESS(hdr)) { int destroy_hdr; /* * We are in the middle of an async write. Don't destroy * this buffer unless the write completes before we finish * decrementing the reference count. */ mutex_enter(&arc_eviction_mtx); (void) remove_reference(hdr, NULL, tag); ASSERT(refcount_is_zero(&hdr->b_refcnt)); destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); mutex_exit(&arc_eviction_mtx); if (destroy_hdr) arc_hdr_destroy(hdr); } else { if (remove_reference(hdr, NULL, tag) > 0) arc_buf_destroy(buf, FALSE, TRUE); else arc_hdr_destroy(hdr); } } int arc_buf_remove_ref(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); int no_callback = (buf->b_efunc == NULL); if (hdr->b_state == arc_anon) { ASSERT(hdr->b_datacnt == 1); arc_buf_free(buf, tag); return (no_callback); } mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT(hdr->b_state != arc_anon); ASSERT(buf->b_data != NULL); (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_datacnt > 1) { if (no_callback) arc_buf_destroy(buf, FALSE, TRUE); } else if (no_callback) { ASSERT(hdr->b_buf == buf && buf->b_next == NULL); ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_BUF_AVAILABLE; } ASSERT(no_callback || hdr->b_datacnt > 1 || refcount_is_zero(&hdr->b_refcnt)); mutex_exit(hash_lock); return (no_callback); } int arc_buf_size(arc_buf_t *buf) { return (buf->b_hdr->b_size); } /* * Called from the DMU to determine if the current buffer should be * evicted. In order to ensure proper locking, the eviction must be initiated * from the DMU. Return true if the buffer is associated with user data and * duplicate buffers still exist. */ boolean_t arc_buf_eviction_needed(arc_buf_t *buf) { arc_buf_hdr_t *hdr; boolean_t evict_needed = B_FALSE; if (zfs_disable_dup_eviction) return (B_FALSE); mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(); let that function * perform the eviction. */ ASSERT(buf->b_data == NULL); mutex_exit(&buf->b_evict_lock); return (B_FALSE); } else if (buf->b_data == NULL) { /* * We have already been added to the arc eviction list; * recommend eviction. */ ASSERT3P(hdr, ==, &arc_eviction_hdr); mutex_exit(&buf->b_evict_lock); return (B_TRUE); } if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) evict_needed = B_TRUE; mutex_exit(&buf->b_evict_lock); return (evict_needed); } /* * Evict buffers from list until we've removed the specified number of * bytes. Move the removed buffers to the appropriate evict state. * If the recycle flag is set, then attempt to "recycle" a buffer: * - look for a buffer to evict that is `bytes' long. * - return the data block from this buffer rather than freeing it. * This flag is used by callers that are trying to make space for a * new buffer in a full arc cache. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so may not catch all candidates. * It may also return without evicting as much space as requested. */ static void * arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; int64_t bytes_remaining; arc_buf_hdr_t *ab, *ab_prev = NULL; list_t *evicted_list, *list, *evicted_list_start, *list_start; kmutex_t *lock, *evicted_lock; kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; static int evict_metadata_offset, evict_data_offset; int i, idx, offset, list_count, count; ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; if (type == ARC_BUFC_METADATA) { offset = 0; list_count = ARC_BUFC_NUMMETADATALISTS; list_start = &state->arcs_lists[0]; evicted_list_start = &evicted_state->arcs_lists[0]; idx = evict_metadata_offset; } else { offset = ARC_BUFC_NUMMETADATALISTS; list_start = &state->arcs_lists[offset]; evicted_list_start = &evicted_state->arcs_lists[offset]; list_count = ARC_BUFC_NUMDATALISTS; idx = evict_data_offset; } bytes_remaining = evicted_state->arcs_lsize[type]; count = 0; evict_start: list = &list_start[idx]; evicted_list = &evicted_list_start[idx]; lock = ARCS_LOCK(state, (offset + idx)); evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); mutex_enter(lock); mutex_enter(evicted_lock); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); bytes_remaining -= (ab->b_size * ab->b_datacnt); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(ab) || (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && ddi_get_lbolt() - ab->b_arc_access < arc_min_prefetch_lifespan)) { skipped++; continue; } /* "lookahead" for better eviction candidate */ if (recycle && ab->b_size != bytes && ab_prev && ab_prev->b_size == bytes) continue; hash_lock = HDR_LOCK(ab); have_lock = MUTEX_HELD(hash_lock); if (have_lock || mutex_tryenter(hash_lock)) { ASSERT0(refcount_count(&ab->b_refcnt)); ASSERT(ab->b_datacnt > 0); while (ab->b_buf) { arc_buf_t *buf = ab->b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { missed += 1; break; } if (buf->b_data) { bytes_evicted += ab->b_size; if (recycle && ab->b_type == type && ab->b_size == bytes && !HDR_L2_WRITING(ab)) { stolen = buf->b_data; recycle = FALSE; } } if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); arc_buf_destroy(buf, buf->b_data == stolen, FALSE); ab->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&arc_eviction_mtx); mutex_exit(&buf->b_evict_lock); } else { mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, buf->b_data == stolen, TRUE); } } if (ab->b_l2hdr) { ARCSTAT_INCR(arcstat_evict_l2_cached, ab->b_size); } else { if (l2arc_write_eligible(ab->b_spa, ab)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, ab->b_size); } else { ARCSTAT_INCR( arcstat_evict_l2_ineligible, ab->b_size); } } if (ab->b_datacnt == 0) { arc_change_state(evicted_state, ab, hash_lock); ASSERT(HDR_IN_HASH_TABLE(ab)); ab->b_flags |= ARC_IN_HASH_TABLE; ab->b_flags &= ~ARC_BUF_AVAILABLE; DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); } if (!have_lock) mutex_exit(hash_lock); if (bytes >= 0 && bytes_evicted >= bytes) break; if (bytes_remaining > 0) { mutex_exit(evicted_lock); mutex_exit(lock); idx = ((idx + 1) & (list_count - 1)); count++; goto evict_start; } } else { missed += 1; } } mutex_exit(evicted_lock); mutex_exit(lock); idx = ((idx + 1) & (list_count - 1)); count++; if (bytes_evicted < bytes) { if (count < list_count) goto evict_start; else dprintf("only evicted %lld bytes from %x", (longlong_t)bytes_evicted, state); } if (type == ARC_BUFC_METADATA) evict_metadata_offset = idx; else evict_data_offset = idx; if (skipped) ARCSTAT_INCR(arcstat_evict_skip, skipped); if (missed) ARCSTAT_INCR(arcstat_mutex_miss, missed); /* * We have just evicted some date into the ghost state, make * sure we also adjust the ghost state size if necessary. */ if (arc_no_grow && arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mru_ghost->arcs_lsize[type], mru_over); arc_evict_ghost(arc_mru_ghost, 0, todelete); } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c); arc_evict_ghost(arc_mfu_ghost, 0, todelete); } } if (stolen) ARCSTAT_BUMP(arcstat_stolen); return (stolen); } /* * Remove buffers from list until we've removed the specified number of * bytes. Destroy the buffers that are removed. */ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; arc_buf_hdr_t marker = { 0 }; list_t *list, *list_start; kmutex_t *hash_lock, *lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; static int evict_offset; int list_count, idx = evict_offset; int offset, count = 0; ASSERT(GHOST_STATE(state)); /* * data lists come after metadata lists */ list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; list_count = ARC_BUFC_NUMDATALISTS; offset = ARC_BUFC_NUMMETADATALISTS; evict_start: list = &list_start[idx]; lock = ARCS_LOCK(state, idx + offset); mutex_enter(lock); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); if (spa && ab->b_spa != spa) continue; /* ignore markers */ if (ab->b_spa == 0) continue; hash_lock = HDR_LOCK(ab); /* caller may be trying to modify this buffer, skip it */ if (MUTEX_HELD(hash_lock)) continue; if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); ARCSTAT_BUMP(arcstat_deleted); bytes_deleted += ab->b_size; if (ab->b_l2hdr != NULL) { /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, ab, hash_lock); mutex_exit(hash_lock); } else { arc_change_state(arc_anon, ab, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(ab); } DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; } else if (bytes < 0) { /* * Insert a list marker and then wait for the * hash lock to become available. Once its * available, restart from where we left off. */ list_insert_after(list, ab, &marker); mutex_exit(lock); mutex_enter(hash_lock); mutex_exit(hash_lock); mutex_enter(lock); ab_prev = list_prev(list, &marker); list_remove(list, &marker); } else bufs_skipped += 1; } mutex_exit(lock); idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); count++; if (count < list_count) goto evict_start; evict_offset = idx; if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && (bytes < 0 || bytes_deleted < bytes)) { list_start = &state->arcs_lists[0]; list_count = ARC_BUFC_NUMMETADATALISTS; offset = count = 0; goto evict_start; } if (bufs_skipped) { ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); ASSERT(bytes >= 0); } if (bytes_deleted < bytes) dprintf("only deleted %lld bytes from %p", (longlong_t)bytes_deleted, state); } static void arc_adjust(void) { int64_t adjustment, delta; /* * Adjust MRU size */ adjustment = MIN((int64_t)(arc_size - arc_c), (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p)); if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); } /* * Adjust MFU size */ adjustment = arc_size - arc_c; if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); adjustment -= delta; } if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { int64_t delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); } /* * Adjust ghost lists */ adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { delta = MIN(arc_mru_ghost->arcs_size, adjustment); arc_evict_ghost(arc_mru_ghost, 0, delta); } adjustment = arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { delta = MIN(arc_mfu_ghost->arcs_size, adjustment); arc_evict_ghost(arc_mfu_ghost, 0, delta); } } static void arc_do_user_evicts(void) { static arc_buf_t *tmp_arc_eviction_list; /* * Move list over to avoid LOR */ restart: mutex_enter(&arc_eviction_mtx); tmp_arc_eviction_list = arc_eviction_list; arc_eviction_list = NULL; mutex_exit(&arc_eviction_mtx); while (tmp_arc_eviction_list != NULL) { arc_buf_t *buf = tmp_arc_eviction_list; tmp_arc_eviction_list = buf->b_next; mutex_enter(&buf->b_evict_lock); buf->b_hdr = NULL; mutex_exit(&buf->b_evict_lock); if (buf->b_efunc != NULL) VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; buf->b_private = NULL; kmem_cache_free(buf_cache, buf); } if (arc_eviction_list != NULL) goto restart; } /* * Flush all *evictable* data from the cache for the given spa. * NOTE: this will not touch "active" (i.e. referenced) data. */ void arc_flush(spa_t *spa) { uint64_t guid = 0; if (spa) guid = spa_load_guid(spa); while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } arc_evict_ghost(arc_mru_ghost, guid, -1); arc_evict_ghost(arc_mfu_ghost, guid, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); mutex_exit(&arc_reclaim_thr_lock); ASSERT(spa || arc_eviction_list == NULL); } void arc_shrink(void) { if (arc_c > arc_c_min) { uint64_t to_free; #ifdef _KERNEL to_free = arc_c >> arc_shrink_shift; #else to_free = arc_c >> arc_shrink_shift; #endif if (arc_c > arc_c_min + to_free) atomic_add_64(&arc_c, -to_free); else arc_c = arc_c_min; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (arc_c > arc_size) arc_c = MAX(arc_size, arc_c_min); if (arc_p > arc_c) arc_p = (arc_c >> 1); ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } if (arc_size > arc_c) arc_adjust(); } static int needfree = 0; static int arc_reclaim_needed(void) { #ifdef _KERNEL if (needfree) return (1); /* * Cooperate with pagedaemon when it's time for it to scan * and reclaim some pages. */ if (vm_paging_needed()) return (1); #ifdef sun /* * take 'desfree' extra pages, so we reclaim sooner, rather than later */ extra = desfree; /* * check that we're out of range of the pageout scanner. It starts to * schedule paging if freemem is less than lotsfree and needfree. * lotsfree is the high-water mark for pageout, and needfree is the * number of needed free pages. We add extra pages here to make sure * the scanner doesn't start up while we're freeing memory. */ if (freemem < lotsfree + needfree + extra) return (1); /* * check to make sure that swapfs has enough space so that anon * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. */ if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical * memory. Most checks of the size of the heap_area compare against * tune.t_minarmem, which is the minimum available real memory that we * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ if (btop(vmem_size(heap_arena, VMEM_FREE)) < (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) return (1); #endif #else /* !sun */ if (kmem_used() > (kmem_size() * 3) / 4) return (1); #endif /* sun */ #else if (spa_get_random(100) == 0) return (1); #endif return (0); } extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; static void arc_kmem_reap_now(arc_reclaim_strategy_t strat) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; #ifdef _KERNEL if (arc_meta_used >= arc_meta_limit) { /* * We are exceeding our meta-data cache limit. * Purge some DNLC entries to release holds on meta-data. */ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); } #if defined(__i386) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif /* * An aggressive reclamation will shrink the cache size as well as * reap free buffers from the arc kmem caches. */ if (strat == ARC_RECLAIM_AGGR) arc_shrink(); for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_cache); } static void arc_reclaim_thread(void *dummy __unused) { clock_t growtime = 0; arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_reclaim_thr_lock); while (arc_thread_exit == 0) { if (arc_reclaim_needed()) { if (arc_no_grow) { if (last_reclaim == ARC_RECLAIM_CONS) { last_reclaim = ARC_RECLAIM_AGGR; } else { last_reclaim = ARC_RECLAIM_CONS; } } else { arc_no_grow = TRUE; last_reclaim = ARC_RECLAIM_AGGR; membar_producer(); } /* reset the growth delay for every reclaim */ growtime = ddi_get_lbolt() + (arc_grow_retry * hz); if (needfree && last_reclaim == ARC_RECLAIM_CONS) { /* * If needfree is TRUE our vm_lowmem hook * was called and in that case we must free some * memory, so switch to aggressive mode. */ arc_no_grow = TRUE; last_reclaim = ARC_RECLAIM_AGGR; } arc_kmem_reap_now(last_reclaim); arc_warm = B_TRUE; } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { arc_no_grow = FALSE; } arc_adjust(); if (arc_eviction_list != NULL) arc_do_user_evicts(); #ifdef _KERNEL if (needfree) { needfree = 0; wakeup(&needfree); } #endif /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock, hz); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); } arc_thread_exit = 0; cv_broadcast(&arc_reclaim_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ thread_exit(); } /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); if (state == arc_l2c_only) return; ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: * - if we just hit in the MRU ghost list, then increase * the target size of the MRU list. * - if we just hit in the MFU ghost list, then increase * the target size of the MFU list by decreasing the * target size of the MRU list. */ if (state == arc_mru_ghost) { mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); if (arc_reclaim_needed()) { cv_signal(&arc_reclaim_thr_cv); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; else if (state == arc_anon) atomic_add_64(&arc_p, (int64_t)bytes); if (arc_p > arc_c) arc_p = arc_c; } ASSERT((int64_t)arc_p >= 0); } /* * Check if the cache has reached its limits and eviction is required * prior to insert. */ static int arc_evict_needed(arc_buf_contents_t type) { if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) return (1); #ifdef sun #ifdef _KERNEL /* * If zio data pages are being allocated out of a separate heap segment, * then enforce that the size of available vmem for this area remains * above about 1/32nd free. */ if (type == ARC_BUFC_DATA && zio_arena != NULL && vmem_size(zio_arena, VMEM_FREE) < (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) return (1); #endif #endif /* sun */ if (arc_reclaim_needed()) return (1); return (arc_size > arc_c); } /* * The buffer, supplied as the first argument, needs a data block. * So, if we are at cache max, determine which cache should be victimized. * We have the following cases: * * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> * In this situation if we're out of space, but the resident size of the MFU is * under the limit, victimize the MFU cache to satisfy this insertion request. * * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> * Here, we've used up all of the available space for the MRU, so we need to * evict from our own cache instead. Evict from the set of resident MRU * entries. * * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> * c minus p represents the MFU space in the cache, since p is the size of the * cache that is dedicated to the MRU. In this situation there's still space on * the MFU side, so the MRU side needs to be victimized. * * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> * MFU's resident set is consuming more space than it has been allotted. In * this situation, we must victimize our own cache, the MFU, for this insertion. */ static void arc_get_data_buf(arc_buf_t *buf) { arc_state_t *state = buf->b_hdr->b_state; uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = buf->b_hdr->b_type; arc_adapt(size, state); /* * We have not yet reached cache maximum size, * just allocate a new buffer. */ if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } goto out; } /* * If we are prefetching from the mfu ghost list, this buffer * will end up on the mru list; so steal space from there. */ if (state == arc_mfu_ghost) state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; else if (state == arc_mru_ghost) state = arc_mru; if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; state = (arc_mfu->arcs_lsize[type] >= size && arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; state = (arc_mru->arcs_lsize[type] >= size && mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } ARCSTAT_BUMP(arcstat_recycle_miss); } ASSERT(buf->b_data != NULL); out: /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(buf->b_hdr->b_state)) { arc_buf_hdr_t *hdr = buf->b_hdr; atomic_add_64(&hdr->b_state->arcs_size, size); if (list_link_active(&hdr->b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_refcnt)); atomic_add_64(&hdr->b_state->arcs_lsize[type], size); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (arc_size < arc_c && hdr->b_state == arc_anon && arc_anon->arcs_size + arc_mru->arcs_size > arc_p) arc_p = MIN(arc_c, arc_p + size); } ARCSTAT_BUMP(arcstat_allocated); } /* * This routine is called whenever a buffer is accessed. * NOTE: the hash lock is dropped in this function. */ static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); if (buf->b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ ASSERT(buf->b_arc_access == 0); buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); arc_change_state(arc_mru, buf, hash_lock); } else if (buf->b_state == arc_mru) { now = ddi_get_lbolt(); /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read * (any subsequent access will bump this into the MFU state). * or * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ if ((buf->b_flags & ARC_PREFETCH) != 0) { if (refcount_count(&buf->b_refcnt) == 0) { ASSERT(list_link_active(&buf->b_arc_node)); } else { buf->b_flags &= ~ARC_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); } buf->b_arc_access = now; return; } /* * This buffer has been "accessed" only once so far, * but it is still in the cache. Move it to the MFU * state. */ if (now > buf->b_arc_access + ARC_MINTIME) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ buf->b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } ARCSTAT_BUMP(arcstat_mru_hits); } else if (buf->b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but * was evicted from the cache. Move it to the * MFU state. */ if (buf->b_flags & ARC_PREFETCH) { new_state = arc_mru; if (refcount_count(&buf->b_refcnt) > 0) buf->b_flags &= ~ARC_PREFETCH; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); } buf->b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (buf->b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. * * NOTE: an add_reference() that occurred when we did * the arc_read() will have kicked this off the list. * If it was a prefetch, we will explicitly move it to * the head of the list now. */ if ((buf->b_flags & ARC_PREFETCH) != 0) { ASSERT(refcount_count(&buf->b_refcnt) == 0); ASSERT(list_link_active(&buf->b_arc_node)); } ARCSTAT_BUMP(arcstat_mfu_hits); buf->b_arc_access = ddi_get_lbolt(); } else if (buf->b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has * been evicted from the cache. Move it back to the * MFU state. */ if (buf->b_flags & ARC_PREFETCH) { /* * This is a prefetch access... * move this block back to the MRU state. */ ASSERT0(refcount_count(&buf->b_refcnt)); new_state = arc_mru; } buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (buf->b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } else { ASSERT(!"invalid arc state"); } } /* a generic arc_done_func_t which you can use */ /* ARGSUSED */ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) bcopy(buf->b_data, arg, buf->b_hdr->b_size); VERIFY(arc_buf_remove_ref(buf, arg) == 1); } /* a generic arc_done_func_t */ void arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { VERIFY(arc_buf_remove_ref(buf, arg) == 1); *bufp = NULL; } else { *bufp = buf; ASSERT(buf->b_data); } } static void arc_read_done(zio_t *zio) { arc_buf_hdr_t *hdr, *found; arc_buf_t *buf; arc_buf_t *abuf; /* buffer we're assigning to callback */ kmutex_t *hash_lock; arc_callback_t *callback_list, *acb; int freeable = FALSE; buf = zio->io_private; hdr = buf->b_hdr; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); hdr->b_flags &= ~ARC_L2_EVICTED; if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) hdr->b_flags &= ~ARC_L2CACHE; /* byteswap if necessary */ callback_list = hdr->b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? byteswap_uint64_array : dmu_ot_byteswap[bswap].ob_func; func(buf->b_data, hdr->b_size); } arc_cksum_compute(buf, B_FALSE); #ifdef illumos arc_buf_watch(buf); #endif /* illumos */ if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already * called arc_access (to prevent any simultaneous readers from * getting confused). */ arc_access(hdr, hash_lock); } /* create copies of the data buffer for the callers */ abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { if (abuf == NULL) { ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); } acb->acb_buf = abuf; abuf = NULL; } } hdr->b_acb = NULL; hdr->b_flags &= ~ARC_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); if (abuf == buf) { ASSERT(buf->b_efunc == NULL); ASSERT(hdr->b_datacnt == 1); hdr->b_flags |= ARC_BUF_AVAILABLE; } ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); if (zio->io_error != 0) { hdr->b_flags |= ARC_IO_ERROR; if (hdr->b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = refcount_is_zero(&hdr->b_refcnt); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_cv); if (hash_lock) { mutex_exit(hash_lock); } else { /* * This block was freed while we waited for the read to * complete. It has been removed from the hash table and * moved to the anonymous state (so that it won't show up * in the cache). */ ASSERT3P(hdr->b_state, ==, arc_anon); freeable = refcount_is_zero(&hdr->b_refcnt); } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done) acb->acb_done(zio, acb->acb_buf, acb->acb_private); if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_next; kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) arc_hdr_destroy(hdr); } /* * "Read" the block block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr; - arc_buf_t *buf; + arc_buf_t *buf = NULL; kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = spa_load_guid(spa); top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), &hash_lock); if (hdr && hdr->b_datacnt > 0) { *arc_flags |= ARC_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { if (*arc_flags & ARC_WAIT) { cv_wait(&hdr->b_cv, hash_lock); mutex_exit(hash_lock); goto top; } ASSERT(*arc_flags & ARC_NOWAIT); if (done) { arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; hdr->b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); return (0); } mutex_exit(hash_lock); return (0); } ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); if (done) { add_reference(hdr, hash_lock, private); /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. */ buf = hdr->b_buf; ASSERT(buf); ASSERT(buf->b_data); if (HDR_BUF_AVAILABLE(hdr)) { ASSERT(buf->b_efunc == NULL); hdr->b_flags &= ~ARC_BUF_AVAILABLE; } else { buf = arc_buf_clone(buf); } } else if (*arc_flags & ARC_PREFETCH && refcount_count(&hdr->b_refcnt) == 0) { hdr->b_flags |= ARC_PREFETCH; } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_L2CACHE) hdr->b_flags |= ARC_L2CACHE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits); if (done) done(NULL, buf, private); } else { uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; - uint64_t addr; + uint64_t addr = 0; boolean_t devw = B_FALSE; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } /* if this is a prefetch, we don't have a reference */ if (*arc_flags & ARC_PREFETCH) { (void) remove_reference(hdr, hash_lock, private); hdr->b_flags |= ARC_PREFETCH; } if (*arc_flags & ARC_L2CACHE) hdr->b_flags |= ARC_L2CACHE; if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_INDIRECT; } else { /* this block is in the ghost cache */ ASSERT(GHOST_STATE(hdr->b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(refcount_count(&hdr->b_refcnt)); ASSERT(hdr->b_buf == NULL); /* if this is a prefetch, we don't have a reference */ if (*arc_flags & ARC_PREFETCH) hdr->b_flags |= ARC_PREFETCH; else add_reference(hdr, hash_lock, private); if (*arc_flags & ARC_L2CACHE) hdr->b_flags |= ARC_L2CACHE; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; hdr->b_buf = buf; ASSERT(hdr->b_datacnt == 0); hdr->b_datacnt = 1; arc_get_data_buf(buf); arc_access(hdr, hash_lock); } ASSERT(!GHOST_STATE(hdr->b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; ASSERT(hdr->b_acb == NULL); hdr->b_acb = acb; hdr->b_flags |= ARC_IO_IN_PROGRESS; if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr->b_dev->l2ad_writing; addr = hdr->b_l2hdr->b_daddr; /* * Lock out device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); #ifdef _KERNEL curthread->td_ru.ru_inblock++; #endif if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (hdr->b_l2hdr != NULL && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_buf = buf; cb->l2rcb_spa = spa; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; + ASSERT(addr >= VDEV_LABEL_START_SIZE && + addr + size < vd->vdev_psize - + VDEV_LABEL_END_SIZE); + /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). */ rzio = zio_read_phys(pio, vd, addr, size, buf->b_data, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); return (0); } ASSERT(*arc_flags & ARC_WAIT); if (zio_wait(rzio) == 0) return (0); /* l2arc read error; goto zio_read() */ } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); if (l2arc_ndev != 0) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); if (*arc_flags & ARC_WAIT) return (zio_wait(rzio)); ASSERT(*arc_flags & ARC_NOWAIT); zio_nowait(rzio); } return (0); } void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); ASSERT(buf->b_hdr->b_state != arc_anon); ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); ASSERT(buf->b_efunc == NULL); ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); buf->b_efunc = func; buf->b_private = private; } /* * This is used by the DMU to let the ARC know that a buffer is * being evicted, so the ARC should clean up. If this arc buf * is not yet in the evicted state, it will be put there. */ int arc_buf_evict(arc_buf_t *buf) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; arc_buf_t **bufp; list_t *list, *evicted_list; kmutex_t *lock, *evicted_lock; mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). */ ASSERT(buf->b_data == NULL); mutex_exit(&buf->b_evict_lock); return (0); } else if (buf->b_data == NULL) { arc_buf_t copy = *buf; /* structure assignment */ /* * We are on the eviction list; process this buffer now * but let arc_do_user_evicts() do the reaping. */ buf->b_efunc = NULL; mutex_exit(&buf->b_evict_lock); VERIFY(copy.b_efunc(©) == 0); return (1); } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); /* * Pull this buffer off of the hdr */ bufp = &hdr->b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; *bufp = buf->b_next; ASSERT(buf->b_data != NULL); arc_buf_destroy(buf, FALSE, FALSE); if (hdr->b_datacnt == 0) { arc_state_t *old_state = hdr->b_state; arc_state_t *evicted_state; ASSERT(hdr->b_buf == NULL); ASSERT(refcount_is_zero(&hdr->b_refcnt)); evicted_state = (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; get_buf_info(hdr, old_state, &list, &lock); get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock); mutex_enter(lock); mutex_enter(evicted_lock); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdr->b_flags |= ARC_IN_HASH_TABLE; hdr->b_flags &= ~ARC_BUF_AVAILABLE; mutex_exit(evicted_lock); mutex_exit(lock); } mutex_exit(hash_lock); mutex_exit(&buf->b_evict_lock); VERIFY(buf->b_efunc(buf) == 0); buf->b_efunc = NULL; buf->b_private = NULL; buf->b_hdr = NULL; buf->b_next = NULL; kmem_cache_free(buf_cache, buf); return (1); } /* * Release this buffer from the cache. This must be done * after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock = NULL; l2arc_buf_hdr_t *l2hdr; uint64_t buf_size; /* * It would be nice to assert that if it's DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); if (hdr->b_state == arc_anon) { /* this buffer is already released */ ASSERT(buf->b_efunc == NULL); } else { hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); } l2hdr = hdr->b_l2hdr; if (l2hdr) { mutex_enter(&l2arc_buflist_mtx); hdr->b_l2hdr = NULL; - buf_size = hdr->b_size; } + buf_size = hdr->b_size; /* * Do we have more than one buf? */ if (hdr->b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; uint32_t flags = hdr->b_flags; ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. */ (void) remove_reference(hdr, hash_lock, tag); bufp = &hdr->b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; *bufp = buf->b_next; buf->b_next = NULL; ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); if (refcount_is_zero(&hdr->b_refcnt)) { uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } /* * We're releasing a duplicate user data buffer, update * our statistics accordingly. */ if (hdr->b_type == ARC_BUFC_DATA) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -hdr->b_size); } hdr->b_datacnt -= 1; arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif /* illumos */ mutex_exit(hash_lock); nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; nhdr->b_type = type; nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; nhdr->b_flags = flags & ARC_L2_WRITING; nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); atomic_add_64(&arc_anon->arcs_size, blksz); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); if (hdr->b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; if (hash_lock) mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } buf->b_efunc = NULL; buf->b_private = NULL; if (l2hdr) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -buf_size); mutex_exit(&l2arc_buflist_mtx); } } int arc_released(arc_buf_t *buf) { int released; mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } int arc_has_callback(arc_buf_t *buf) { int callback; mutex_enter(&buf->b_evict_lock); callback = (buf->b_efunc != NULL); mutex_exit(&buf->b_evict_lock); return (callback); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; mutex_enter(&buf->b_evict_lock); referenced = (refcount_count(&buf->b_hdr->b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); callback->awcb_ready(zio, buf, callback->awcb_private); /* * If the IO is already in progress, then this is a re-write * attempt, so we need to thaw and re-compute the cksum. * It is the responsibility of the callback to handle the * accounting for any re-write attempt. */ if (HDR_IO_IN_PROGRESS(hdr)) { mutex_enter(&hdr->b_freeze_lock); if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } mutex_exit(&hdr->b_freeze_lock); } arc_cksum_compute(buf, B_FALSE); hdr->b_flags |= ARC_IO_IN_PROGRESS; } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(hdr->b_acb == NULL); if (zio->io_error == 0) { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; } else { ASSERT(BUF_EMPTY(hdr)); } /* * If the block to be written was all-zero, we may have * compressed it away. In this case no write was performed * so there will be no dva/birth/checksum. The buffer must * therefore remain anonymous (and uncached). */ if (!BUF_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT(zio->io_error == 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(refcount_is_zero(&exists->b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_datacnt == 1); ASSERT(hdr->b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } hdr->b_flags &= ~ARC_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ if (!exists && hdr->b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { hdr->b_flags &= ~ARC_IO_IN_PROGRESS; } ASSERT(!refcount_is_zero(&hdr->b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; ASSERT(ready != NULL); ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); ASSERT(hdr->b_acb == NULL); if (l2arc) hdr->b_flags |= ARC_L2CACHE; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); return (zio); } static int arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) { #ifdef _KERNEL uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count); static uint64_t page_load = 0; static uint64_t last_txg = 0; #ifdef sun #if defined(__i386) available_memory = MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); #endif #endif /* sun */ if (available_memory >= zfs_write_limit_max) return (0); if (txg > last_txg) { last_txg = txg; page_load = 0; } /* * If we are in pageout, we know that memory is already tight, * the arc is already going to be evicting, so we just want to * continue to let page writes occur as quickly as possible. */ if (curproc == pageproc) { if (page_load > available_memory / 4) return (ERESTART); /* Note: reserve is inflated, so we deflate */ page_load += reserve / 8; return (0); } else if (page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); return (EAGAIN); } page_load = 0; if (arc_size > arc_c_min) { uint64_t evictable_memory = arc_mru->arcs_lsize[ARC_BUFC_DATA] + arc_mru->arcs_lsize[ARC_BUFC_METADATA] + arc_mfu->arcs_lsize[ARC_BUFC_DATA] + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; available_memory += MIN(evictable_memory, arc_size - arc_c_min); } if (inflight_data > available_memory / 4) { ARCSTAT_INCR(arcstat_memory_throttle_count, 1); return (ERESTART); } #endif return (0); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; #ifdef ZFS_DEBUG /* * Once in a while, fail for no reason. Everything should cope. */ if (spa_get_random(10000) == 0) { dprintf("forcing random failure\n"); return (ERESTART); } #endif if (reserve > arc_c/4 && !arc_no_grow) arc_c = MIN(arc_c_max, reserve * 4); if (reserve > arc_c) return (ENOMEM); /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefor need to * make sure that there is sufficient available memory for this. */ if (error = arc_memory_throttle(reserve, anon_size, txg)) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve>>10, arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, reserve>>10, arc_c>>10); return (ERESTART); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static kmutex_t arc_lowmem_lock; #ifdef _KERNEL static eventhandler_tag arc_event_lowmem = NULL; static void arc_lowmem(void *arg __unused, int howto __unused) { /* Serialize access via arc_lowmem_lock. */ mutex_enter(&arc_lowmem_lock); mutex_enter(&arc_reclaim_thr_lock); needfree = 1; cv_signal(&arc_reclaim_thr_cv); /* * It is unsafe to block here in arbitrary threads, because we can come * here from ARC itself and may hold ARC locks and thus risk a deadlock * with ARC reclaim thread. */ if (curproc == pageproc) { while (needfree) msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); } mutex_exit(&arc_reclaim_thr_lock); mutex_exit(&arc_lowmem_lock); } #endif void arc_init(void) { int i, prefetch_tunable_set = 0; mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; /* Start out with 1/8 of all memory */ arc_c = kmem_size() / 8; #ifdef sun #ifdef _KERNEL /* * On architectures where the physical memory can be larger * than the addressable space (intel in 32-bit mode), we may * need to limit the cache to 1/8 of VM size. */ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); #endif #endif /* sun */ /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ arc_c_min = MAX(arc_c / 4, 64<<18); /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ if (arc_c * 8 >= 1<<30) arc_c_max = (arc_c * 8) - (1<<30); else arc_c_max = arc_c_min; arc_c_max = MAX(arc_c * 5, arc_c_max); #ifdef _KERNEL /* * Allow the tunables to override our calculations if they are * reasonable (ie. over 16MB) */ if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size()) arc_c_max = zfs_arc_max; if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; #endif arc_c = arc_c_max; arc_p = (arc_c >> 1); /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; /* Allow the tunable to override if it is reasonable */ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; if (zfs_arc_shrink_shift > 0) arc_shrink_shift = zfs_arc_shrink_shift; if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; zfs_arc_min = arc_c_min; zfs_arc_max = arc_c_max; arc_anon = &ARC_anon; arc_mru = &ARC_mru; arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; arc_l2c_only = &ARC_l2c_only; arc_size = 0; for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { mutex_init(&arc_anon->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mru->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_mru->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mru_ghost->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu_ghost->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu_ghost->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_l2c_only->arcs_lists[i], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); } buf_init(); arc_thread_exit = 0; arc_eviction_list = NULL; mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; kstat_install(arc_ksp); } (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, minclsyspri); #ifdef _KERNEL arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, EVENTHANDLER_PRI_FIRST); #endif arc_dead = FALSE; arc_warm = B_FALSE; if (zfs_write_limit_max == 0) zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; else zfs_write_limit_shift = 0; mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); #ifdef _KERNEL if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) prefetch_tunable_set = 1; #ifdef __i386__ if (prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default on i386 " "-- to enable,\n"); printf(" add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); zfs_prefetch_disable = 1; } #else if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default if less " "than 4GB of RAM is present;\n" " to enable, add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); zfs_prefetch_disable = 1; } #endif /* Warn about ZFS memory and address space requirements. */ if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " "expect unstable behavior.\n"); } if (kmem_size() < 512 * (1 << 20)) { printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " "expect unstable behavior.\n"); printf(" Consider tuning vm.kmem_size and " "vm.kmem_size_max\n"); printf(" in /boot/loader.conf.\n"); } #endif } void arc_fini(void) { int i; mutex_enter(&arc_reclaim_thr_lock); arc_thread_exit = 1; cv_signal(&arc_reclaim_thr_cv); while (arc_thread_exit != 0) cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); mutex_exit(&arc_reclaim_thr_lock); arc_flush(NULL); arc_dead = TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } mutex_destroy(&arc_eviction_mtx); mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { list_destroy(&arc_mru->arcs_lists[i]); list_destroy(&arc_mru_ghost->arcs_lists[i]); list_destroy(&arc_mfu->arcs_lists[i]); list_destroy(&arc_mfu_ghost->arcs_lists[i]); list_destroy(&arc_l2c_only->arcs_lists[i]); mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); } mutex_destroy(&zfs_write_limit_lock); buf_fini(); ASSERT(arc_loaned_bytes == 0); mutex_destroy(&arc_lowmem_lock); #ifdef _KERNEL if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); #endif } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (ab->b_spa != spa_guid) { ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); return (B_FALSE); } if (ab->b_l2hdr != NULL) { ARCSTAT_BUMP(arcstat_l2_write_in_l2); return (B_FALSE); } if (HDR_IO_IN_PROGRESS(ab)) { ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); return (B_FALSE); } if (!HDR_L2CACHE(ab)) { ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); return (B_FALSE); } return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size; size = dev->l2ad_write; if (arc_warm == B_FALSE) size += dev->l2ad_boost; return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } static void l2arc_hdr_stat_add(void) { ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); } static void l2arc_hdr_stat_remove(void) { ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; } while (vdev_is_dead(next->l2ad_vdev)); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev)) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write() { list_t *buflist; l2arc_data_free_t *df, *df_prev; mutex_enter(&l2arc_free_on_write_mtx); buflist = l2arc_free_on_write; for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); ASSERT(df->l2df_data != NULL); ASSERT(df->l2df_func != NULL); df->l2df_func(df->l2df_data, df->l2df_size); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; arc_buf_hdr_t *head, *ab, *ab_prev; l2arc_buf_hdr_t *abl2; kmutex_t *hash_lock; cb = zio->io_private; ASSERT(cb != NULL); dev = cb->l2wcb_dev; ASSERT(dev != NULL); head = cb->l2wcb_head; ASSERT(head != NULL); buflist = dev->l2ad_buflist; ASSERT(buflist != NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); mutex_enter(&l2arc_buflist_mtx); /* * All writes completed, or an error was hit. */ for (ab = list_prev(buflist, head); ab; ab = ab_prev) { ab_prev = list_prev(buflist, ab); hash_lock = HDR_LOCK(ab); if (!mutex_tryenter(hash_lock)) { /* * This buffer misses out. It may be in a stage * of eviction. Its ARC_L2_WRITING flag will be * left set, denying reads to this buffer. */ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); continue; } if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, ab); abl2 = ab->b_l2hdr; ab->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } /* * Allow ARC to begin reads to this L2ARC entry. */ ab->b_flags &= ~ARC_L2_WRITING; mutex_exit(hash_lock); } atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); kmem_cache_free(hdr_cache, head); mutex_exit(&l2arc_buflist_mtx); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; arc_buf_t *buf; kmutex_t *hash_lock; int equal; ASSERT(zio->io_vd != NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; ASSERT(cb != NULL); buf = cb->l2rcb_buf; ASSERT(buf != NULL); hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * Check this survived the L2ARC journey. */ equal = arc_cksum_equal(buf); if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = buf; zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ arc_read_done(zio); } else { mutex_exit(hash_lock); /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = EIO; } if (!equal) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, buf->b_data, zio->io_size, arc_read_done, buf, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static list_t * l2arc_list_locked(int list_num, kmutex_t **lock) { - list_t *list; + list_t *list = NULL; int idx; ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); if (list_num < ARC_BUFC_NUMMETADATALISTS) { idx = list_num; list = &arc_mfu->arcs_lists[idx]; *lock = ARCS_LOCK(arc_mfu, idx); } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { idx = list_num - ARC_BUFC_NUMMETADATALISTS; list = &arc_mru->arcs_lists[idx]; *lock = ARCS_LOCK(arc_mru, idx); } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + ARC_BUFC_NUMDATALISTS)) { idx = list_num - ARC_BUFC_NUMMETADATALISTS; list = &arc_mfu->arcs_lists[idx]; *lock = ARCS_LOCK(arc_mfu, idx); } else { idx = list_num - ARC_BUFC_NUMLISTS; list = &arc_mru->arcs_lists[idx]; *lock = ARCS_LOCK(arc_mru, idx); } ASSERT(!(MUTEX_HELD(*lock))); mutex_enter(*lock); return (list); } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; l2arc_buf_hdr_t *abl2; arc_buf_hdr_t *ab, *ab_prev; kmutex_t *hash_lock; uint64_t taddr; buflist = dev->l2ad_buflist; if (buflist == NULL) return; if (!all && dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. */ return; } if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { /* * When nearing the end of the device, evict to the end * before the device write hand jumps to the start. */ taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); top: mutex_enter(&l2arc_buflist_mtx); for (ab = list_tail(buflist); ab; ab = ab_prev) { ab_prev = list_prev(buflist, ab); hash_lock = HDR_LOCK(ab); if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&l2arc_buflist_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } if (HDR_L2_WRITE_HEAD(ab)) { /* * We hit a write head node. Leave it for * l2arc_write_done(). */ list_remove(buflist, ab); mutex_exit(hash_lock); continue; } if (!all && ab->b_l2hdr != NULL && (ab->b_l2hdr->b_daddr > taddr || ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } if (HDR_FREE_IN_PROGRESS(ab)) { /* * Already on the path to destruction. */ mutex_exit(hash_lock); continue; } if (ab->b_state == arc_l2c_only) { ASSERT(!HDR_L2_READING(ab)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_size. */ arc_change_state(arc_anon, ab, hash_lock); arc_hdr_destroy(ab); } else { /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(ab)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); ab->b_flags |= ARC_L2_EVICTED; } /* * Tell ARC this no longer exists in L2ARC. */ if (ab->b_l2hdr != NULL) { abl2 = ab->b_l2hdr; ab->b_l2hdr = NULL; kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); } list_remove(buflist, ab); /* * This may have been leftover after a * failed write. */ ab->b_flags &= ~ARC_L2_WRITING; } mutex_exit(hash_lock); } mutex_exit(&l2arc_buflist_mtx); vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); dev->l2ad_evict = taddr; } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *ab, *ab_prev, *head; l2arc_buf_hdr_t *hdrl2; list_t *list; uint64_t passed_sz, write_sz, buf_sz, headroom; void *buf_data; kmutex_t *hash_lock, *list_lock; boolean_t have_lock, full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); int try; ASSERT(dev->l2ad_vdev != NULL); pio = NULL; write_sz = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); head->b_flags |= ARC_L2_WRITE_HEAD; ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); /* * Copy buffers for L2ARC writing. */ mutex_enter(&l2arc_buflist_mtx); for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { list = l2arc_list_locked(try, &list_lock); passed_sz = 0; ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ headroom = target_sz * l2arc_headroom; if (arc_warm == B_FALSE) ab = list_head(list); else ab = list_tail(list); if (ab == NULL) ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); for (; ab; ab = ab_prev) { if (arc_warm == B_FALSE) ab_prev = list_next(list, ab); else ab_prev = list_prev(list, ab); ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size); hash_lock = HDR_LOCK(ab); have_lock = MUTEX_HELD(hash_lock); if (!have_lock && !mutex_tryenter(hash_lock)) { ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); /* * Skip this buffer rather than waiting. */ continue; } passed_sz += ab->b_size; if (passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); break; } if (!l2arc_write_eligible(guid, ab)) { mutex_exit(hash_lock); continue; } if ((write_sz + ab->b_size) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_full); break; } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ list_insert_head(dev->l2ad_buflist, head); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); ARCSTAT_BUMP(arcstat_l2_write_pios); } /* * Create and add a new L2ARC header. */ hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); hdrl2->b_dev = dev; hdrl2->b_daddr = dev->l2ad_hand; ab->b_flags |= ARC_L2_WRITING; ab->b_l2hdr = hdrl2; list_insert_head(dev->l2ad_buflist, ab); buf_data = ab->b_buf->b_data; buf_sz = ab->b_size; /* * Compute and store the buffer cksum before * writing. On debug the cksum is verified first. */ arc_cksum_verify(ab->b_buf); arc_cksum_compute(ab->b_buf, B_TRUE); mutex_exit(hash_lock); wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); /* * Keep the clock hand suitably device-aligned. */ buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); write_sz += buf_sz; dev->l2ad_hand += buf_sz; } mutex_exit(list_lock); if (full == B_TRUE) break; } mutex_exit(&l2arc_buflist_mtx); if (pio == NULL) { ASSERT0(write_sz); kmem_cache_free(hdr_cache, head); return (0); } ASSERT3U(write_sz, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); ARCSTAT_INCR(arcstat_l2_size, write_sz); vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { vdev_space_update(dev->l2ad_vdev, dev->l2ad_end - dev->l2ad_hand, 0, 0); dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; } dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; return (write_sz); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static void l2arc_feed_thread(void *dummy __unused) { callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next - ddi_get_lbolt()); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT(spa != NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (arc_reclaim_needed()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev != NULL); } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. */ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; adddev->l2ad_write = l2arc_write_max; adddev->l2ad_boost = l2arc_write_boost; adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; ASSERT3U(adddev->l2ad_write, >, 0); /* * This is a list of all ARC buffers that are still valid on the * device. */ adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *dev, *nextdev, *remdev = NULL; /* * Find the device by vdev */ mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { nextdev = list_next(l2arc_dev_list, dev); if (vd == dev->l2ad_vdev) { remdev = dev; break; } } ASSERT(remdev != NULL); /* * Remove device from global list */ list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(remdev->l2ad_buflist); kmem_free(remdev->l2ad_buflist, sizeof (list_t)); kmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; l2arc_writes_sent = 0; l2arc_writes_done = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { /* * This is called from dmu_fini(), which is called from spa_fini(); * Because of this, we can assume that all l2arc devices have * already been removed when the pools themselves were removed. */ l2arc_do_free_on_write(); mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_buflist_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, minclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 247192) @@ -1,1849 +1,1848 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * Enable/disable nopwrite feature. */ int zfs_nopwrite_enabled = 1; SYSCTL_DECL(_vfs_zfs); TUNABLE_INT("vfs.zfs.nopwrite_enabled", &zfs_nopwrite_enabled); SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, "object directory" }, { DMU_BSWAP_UINT64, TRUE, "object array" }, { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, { DMU_BSWAP_UINT64, TRUE, "bpobj" }, { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, { DMU_BSWAP_ZAP, TRUE, "DSL props" }, { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, { DMU_BSWAP_UINT8, FALSE, "zvol object" }, { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, { DMU_BSWAP_UINT8, TRUE, "SPA history" }, { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, { DMU_BSWAP_UINT8, TRUE, "FUID table" }, { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, { DMU_BSWAP_UINT8, TRUE, "System attributes" }, { DMU_BSWAP_ZAP, TRUE, "SA master node" }, { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, { DMU_BSWAP_ZAP, TRUE, "scan translations" }, { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); blkid = dbuf_whichblock(dn, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { err = EIO; } else { err = dbuf_read(db, NULL, db_flags); if (err) { dbuf_rele(db, tag); db = NULL; } } dnode_rele(dn, FTAG); *dbp = &db->db; /* NULL db plus first field offset is NULL */ return (err); } int dmu_bonus_max(void) { return (DN_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = EINVAL; } else if (newsize < 0 || newsize > db_fake->db_size) { error = EINVAL; } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = EINVAL; } else if (dn->dn_bonus != db) { error = EINVAL; } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; dmu_buf_impl_t *db; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); (void) atomic_inc_32_nv(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); *dbp = &db->db; return (0); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); ASSERT(db != NULL); err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else dbuf_rele(db, tag); return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = EINVAL; } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = ENOENT; } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dsl_pool_t *dp = NULL; dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio; hrtime_t start; ASSERT(length <= DMU_MAX_ACCESS); dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) dbuf_flags |= DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset+length, 1ULL<> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (EIO); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); if (dn->dn_objset->os_dsl_dataset) dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; - if (dp && dsl_pool_sync_context(dp)) - start = gethrtime(); + start = gethrtime(); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); return (EIO); } /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); #ifdef _KERNEL else curthread->td_ru.ru_oublock++; #endif dbp[i] = &db->db; } rw_exit(&dn->dn_struct_rwlock); /* wait for async i/o */ err = zio_wait(zio); /* track read overhead when we are in sync context */ if (dp && dsl_pool_sync_context(dp)) dp->dp_read_overhead += gethrtime() - start; if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = EIO; mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) { dnode_t *dn; uint64_t blkid; int nblks, i, err; if (zfs_prefetch_disable) return; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, blkid); rw_exit(&dn->dn_struct_rwlock); return; } /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset+len, 1<> blkshift; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { blkid = dbuf_whichblock(dn, offset); for (i = 0; i < nblks; i++) dbuf_prefetch(dn, blkid+i); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) { uint64_t len = *start - limit; uint64_t blkcnt = 0; uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT(limit <= *start); if (len <= iblkrange * maxblks) { *start = limit; return (0); } ASSERT(ISP2(iblkrange)); while (*start > limit && blkcnt < maxblks) { int err; /* find next allocated L1 indirect */ err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no more, then we are done */ if (err == ESRCH) { *start = limit; return (0); } else if (err) { return (err); } blkcnt += 1; /* reset offset to end of "next" block back */ *start = P2ALIGN(*start, iblkrange); if (*start <= limit) *start = limit; else *start -= 1; } return (0); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length, boolean_t free_dnode) { dmu_tx_t *tx; uint64_t object_size, start, end, len; boolean_t trunc = (length == DMU_OBJECT_END); int align, err; align = 1 << dn->dn_datablkshift; ASSERT(align > 0); object_size = align == 1 ? dn->dn_datablksz : (dn->dn_maxblkid + 1) << dn->dn_datablkshift; end = offset + length; if (trunc || end > object_size) end = object_size; if (end <= offset) return (0); length = end - offset; while (length) { start = end; /* assert(offset <= start) */ err = get_next_chunk(dn, &start, offset); if (err) return (err); len = trunc ? DMU_OBJECT_END : end - start; tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, start, len); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } dnode_free_range(dn, start, trunc ? -1 : len, tx); if (start == 0 && free_dnode) { ASSERT(trunc); dnode_free(dn, tx); } length -= end - start; dmu_tx_commit(tx); end = start; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); dnode_rele(dn, FTAG); return (err); } int dmu_free_object(objset_t *os, uint64_t object) { dnode_t *dn; dmu_tx_t *tx; int err; err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err != 0) return (err); if (dn->dn_nlevels == 1) { tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } } else { err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); } dnode_rele(dn, FTAG); return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == -1ULL || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; dmu_buf_t **dbp; int numbufs, err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); bcopy((char *)db->db_data + bufoff, buf, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } dnode_rele(dn, FTAG); return (err); } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); bcopy(buf, (char *)db->db_data + bufoff, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } /* * DMU support for xuio */ kstat_t *xuio_ksp = NULL; int dmu_xuio_init(xuio_t *xuio, int nblk) { dmu_xuio_t *priv; uio_t *uio = &xuio->xu_uio; uio->uio_iovcnt = nblk; uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); priv->cnt = nblk; priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); priv->iovp = uio->uio_iov; XUIO_XUZC_PRIV(xuio) = priv; if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); return (0); } void dmu_xuio_fini(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int nblk = priv->cnt; kmem_free(priv->iovp, nblk * sizeof (iovec_t)); kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); kmem_free(priv, sizeof (dmu_xuio_t)); if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); } /* * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } * and increase priv->next by 1. */ int dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) { struct iovec *iov; uio_t *uio = &xuio->xu_uio; dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int i = priv->next++; ASSERT(i < priv->cnt); ASSERT(off + n <= arc_buf_size(abuf)); iov = uio->uio_iov + i; iov->iov_base = (char *)abuf->b_data + off; iov->iov_len = n; priv->bufs[i] = abuf; return (0); } int dmu_xuio_cnt(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); return (priv->cnt); } arc_buf_t * dmu_xuio_arcbuf(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); return (priv->bufs[i]); } void dmu_xuio_clear(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); priv->bufs[i] = NULL; } static void xuio_stat_init(void) { xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (xuio_ksp != NULL) { xuio_ksp->ks_data = &xuio_stats; kstat_install(xuio_ksp); } } static void xuio_stat_fini(void) { if (xuio_ksp != NULL) { kstat_delete(xuio_ksp); xuio_ksp = NULL; } } void xuio_stat_wbuf_copied() { XUIOSTAT_BUMP(xuiostat_wbuf_copied); } void xuio_stat_wbuf_nocopy() { XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); } #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; xuio_t *xuio = NULL; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, &numbufs, &dbp); if (err) return (err); #ifdef UIO_XUIO if (uio->uio_extflg == UIO_XUIO) xuio = (xuio_t *)uio; #endif for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); if (xuio) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; arc_buf_t *dbuf_abuf = dbi->db_buf; arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); if (!err) { uio->uio_resid -= tocpy; uio->uio_loffset += tocpy; } if (abuf == dbuf_abuf) XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else { err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); } if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } static int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX uiomove could block forever (eg. nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that uiomove won't * block. */ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } int dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #ifdef sun int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT(size > 0); ASSERT3U(db->db_size, >=, PAGESIZE); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(pp, S_READ); bcopy(va, (char *)db->db_data + bufoff, thiscpy); zfs_unmap_page(pp, va); pp = pp->p_next; bufoff += PAGESIZE; } if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } #endif /* sun */ #endif /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; spa_t *spa; DB_GET_SPA(&spa, db); return (arc_loan_buf(spa, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_size(buf); uint64_t blkid; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(dbuf); if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { objset_t *os; uint64_t object; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); os = dn->dn_objset; object = dn->dn_object; DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else { ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); ASSERT(zio_checksum_table[chksum].ci_dedup); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; blkptr_t *bp_orig = &zio->io_bp_orig; if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { /* * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) * then there is nothing to do here. Otherwise, free the * newly allocated block in this txg. */ if (zio->io_flags & ZIO_FLAG_NOPWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); return (EIO); /* Make zl_get_data do txg_waited_synced() */ } dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { blkptr_t *bp = zgd->zgd_bp; dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; dmu_sync_arg_t *dsa; zbookmark_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (EEXIST); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = db->db_last_dirty; while (dr && dr->dr_txg != txg) dr = dr->dr_next; if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (ENOENT); } ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); /* * Assume the on-disk data is X, the current syncing data is Y, * and the current in-memory data is Z (currently in dmu_sync). * X and Z are identical but Y is has been modified. Normally, * when X and Z are the same we will perform a nopwrite but if Y * is different we must disable nopwrite since the resulting write * of Y to disk can free the block containing X. If we allowed a * nopwrite to occur the block pointing to Z would reference a freed * block. Since this is a rare case we simplify this by disabling * nopwrite if the current dmu_sync-ing dbuf has been modified in * a previous transaction. */ if (dr->dr_next) zp.zp_nopwrite = B_FALSE; ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); return (EALREADY); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, dmu_sync_ready, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* XXX assumes dnode_hold will not get an i/o error */ (void) dnode_hold(os, object, FTAG, &dn); ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* XXX assumes dnode_hold will not get an i/o error */ (void) dnode_hold(os, object, FTAG, &dn); ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } int zfs_mdcomp_disable = 0; TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW, &zfs_mdcomp_disable, 0, "Disable metadata compression"); void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB; /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (zio_checksum_table[checksum].ci_correctable < 1 || zio_checksum_table[checksum].ci_eck) checksum = ZIO_CHECKSUM_FLETCHER_4; } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(dn->dn_compress, compress); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checkum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!zio_checksum_table[checksum].ci_dedup) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have a cryptographically secure * checksum that has no known collisions (i.e. SHA-256) * and compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually exclusive. */ nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } zp->zp_checksum = checksum; zp->zp_compress = compress; zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; } int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int i, err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* * Sync any current changes before * we go trundling through the block pointers. */ for (i = 0; i < TXG_SIZE; i++) { if (list_link_active(&dn->dn_dirty_link[i])) break; } if (i != TXG_SIZE) { dnode_rele(dn, FTAG); txg_wait_synced(dmu_objset_pool(os), 0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); } err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); dnode_rele(dn, FTAG); return (err); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp; rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. * This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add 1 for dnode space */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + 1; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } /* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { } void dmu_init(void) { zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); dmu_objset_init(); dnode_init(); dbuf_init(); zfetch_init(); l2arc_init(); arc_init(); } void dmu_fini(void) { arc_fini(); l2arc_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 247192) @@ -1,1786 +1,1787 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Needed to close a window in dnode_move() that allows the objset to be freed * before it can be safely accessed. */ krwlock_t os_lock; void dmu_objset_init(void) { rw_init(&os_lock, NULL, RW_DEFAULT, NULL); } void dmu_objset_fini(void) { rw_destroy(&os_lock); } spa_t * dmu_objset_spa(objset_t *os) { return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { return (os->os_zil); } dsl_pool_t * dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? ds->ds_object : 0); } uint64_t dmu_objset_syncprop(objset_t *os) { return (os->os_sync); } uint64_t dmu_objset_logbias(objset_t *os) { return (os->os_logbias); } static void checksum_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); } static void compression_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval != ZIO_COMPRESS_INHERIT); os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); } static void copies_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval > 0); ASSERT(newval <= spa_max_replication(os->os_spa)); os->os_copies = newval; } static void dedup_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; spa_t *spa = os->os_spa; enum zio_checksum checksum; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); } static void primary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_primary_cache = newval; } static void secondary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_secondary_cache = newval; } static void sync_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || newval == ZFS_SYNC_DISABLED); os->os_sync = newval; if (os->os_zil) zil_set_sync(os->os_zil, newval); } static void logbias_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; ASSERT(newval == ZFS_LOGBIAS_LATENCY || newval == ZFS_LOGBIAS_THROUGHPUT); os->os_logbias = newval; if (os->os_zil) zil_set_logbias(os->os_zil, newval); } void dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); osp->os_flags = BSWAP_64(osp->os_flags); if (size == sizeof (objset_phys_t)) { dnode_byteswap(&osp->os_userused_dnode); dnode_byteswap(&osp->os_groupused_dnode); } } int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_t **osp) { objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { uint32_t aflags = ARC_WAIT; zbookmark_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_L2CACHE; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = EIO; return (err); } /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { arc_buf_t *buf = arc_buf_alloc(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); (void) arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } os->os_phys = os->os_phys_buf->b_data; os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; os->os_phys_buf = arc_buf_alloc(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); } /* * Note: the changed_cb will be called once before the register * func returns, thus changing the checksum/compression from the * default (fletcher2/off). Snapshots don't need to know about * checksum/compression/copies. */ if (ds) { err = dsl_prop_register(ds, "primarycache", primary_cache_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "secondarycache", secondary_cache_changed_cb, os); if (!dsl_dataset_is_snapshot(ds)) { if (err == 0) err = dsl_prop_register(ds, "checksum", checksum_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "compression", compression_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "copies", copies_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "dedup", dedup_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "logbias", logbias_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "sync", sync_changed_cb, os); } if (err) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); kmem_free(os, sizeof (objset_t)); return (err); } } else if (ds == NULL) { /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_LZJB; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; os->os_dedup_verify = 0; os->os_logbias = 0; os->os_sync = 0; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; } if (ds == NULL || !dsl_dataset_is_snapshot(ds)) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); list_create(&os->os_free_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); } list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); DMU_META_DNODE(os) = dnode_special_open(os, &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { DMU_USERUSED_DNODE(os) = dnode_special_open(os, &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, &os->os_userused_dnode); DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } /* * We should be the only thread trying to do this because we * have ds_opening_lock */ if (ds) { mutex_enter(&ds->ds_lock); ASSERT(ds->ds_objset == NULL); ds->ds_objset = os; mutex_exit(&ds->ds_lock); } *osp = os; return (0); } int dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { int err = 0; mutex_enter(&ds->ds_opening_lock); *osp = ds->ds_objset; if (*osp == NULL) { err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, dsl_dataset_get_blkptr(ds), osp); } mutex_exit(&ds->ds_opening_lock); return (err); } /* called from zpl */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp) { dsl_dataset_t *ds; int err; err = dsl_dataset_hold(name, tag, &ds); if (err) return (err); err = dmu_objset_from_ds(ds, osp); if (err) dsl_dataset_rele(ds, tag); return (err); } /* called from zpl */ int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp) { dsl_dataset_t *ds; int err; err = dsl_dataset_own(name, B_FALSE, tag, &ds); if (err) return (err); err = dmu_objset_from_ds(ds, osp); if (err) { dsl_dataset_disown(ds, tag); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { dmu_objset_disown(*osp, tag); return (EINVAL); } else if (!readonly && dsl_dataset_is_snapshot(ds)) { dmu_objset_disown(*osp, tag); return (EROFS); } return (err); } void dmu_objset_rele(objset_t *os, void *tag) { dsl_dataset_rele(os->os_dsl_dataset, tag); } void dmu_objset_disown(objset_t *os, void *tag) { dsl_dataset_disown(os->os_dsl_dataset, tag); } int dmu_objset_evict_dbufs(objset_t *os) { dnode_t *dn; mutex_enter(&os->os_lock); /* process the mdn last, since the other dnodes have holds on it */ list_remove(&os->os_dnodes, DMU_META_DNODE(os)); list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); /* * Find the first dnode with holds. We have to do this dance * because dnode_add_ref() only works if you already have a * hold. If there are no holds then it has no dbufs so OK to * skip. */ for (dn = list_head(&os->os_dnodes); dn && !dnode_add_ref(dn, FTAG); dn = list_next(&os->os_dnodes, dn)) continue; while (dn) { dnode_t *next_dn = dn; do { next_dn = list_next(&os->os_dnodes, next_dn); } while (next_dn && !dnode_add_ref(next_dn, FTAG)); mutex_exit(&os->os_lock); dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); mutex_enter(&os->os_lock); dn = next_dn; } dn = list_head(&os->os_dnodes); mutex_exit(&os->os_lock); return (dn != DMU_META_DNODE(os)); } void dmu_objset_evict(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; for (int t = 0; t < TXG_SIZE; t++) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { if (!dsl_dataset_is_snapshot(ds)) { VERIFY(0 == dsl_prop_unregister(ds, "checksum", checksum_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "compression", compression_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "copies", copies_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "dedup", dedup_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "logbias", logbias_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "sync", sync_changed_cb, os)); } VERIFY(0 == dsl_prop_unregister(ds, "primarycache", primary_cache_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", secondary_cache_changed_cb, os)); } if (os->os_sa) sa_tear_down(os); /* * We should need only a single pass over the dnode list, since * nothing can be added to the list at this point. */ (void) dmu_objset_evict_dbufs(os); dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); ASSERT3P(list_head(&os->os_dnodes), ==, NULL); VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); /* * This is a barrier to prevent the objset from going away in * dnode_move() until we can safely ensure that the objset is still in * use. We consider the objset valid before the barrier and invalid * after the barrier. */ rw_enter(&os_lock, RW_READER); rw_exit(&os_lock); mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); kmem_free(os, sizeof (objset_t)); } timestruc_t dmu_objset_snap_cmtime(objset_t *os) { return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } /* called from dsl for meta-objset */ objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); if (ds != NULL) VERIFY(0 == dmu_objset_from_ds(ds, &os)); else VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); /* * We don't want to have to increase the meta-dnode's nlevels * later, because then we could do it in quescing context while * we are also accessing it in open context. * * This precaution is not necessary for the MOS (ds == NULL), * because the MOS is only updated in syncing context. * This is most fortunate: the MOS is the only objset that * needs to be synced multiple times as spa_sync() iterates * to convergence, so minimizing its dn_nlevels matters. */ if (ds != NULL) { int levels = 1; /* * Determine the number of levels necessary for the meta-dnode * to contain DN_MAX_OBJECT dnodes. */ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < DN_MAX_OBJECT * sizeof (dnode_phys_t)) levels++; mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = mdn->dn_nlevels = levels; } ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); os->os_phys->os_type = type; if (dmu_objset_userused_enabled(os)) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; os->os_flags = os->os_phys->os_flags; } dsl_dataset_dirty(ds, tx); return (os); } struct oscarg { void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *userarg; dsl_dataset_t *clone_origin; const char *lastname; dmu_objset_type_t type; uint64_t flags; cred_t *cr; }; /*ARGSUSED*/ static int dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct oscarg *oa = arg2; objset_t *mos = dd->dd_pool->dp_meta_objset; int err; uint64_t ddobj; err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, oa->lastname, sizeof (uint64_t), 1, &ddobj); if (err != ENOENT) return (err ? err : EEXIST); if (oa->clone_origin != NULL) { /* You can't clone across pools. */ if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) return (EXDEV); /* You can only clone snapshots, not the head datasets. */ if (!dsl_dataset_is_snapshot(oa->clone_origin)) return (EINVAL); } return (0); } static void dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; spa_t *spa = dd->dd_pool->dp_spa; struct oscarg *oa = arg2; uint64_t obj; ASSERT(dmu_tx_is_syncing(tx)); obj = dsl_dataset_create_sync(dd, oa->lastname, oa->clone_origin, oa->flags, oa->cr, tx); if (oa->clone_origin == NULL) { dsl_pool_t *dp = dd->dd_pool; dsl_dataset_t *ds; blkptr_t *bp; objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); bp = dsl_dataset_get_blkptr(ds); ASSERT(BP_IS_HOLE(bp)); os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx); if (oa->userfunc) oa->userfunc(os, oa->userarg, oa->cr, tx); dsl_dataset_rele(ds, FTAG); } spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dsl_dir_t *pdd; const char *tail; int err = 0; struct oscarg oa = { 0 }; ASSERT(strchr(name, '@') == NULL); err = dsl_dir_open(name, FTAG, &pdd, &tail); if (err) return (err); if (tail == NULL) { dsl_dir_close(pdd, FTAG); return (EEXIST); } oa.userfunc = func; oa.userarg = arg; oa.lastname = tail; oa.type = type; oa.flags = flags; oa.cr = CRED(); err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, dmu_objset_create_sync, pdd, &oa, 5); dsl_dir_close(pdd, FTAG); return (err); } int dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) { dsl_dir_t *pdd; const char *tail; int err = 0; struct oscarg oa = { 0 }; ASSERT(strchr(name, '@') == NULL); err = dsl_dir_open(name, FTAG, &pdd, &tail); if (err) return (err); if (tail == NULL) { dsl_dir_close(pdd, FTAG); return (EEXIST); } oa.lastname = tail; oa.clone_origin = clone_origin; oa.flags = flags; oa.cr = CRED(); err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, dmu_objset_create_sync, pdd, &oa, 5); dsl_dir_close(pdd, FTAG); return (err); } int dmu_objset_destroy(const char *name, boolean_t defer) { dsl_dataset_t *ds; int error; error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); if (error == 0) { error = dsl_dataset_destroy(ds, FTAG, defer); /* dsl_dataset_destroy() closes the ds. */ } return (error); } struct snaparg { dsl_sync_task_group_t *dstg; char *snapname; char *htag; char failed[MAXPATHLEN]; boolean_t recursive; boolean_t needsuspend; boolean_t temporary; nvlist_t *props; struct dsl_ds_holdarg *ha; /* only needed in the temporary case */ dsl_dataset_t *newds; }; static int snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) { objset_t *os = arg1; struct snaparg *sn = arg2; int error; /* The props have already been checked by zfs_check_userprops(). */ error = dsl_dataset_snapshot_check(os->os_dsl_dataset, sn->snapname, tx); if (error) return (error); if (sn->temporary) { /* * Ideally we would just call * dsl_dataset_user_hold_check() and * dsl_dataset_destroy_check() here. However the * dataset we want to hold and destroy is the snapshot * that we just confirmed we can create, but it won't * exist until after these checks are run. Do any * checks we can here and if more checks are added to * those routines in the future, similar checks may be * necessary here. */ if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) return (ENOTSUP); /* * Not checking number of tags because the tag will be * unique, as it will be the only tag. */ if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) return (E2BIG); sn->ha = kmem_alloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); sn->ha->temphold = B_TRUE; sn->ha->htag = sn->htag; } return (error); } static void snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) { objset_t *os = arg1; dsl_dataset_t *ds = os->os_dsl_dataset; struct snaparg *sn = arg2; dsl_dataset_snapshot_sync(ds, sn->snapname, tx); if (sn->props) { dsl_props_arg_t pa; pa.pa_props = sn->props; pa.pa_source = ZPROP_SRC_LOCAL; dsl_props_set_sync(ds->ds_prev, &pa, tx); } if (sn->temporary) { struct dsl_ds_destroyarg da; dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx); kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg)); sn->ha = NULL; sn->newds = ds->ds_prev; da.ds = ds->ds_prev; da.defer = B_TRUE; dsl_dataset_destroy_sync(&da, FTAG, tx); } } static int dmu_objset_snapshot_one(const char *name, void *arg) { struct snaparg *sn = arg; objset_t *os; int err; char *cp; /* * If the objset starts with a '%', then ignore it unless it was * explicitly named (ie, not recursive). These hidden datasets * are always inconsistent, and by not opening them here, we can * avoid a race with dsl_dir_destroy_check(). */ cp = strrchr(name, '/'); if (cp && cp[1] == '%' && sn->recursive) return (0); (void) strcpy(sn->failed, name); /* * Check permissions if we are doing a recursive snapshot. The * permission checks for the starting dataset have already been * performed in zfs_secpolicy_snapshot() */ if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED()))) return (err); err = dmu_objset_hold(name, sn, &os); if (err != 0) return (err); /* * If the objset is in an inconsistent state (eg, in the process * of being destroyed), don't snapshot it. As with %hidden * datasets, we return EBUSY if this name was explicitly * requested (ie, not recursive), and otherwise ignore it. */ if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { dmu_objset_rele(os, sn); return (sn->recursive ? 0 : EBUSY); } if (sn->needsuspend) { err = zil_suspend(dmu_objset_zil(os)); if (err) { dmu_objset_rele(os, sn); return (err); } } dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync, os, sn, 3); return (0); } int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd) { dsl_sync_task_t *dst; struct snaparg sn; spa_t *spa; minor_t minor; int err; (void) strcpy(sn.failed, fsname); err = spa_open(fsname, &spa, FTAG); if (err) return (err); if (temporary) { if (cleanup_fd < 0) { spa_close(spa, FTAG); return (EINVAL); } if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { spa_close(spa, FTAG); return (err); } } sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); sn.snapname = snapname; sn.htag = tag; sn.props = props; sn.recursive = recursive; sn.needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); sn.temporary = temporary; sn.ha = NULL; sn.newds = NULL; if (recursive) { err = dmu_objset_find(fsname, dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); } else { err = dmu_objset_snapshot_one(fsname, &sn); } if (err == 0) err = dsl_sync_task_group_wait(sn.dstg); for (dst = list_head(&sn.dstg->dstg_tasks); dst; dst = list_next(&sn.dstg->dstg_tasks, dst)) { objset_t *os = dst->dst_arg1; dsl_dataset_t *ds = os->os_dsl_dataset; if (dst->dst_err) { dsl_dataset_name(ds, sn.failed); } else if (temporary) { dsl_register_onexit_hold_cleanup(sn.newds, tag, minor); } if (sn.needsuspend) zil_resume(dmu_objset_zil(os)); #ifdef __FreeBSD__ #ifdef _KERNEL if (dst->dst_err == 0 && dmu_objset_type(os) == DMU_OST_ZVOL) { char name[MAXNAMELEN]; dmu_objset_name(os, name); strlcat(name, "@", sizeof(name)); strlcat(name, snapname, sizeof(name)); zvol_create_minors(name); } #endif #endif dmu_objset_rele(os, &sn); } if (err) (void) strcpy(fsname, sn.failed); if (temporary) zfs_onexit_fd_rele(cleanup_fd); dsl_sync_task_group_destroy(sn.dstg); spa_close(spa, FTAG); return (err); } static void dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { dnode_t *dn; while (dn = list_head(list)) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* * Initialize dn_zio outside dnode_sync() because the * meta-dnode needs to set it ouside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); list_remove(list, dn); if (newlist) { (void) dnode_add_ref(dn, newlist); list_insert_tail(newlist, dn); } dnode_sync(dn, tx); } } /* ARGSUSED */ static void dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ASSERT(bp == os->os_rootbp); ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); ASSERT(BP_GET_LEVEL(bp) == 0); /* * Update rootbp fill count: it should be the number of objects * allocated in the object set (not counting the "special" * objects that are stored in the objset_phys_t -- the meta * dnode and user/group accounting objects). */ bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; } /* ARGSUSED */ static void dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } } /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_t zb; zio_prop_t zp; zio_t *zio; list_t *list; list_t *newlist = NULL; dbuf_dirty_record_t *dr; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); ASSERT(dmu_tx_is_syncing(tx)); /* XXX the write_done callback should really give us the tx... */ os->os_synctx = tx; if (os->os_dsl_dataset == NULL) { /* * This is the MOS. If we have upgraded, * spa_max_replication() could change, so reset * os_copies here. */ os->os_copies = spa_max_replication(os->os_spa); } /* * Create the root block IO */ SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); arc_release(os->os_phys_buf, &os->os_phys_buf); dmu_write_policy(os, NULL, 0, 0, &zp); zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp, dmu_objset_write_ready, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync special dnodes - the parent IO for the sync is the root block */ DMU_META_DNODE(os)->dn_zio = zio; dnode_sync(DMU_META_DNODE(os), tx); os->os_phys->os_flags = os->os_flags; if (DMU_USERUSED_DNODE(os) && DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { DMU_USERUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_USERUSED_DNODE(os), tx); DMU_GROUPUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_GROUPUSED_DNODE(os), tx); } txgoff = tx->tx_txg & TXG_MASK; if (dmu_objset_userused_enabled(os)) { newlist = &os->os_synced_dnodes; /* * We must create the list here because it uses the * dn_dirty_link[] of this txg. */ list_create(newlist, sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[txgoff])); } dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while (dr = list_head(list)) { ASSERT(dr->dr_dbuf->db_level == 0); list_remove(list, dr); if (dr->dr_zio) zio_nowait(dr->dr_zio); } /* * Free intent log blocks up to this tx. */ zil_sync(os->os_zil, tx); os->os_phys->os_zil_header = os->os_zil_header; zio_nowait(zio); } boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg) { return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); } static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) { used_cbs[ost] = cb; } boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && used_cbs[os->os_phys->os_type] != NULL && DMU_USERUSED_DNODE(os) != NULL); } static void do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) { if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { int64_t delta = DNODE_SIZE + used; if (subtract) delta = -delta; VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, user, delta, tx)); VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, group, delta, tx)); } } void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) { dnode_t *dn; list_t *list = &os->os_synced_dnodes; ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); while (dn = list_head(list)) { int flags; ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED); /* Allocate the user/groupused objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { VERIFY(0 == zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); VERIFY(0 == zap_create_claim(os, DMU_GROUPUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } /* * We intentionally modify the zap object even if the * net delta is zero. Otherwise * the block of the zap obj could be shared between * datasets but need to be different between them after * a bprewrite. */ flags = dn->dn_id_flags; ASSERT(flags); if (flags & DN_ID_OLD_EXIST) { do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); } if (flags & DN_ID_NEW_EXIST) { do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid, B_FALSE, tx); } mutex_enter(&dn->dn_mtx); dn->dn_oldused = 0; dn->dn_oldflags = 0; if (dn->dn_id_flags & DN_ID_NEW_EXIST) { dn->dn_olduid = dn->dn_newuid; dn->dn_oldgid = dn->dn_newgid; dn->dn_id_flags |= DN_ID_OLD_EXIST; if (dn->dn_bonuslen == 0) dn->dn_id_flags |= DN_ID_CHKED_SPILL; else dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); mutex_exit(&dn->dn_mtx); list_remove(list, dn); dnode_rele(dn, list); } } /* * Returns a pointer to data to find uid/gid from * * If a dirty record for transaction group that is syncing can't * be found then NULL is returned. In the NULL case it is assumed * the uid/gid aren't changing. */ static void * dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) { dbuf_dirty_record_t *dr, **drp; void *data; if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg == tx->tx_txg) break; if (dr == NULL) { data = NULL; } else { dnode_t *dn; DB_DNODE_ENTER(dr->dr_dbuf); dn = DB_DNODE(dr->dr_dbuf); if (dn->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; DB_DNODE_EXIT(dr->dr_dbuf); } return (data); } void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; void *data = NULL; dmu_buf_impl_t *db = NULL; - uint64_t *user, *group; + uint64_t *user = NULL; + uint64_t *group = NULL; int flags = dn->dn_id_flags; int error; boolean_t have_spill = B_FALSE; if (!dmu_objset_userused_enabled(dn->dn_objset)) return; if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| DN_ID_CHKED_SPILL))) return; if (before && dn->dn_bonuslen != 0) data = DN_BONUS(dn->dn_phys); else if (!before && dn->dn_bonuslen != 0) { if (dn->dn_bonus) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); data = dmu_objset_userquota_find_data(db, tx); } else { data = DN_BONUS(dn->dn_phys); } } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { int rf = 0; if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; error = dmu_spill_hold_by_dnode(dn, rf | DB_RF_MUST_SUCCEED, FTAG, (dmu_buf_t **)&db); ASSERT(error == 0); mutex_enter(&db->db_mtx); data = (before) ? db->db.db_data : dmu_objset_userquota_find_data(db, tx); have_spill = B_TRUE; } else { mutex_enter(&dn->dn_mtx); dn->dn_id_flags |= DN_ID_CHKED_BONUS; mutex_exit(&dn->dn_mtx); return; } if (before) { ASSERT(data); user = &dn->dn_olduid; group = &dn->dn_oldgid; } else if (data) { user = &dn->dn_newuid; group = &dn->dn_newgid; } /* * Must always call the callback in case the object * type has changed and that type isn't an object type to track */ error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, user, group); /* * Preserve existing uid/gid when the callback can't determine * what the new uid/gid are and the callback returned EEXIST. * The EEXIST error tells us to just use the existing uid/gid. * If we don't know what the old values are then just assign * them to 0, since that is a new file being created. */ if (!before && data == NULL && error == EEXIST) { if (flags & DN_ID_OLD_EXIST) { dn->dn_newuid = dn->dn_olduid; dn->dn_newgid = dn->dn_oldgid; } else { dn->dn_newuid = 0; dn->dn_newgid = 0; } error = 0; } if (db) mutex_exit(&db->db_mtx); mutex_enter(&dn->dn_mtx); if (error == 0 && before) dn->dn_id_flags |= DN_ID_OLD_EXIST; if (error == 0 && !before) dn->dn_id_flags |= DN_ID_NEW_EXIST; if (have_spill) { dn->dn_id_flags |= DN_ID_CHKED_SPILL; } else { dn->dn_id_flags |= DN_ID_CHKED_BONUS; } mutex_exit(&dn->dn_mtx); if (have_spill) dmu_buf_rele((dmu_buf_t *)db, FTAG); } boolean_t dmu_objset_userspace_present(objset_t *os) { return (os->os_phys->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); } int dmu_objset_userspace_upgrade(objset_t *os) { uint64_t obj; int err = 0; if (dmu_objset_userspace_present(os)) return (0); if (!dmu_objset_userused_enabled(os)) return (ENOTSUP); if (dmu_objset_is_snapshot(os)) return (EINVAL); /* * We simply need to mark every object dirty, so that it will be * synced out and now accounted. If this is called * concurrently, or if we already did some work before crashing, * that's fine, since we track each object's accounted state * independently. */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { dmu_tx_t *tx; dmu_buf_t *db; int objerr; if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); objerr = dmu_bonus_hold(os, obj, FTAG, &db); if (objerr) continue; tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); if (objerr) { dmu_tx_abort(tx); continue; } dmu_buf_will_dirty(db, tx); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; txg_wait_synced(dmu_objset_pool(os), 0); return (0); } void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, usedobjsp, availobjsp); } uint64_t dmu_objset_fsid_guid(objset_t *os) { return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); } void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) { stat->dds_type = os->os_phys->os_type; if (os->os_dsl_dataset) dsl_dataset_fast_stat(os->os_dsl_dataset, stat); } void dmu_objset_stats(objset_t *os, nvlist_t *nv) { ASSERT(os->os_dsl_dataset || os->os_phys->os_type == DMU_OST_META); if (os->os_dsl_dataset != NULL) dsl_dataset_stats(os->os_dsl_dataset, nv); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, os->os_phys->os_type); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, dmu_objset_userspace_present(os)); } int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); else return (B_FALSE); } int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; if (ds->ds_phys->ds_snapnames_zapobj == 0) return (ENOENT); return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, real, maxlen, conflict)); } int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; zap_cursor_t cursor; zap_attribute_t attr; if (ds->ds_phys->ds_snapnames_zapobj == 0) return (ENOENT); zap_cursor_init_serialized(&cursor, ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_phys->ds_snapnames_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (ENOENT); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (ENAMETOOLONG); } (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; if (case_conflict) *case_conflict = attr.za_normalization_conflict; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp) { dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; zap_cursor_t cursor; zap_attribute_t attr; /* there is no next dir on a snapshot! */ if (os->os_dsl_dataset->ds_object != dd->dd_phys->dd_head_dataset_obj) return (ENOENT); zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (ENOENT); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (ENAMETOOLONG); } (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } struct findarg { int (*func)(const char *, void *); void *arg; }; /* ARGSUSED */ static int findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) { struct findarg *fa = arg; return (fa->func(dsname, fa->arg)); } /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. * Perhaps change all callers to use dmu_objset_find_spa()? */ int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags) { struct findarg fa; fa.func = func; fa.arg = arg; return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); } /* * Find all objsets under name, call func on each */ int dmu_objset_find_spa(spa_t *spa, const char *name, int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) { dsl_dir_t *dd; dsl_pool_t *dp; dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; char *child; uint64_t thisobj; int err; if (name == NULL) name = spa_name(spa); err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); if (err) return (err); /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_close(dd, FTAG); return (0); } thisobj = dd->dd_phys->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); dp = dd->dd_pool; /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT(attr->za_integer_length == sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); child = kmem_asprintf("%s/%s", name, attr->za_name); err = dmu_objset_find_spa(spa, child, func, arg, flags); strfree(child); if (err) break; } zap_cursor_fini(&zc); if (err) { dsl_dir_close(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } } /* * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { if (!dsl_pool_sync_context(dp)) rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (!dsl_pool_sync_context(dp)) rw_exit(&dp->dp_config_rwlock); if (err == 0) { uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT(attr->za_integer_length == sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); child = kmem_asprintf("%s@%s", name, attr->za_name); err = func(spa, attr->za_first_integer, child, arg); strfree(child); if (err) break; } zap_cursor_fini(&zc); } } dsl_dir_close(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); if (err) return (err); /* * Apply to self if appropriate. */ err = func(spa, thisobj, name, arg); return (err); } /* ARGSUSED */ int dmu_objset_prefetch(const char *name, void *arg) { dsl_dataset_t *ds; if (dsl_dataset_hold(name, FTAG, &ds)) return (0); if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); (void) arc_read(NULL, dsl_dataset_get_spa(ds), &ds->ds_phys->ds_bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } mutex_exit(&ds->ds_opening_lock); } dsl_dataset_rele(ds, FTAG); return (0); } void dmu_objset_set_user(objset_t *os, void *user_ptr) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); os->os_user_ptr = user_ptr; } void * dmu_objset_get_user(objset_t *os) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); return (os->os_user_ptr); } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c (revision 247192) @@ -1,4364 +1,4361 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Portions Copyright (c) 2011 Martin Matuska */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static char *dsl_reaper = "the grim reaper"; static dsl_checkfunc_t dsl_dataset_destroy_begin_check; static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; static dsl_syncfunc_t dsl_dataset_set_reservation_sync; #define SWITCH64(x, y) \ { \ uint64_t __tmp = (x); \ (x) = (y); \ (y) = __tmp; \ } #define DS_REF_MAX (1ULL << 62) #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE #define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) /* * Figure out how much of this delta should be propogated to the dsl_dir * layer. If there's a refreservation, that space has already been * partially accounted for in our ancestors. */ static int64_t parent_delta(dsl_dataset_t *ds, int64_t delta) { uint64_t old_bytes, new_bytes; if (ds->ds_reserved == 0) return (delta); old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); return (new_bytes - old_bytes); } void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ if (BP_IS_HOLE(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { dsl_pool_mos_diduse_space(tx->tx_pool, used, compressed, uncompressed); return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); ds->ds_phys->ds_referenced_bytes += used; ds->ds_phys->ds_compressed_bytes += compressed; ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, compressed, uncompressed, tx); dsl_dir_transfer_space(ds->ds_dir, used - delta, DD_USED_REFRSRV, DD_USED_HEAD, tx); mutex_exit(&ds->ds_dir->dd_lock); } int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) { if (BP_IS_HOLE(bp)) return (0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(bp->blk_birth <= tx->tx_txg); int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); ASSERT(used > 0); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); dsl_pool_mos_diduse_space(tx->tx_pool, -used, -compressed, -uncompressed); return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); ASSERT(!dsl_dataset_is_snapshot(ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { int64_t delta; dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); ASSERT(ds->ds_phys->ds_unique_bytes >= used || !DS_UNIQUE_IS_ACCURATE(ds)); delta = parent_delta(ds, -used); ds->ds_phys->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, -compressed, -uncompressed, tx); dsl_dir_transfer_space(ds->ds_dir, -used - delta, DD_USED_REFRSRV, DD_USED_HEAD, tx); mutex_exit(&ds->ds_dir->dd_lock); } else { dprintf_bp(bp, "putting on dead list: %s", ""); if (async) { /* * We are here as part of zio's write done callback, * which means we're a zio interrupt thread. We can't * call dsl_deadlist_insert() now because it may block * waiting for I/O. Instead, put bp on the deferred * queue and let dsl_pool_sync() finish the job. */ bplist_append(&ds->ds_pending_deadlist, bp); } else { dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); } ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object && bp->blk_birth > ds->ds_prev->ds_phys->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); ds->ds_prev->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } } mutex_enter(&ds->ds_lock); ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used); ds->ds_phys->ds_referenced_bytes -= used; ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); ds->ds_phys->ds_compressed_bytes -= compressed; ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); ds->ds_phys->ds_uncompressed_bytes -= uncompressed; mutex_exit(&ds->ds_lock); return (used); } uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) { uint64_t trysnap = 0; if (ds == NULL) return (0); /* * The snapshot creation could fail, but that would cause an * incorrect FALSE return, which would only result in an * overestimation of the amount of space that an operation would * consume, which is OK. * * There's also a small window where we could miss a pending * snapshot, because we could set the sync task in the quiescing * phase. So this should only be used as a guess. */ if (ds->ds_trysnap_txg > spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) trysnap = ds->ds_trysnap_txg; return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); } boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, uint64_t blk_birth) { if (blk_birth <= dsl_dataset_prev_snap_txg(ds)) return (B_FALSE); ddt_prefetch(dsl_dataset_get_spa(ds), bp); return (B_TRUE); } /* ARGSUSED */ static void dsl_dataset_evict(dmu_buf_t *db, void *dsv) { dsl_dataset_t *ds = dsv; ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); unique_remove(ds->ds_fsid_guid); if (ds->ds_objset != NULL) dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { dsl_dataset_drop_ref(ds->ds_prev, ds); ds->ds_prev = NULL; } bplist_destroy(&ds->ds_pending_deadlist); if (db != NULL) { dsl_deadlist_close(&ds->ds_deadlist); } else { ASSERT(ds->ds_deadlist.dl_dbuf == NULL); ASSERT(!ds->ds_deadlist.dl_oldfmt); } if (ds->ds_dir) dsl_dir_close(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); if (mutex_owned(&ds->ds_lock)) mutex_exit(&ds->ds_lock); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_recvlock); if (mutex_owned(&ds->ds_opening_lock)) mutex_exit(&ds->ds_opening_lock); mutex_destroy(&ds->ds_opening_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); } static int dsl_dataset_get_snapname(dsl_dataset_t *ds) { dsl_dataset_phys_t *headphys; int err; dmu_buf_t *headdbuf; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; if (ds->ds_snapname[0]) return (0); if (ds->ds_phys->ds_next_snap_obj == 0) return (0); err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &headdbuf); if (err) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); dmu_buf_rele(headdbuf, FTAG); return (err); } static int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; matchtype_t mt; int err; if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; err = zap_lookup_norm(mos, snapobj, name, 8, 1, value, mt, NULL, 0, NULL); if (err == ENOTSUP && mt == MT_FIRST) err = zap_lookup(mos, snapobj, name, 8, 1, value); return (err); } static int dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; matchtype_t mt; int err; dsl_dir_snap_cmtime_update(ds->ds_dir); if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; err = zap_remove_norm(mos, snapobj, name, mt, tx); if (err == ENOTSUP && mt == MT_FIRST) err = zap_remove(mos, snapobj, name, tx); return (err); } static int dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; dsl_dataset_t *ds; int err; dmu_object_info_t doi; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); if (err) return (err); /* Make sure dsobj has the correct object type. */ dmu_object_info_from_db(dbuf, &doi); if (doi.doi_type != DMU_OT_DSL_DATASET) return (EINVAL); ds = dmu_buf_get_user(dbuf); if (ds == NULL) { - dsl_dataset_t *winner; + dsl_dataset_t *winner = NULL; ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&ds->ds_rwlock, 0, 0, 0); cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); bplist_create(&ds->ds_pending_deadlist); dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), offsetof(dmu_sendarg_t, dsa_link)); if (err == 0) { err = dsl_dir_open_obj(dp, ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); } if (err) { mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); } if (!dsl_dataset_is_snapshot(ds)) { ds->ds_snapname[0] = '\0'; if (ds->ds_phys->ds_prev_snap_obj) { err = dsl_dataset_get_ref(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev); } } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) err = dsl_dataset_get_snapname(ds); if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { err = zap_count( ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_phys->ds_userrefs_obj, &ds->ds_userrefs); } } if (err == 0 && !dsl_dataset_is_snapshot(ds)) { /* * In sync context, we're called with either no lock * or with the write lock. If we're not syncing, * we're always called with the read lock held. */ boolean_t need_lock = !RW_WRITE_HELD(&dp->dp_config_rwlock) && dsl_pool_sync_context(dp); if (need_lock) rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_prop_get_ds(ds, "refreservation", sizeof (uint64_t), 1, &ds->ds_reserved, NULL); if (err == 0) { err = dsl_prop_get_ds(ds, "refquota", sizeof (uint64_t), 1, &ds->ds_quota, NULL); } if (need_lock) rw_exit(&dp->dp_config_rwlock); } else { ds->ds_reserved = ds->ds_quota = 0; } - if (err == 0) { - winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, - dsl_dataset_evict); - } - if (err || winner) { + if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds, + &ds->ds_phys, dsl_dataset_evict)) != NULL) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); kmem_free(ds, sizeof (dsl_dataset_t)); if (err) { dmu_buf_rele(dbuf, tag); return (err); } ds = winner; } else { ds->ds_fsid_guid = unique_insert(ds->ds_phys->ds_fsid_guid); } } ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(ds->ds_phys, ==, dbuf->db_data); ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); mutex_enter(&ds->ds_lock); if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { mutex_exit(&ds->ds_lock); dmu_buf_rele(ds->ds_dbuf, tag); return (ENOENT); } mutex_exit(&ds->ds_lock); *dsp = ds; return (0); } static int dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) { dsl_pool_t *dp = ds->ds_dir->dd_pool; /* * In syncing context we don't want the rwlock lock: there * may be an existing writer waiting for sync phase to * finish. We don't need to worry about such writers, since * sync phase is single-threaded, so the writer can't be * doing anything while we are active. */ if (dsl_pool_sync_context(dp)) { ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); return (0); } /* * Normal users will hold the ds_rwlock as a READER until they * are finished (i.e., call dsl_dataset_rele()). "Owners" will * drop their READER lock after they set the ds_owner field. * * If the dataset is being destroyed, the destroy thread will * obtain a WRITER lock for exclusive access after it's done its * open-context work and then change the ds_owner to * dsl_reaper once destruction is assured. So threads * may block here temporarily, until the "destructability" of * the dataset is determined. */ ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); mutex_enter(&ds->ds_lock); while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { rw_exit(&dp->dp_config_rwlock); cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); if (DSL_DATASET_IS_DESTROYED(ds)) { mutex_exit(&ds->ds_lock); dsl_dataset_drop_ref(ds, tag); rw_enter(&dp->dp_config_rwlock, RW_READER); return (ENOENT); } /* * The dp_config_rwlock lives above the ds_lock. And * we need to check DSL_DATASET_IS_DESTROYED() while * holding the ds_lock, so we have to drop and reacquire * the ds_lock here. */ mutex_exit(&ds->ds_lock); rw_enter(&dp->dp_config_rwlock, RW_READER); mutex_enter(&ds->ds_lock); } mutex_exit(&ds->ds_lock); return (0); } int dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_t **dsp) { int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); if (err) return (err); return (dsl_dataset_hold_ref(*dsp, tag)); } int dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); if (err) return (err); if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { dsl_dataset_rele(*dsp, tag); *dsp = NULL; return (EBUSY); } return (0); } int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) { dsl_dir_t *dd; dsl_pool_t *dp; const char *snapname; uint64_t obj; int err = 0; err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); if (err) return (err); dp = dd->dd_pool; obj = dd->dd_phys->dd_head_dataset_obj; rw_enter(&dp->dp_config_rwlock, RW_READER); if (obj) err = dsl_dataset_get_ref(dp, obj, tag, dsp); else err = ENOENT; if (err) goto out; err = dsl_dataset_hold_ref(*dsp, tag); /* we may be looking for a snapshot */ if (err == 0 && snapname != NULL) { dsl_dataset_t *ds = NULL; if (*snapname++ != '@') { dsl_dataset_rele(*dsp, tag); err = ENOENT; goto out; } dprintf("looking for snapshot '%s'\n", snapname); err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); if (err == 0) err = dsl_dataset_get_ref(dp, obj, tag, &ds); dsl_dataset_rele(*dsp, tag); ASSERT3U((err == 0), ==, (ds != NULL)); if (ds) { mutex_enter(&ds->ds_lock); if (ds->ds_snapname[0] == 0) (void) strlcpy(ds->ds_snapname, snapname, sizeof (ds->ds_snapname)); mutex_exit(&ds->ds_lock); err = dsl_dataset_hold_ref(ds, tag); *dsp = err ? NULL : ds; } } out: rw_exit(&dp->dp_config_rwlock); dsl_dir_close(dd, FTAG); return (err); } int dsl_dataset_own(const char *name, boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp) { int err = dsl_dataset_hold(name, tag, dsp); if (err) return (err); if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { dsl_dataset_rele(*dsp, tag); return (EBUSY); } return (0); } void dsl_dataset_name(dsl_dataset_t *ds, char *name) { if (ds == NULL) { (void) strcpy(name, "mos"); } else { dsl_dir_name(ds->ds_dir, name); VERIFY(0 == dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { (void) strcat(name, "@"); /* * We use a "recursive" mutex so that we * can call dprintf_ds() with ds_lock held. */ if (!MUTEX_HELD(&ds->ds_lock)) { mutex_enter(&ds->ds_lock); (void) strcat(name, ds->ds_snapname); mutex_exit(&ds->ds_lock); } else { (void) strcat(name, ds->ds_snapname); } } } } static int dsl_dataset_namelen(dsl_dataset_t *ds) { int result; if (ds == NULL) { result = 3; /* "mos" */ } else { result = dsl_dir_namelen(ds->ds_dir); VERIFY(0 == dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { ++result; /* adding one for the @-sign */ if (!MUTEX_HELD(&ds->ds_lock)) { mutex_enter(&ds->ds_lock); result += strlen(ds->ds_snapname); mutex_exit(&ds->ds_lock); } else { result += strlen(ds->ds_snapname); } } } return (result); } void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) { dmu_buf_rele(ds->ds_dbuf, tag); } void dsl_dataset_rele(dsl_dataset_t *ds, void *tag) { if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { rw_exit(&ds->ds_rwlock); } dsl_dataset_drop_ref(ds, tag); } void dsl_dataset_disown(dsl_dataset_t *ds, void *tag) { ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; if (RW_WRITE_HELD(&ds->ds_rwlock)) { rw_exit(&ds->ds_rwlock); cv_broadcast(&ds->ds_exclusive_cv); } mutex_exit(&ds->ds_lock); if (ds->ds_dbuf) dsl_dataset_drop_ref(ds, tag); else dsl_dataset_evict(NULL, ds); } boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) { boolean_t gotit = FALSE; mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { ds->ds_owner = tag; if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) rw_exit(&ds->ds_rwlock); gotit = TRUE; } mutex_exit(&ds->ds_lock); return (gotit); } void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) { ASSERT3P(owner, ==, ds->ds_owner); if (!RW_WRITE_HELD(&ds->ds_rwlock)) rw_enter(&ds->ds_rwlock, RW_WRITER); } uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; uint64_t dsobj; objset_t *mos = dp->dp_meta_objset; if (origin == NULL) origin = dp->dp_origin_snap; ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); do { (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); } while (dsphys->ds_guid == 0); dsphys->ds_snapnames_zapobj = zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; if (origin == NULL) { dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); } else { dsl_dataset_t *ohds; dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = origin->ds_phys->ds_creation_txg; dsphys->ds_referenced_bytes = origin->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = origin->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = origin->ds_phys->ds_uncompressed_bytes; dsphys->ds_bp = origin->ds_phys->ds_bp; dsphys->ds_flags |= origin->ds_phys->ds_flags; dmu_buf_will_dirty(origin->ds_dbuf, tx); origin->ds_phys->ds_num_children++; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); dsl_dataset_rele(ohds, FTAG); if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { if (origin->ds_phys->ds_next_clones_obj == 0) { origin->ds_phys->ds_next_clones_obj = zap_create(mos, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY(0 == zap_add_int(mos, origin->ds_phys->ds_next_clones_obj, dsobj, tx)); } dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_origin_obj = origin->ds_object; if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { if (origin->ds_dir->dd_phys->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); origin->ds_dir->dd_phys->dd_clones = zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY3U(0, ==, zap_add_int(mos, origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); } } if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_head_dataset_obj = dsobj; return (dsobj); } uint64_t dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) { dsl_pool_t *dp = pdd->dd_pool; uint64_t dsobj, ddobj; dsl_dir_t *dd; ASSERT(lastname[0] != '@'); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); dsl_deleg_set_create_perms(dd, tx, cr); dsl_dir_close(dd, FTAG); /* * If we are creating a clone, make sure we zero out any stale * data from the origin snapshots zil header. */ if (origin != NULL) { dsl_dataset_t *ds; objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); bzero(&os->os_zil_header, sizeof (os->os_zil_header)); dsl_dataset_dirty(ds, tx); dsl_dataset_rele(ds, FTAG); } return (dsobj); } #ifdef __FreeBSD__ /* FreeBSD ioctl compat begin */ struct destroyarg { nvlist_t *nvl; const char *snapname; }; static int dsl_check_snap_cb(const char *name, void *arg) { struct destroyarg *da = arg; dsl_dataset_t *ds; char *dsname; dsname = kmem_asprintf("%s@%s", name, da->snapname); VERIFY(nvlist_add_boolean(da->nvl, dsname) == 0); return (0); } int dmu_get_recursive_snaps_nvl(const char *fsname, const char *snapname, nvlist_t *snaps) { struct destroyarg *da; int err; da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP); da->nvl = snaps; da->snapname = snapname; err = dmu_objset_find(fsname, dsl_check_snap_cb, da, DS_FIND_CHILDREN); kmem_free(da, sizeof (struct destroyarg)); return (err); } /* FreeBSD ioctl compat end */ #endif /* __FreeBSD__ */ /* * The snapshots must all be in the same pool. */ int dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) { int err; dsl_sync_task_t *dst; spa_t *spa; nvpair_t *pair; dsl_sync_task_group_t *dstg; pair = nvlist_next_nvpair(snaps, NULL); if (pair == NULL) return (0); err = spa_open(nvpair_name(pair), &spa, FTAG); if (err) return (err); dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { dsl_dataset_t *ds; err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); if (err == 0) { struct dsl_ds_destroyarg *dsda; dsl_dataset_make_exclusive(ds, dstg); dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); dsda->ds = ds; dsda->defer = defer; dsl_sync_task_create(dstg, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, dsda, dstg, 0); } else if (err == ENOENT) { err = 0; } else { (void) strcpy(failed, nvpair_name(pair)); break; } } if (err == 0) err = dsl_sync_task_group_wait(dstg); for (dst = list_head(&dstg->dstg_tasks); dst; dst = list_next(&dstg->dstg_tasks, dst)) { struct dsl_ds_destroyarg *dsda = dst->dst_arg1; dsl_dataset_t *ds = dsda->ds; /* * Return the file system name that triggered the error */ if (dst->dst_err) { dsl_dataset_name(ds, failed); } ASSERT3P(dsda->rm_origin, ==, NULL); dsl_dataset_disown(ds, dstg); kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); } dsl_sync_task_group_destroy(dstg); spa_close(spa, FTAG); return (err); } static boolean_t dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) { boolean_t might_destroy = B_FALSE; mutex_enter(&ds->ds_lock); if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && DS_IS_DEFER_DESTROY(ds)) might_destroy = B_TRUE; mutex_exit(&ds->ds_lock); return (might_destroy); } /* * If we're removing a clone, and these three conditions are true: * 1) the clone's origin has no other children * 2) the clone's origin has no user references * 3) the clone's origin has been marked for deferred destruction * Then, prepare to remove the origin as part of this sync task group. */ static int dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) { dsl_dataset_t *ds = dsda->ds; dsl_dataset_t *origin = ds->ds_prev; if (dsl_dataset_might_destroy_origin(origin)) { char *name; int namelen; int error; namelen = dsl_dataset_namelen(origin) + 1; name = kmem_alloc(namelen, KM_SLEEP); dsl_dataset_name(origin, name); #ifdef _KERNEL error = zfs_unmount_snap(name, NULL); if (error) { kmem_free(name, namelen); return (error); } #endif error = dsl_dataset_own(name, B_TRUE, tag, &origin); kmem_free(name, namelen); if (error) return (error); dsda->rm_origin = origin; dsl_dataset_make_exclusive(origin, tag); } return (0); } /* * ds must be opened as OWNER. On return (whether successful or not), * ds will be closed and caller can no longer dereference it. */ int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) { int err; dsl_sync_task_group_t *dstg; objset_t *os; dsl_dir_t *dd; uint64_t obj; struct dsl_ds_destroyarg dsda = { 0 }; dsl_dataset_t dummy_ds = { 0 }; dsda.ds = ds; if (dsl_dataset_is_snapshot(ds)) { /* Destroying a snapshot is simpler */ dsl_dataset_make_exclusive(ds, tag); dsda.defer = defer; err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, &dsda, tag, 0); ASSERT3P(dsda.rm_origin, ==, NULL); goto out; } else if (defer) { err = EINVAL; goto out; } dd = ds->ds_dir; dummy_ds.ds_dir = dd; dummy_ds.ds_object = ds->ds_object; if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { /* * Check for errors and mark this ds as inconsistent, in * case we crash while freeing the objects. */ err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, dsl_dataset_destroy_begin_sync, ds, NULL, 0); if (err) goto out; err = dmu_objset_from_ds(ds, &os); if (err) goto out; /* * Remove all objects while in the open context so that * there is less work to do in the syncing context. */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, ds->ds_phys->ds_prev_snap_txg)) { /* * Ignore errors, if there is not enough disk space * we will deal with it in dsl_dataset_destroy_sync(). */ (void) dmu_free_object(os, obj); } if (err != ESRCH) goto out; /* * Sync out all in-flight IO. */ txg_wait_synced(dd->dd_pool, 0); /* * If we managed to free all the objects in open * context, the user space accounting should be zero. */ if (ds->ds_phys->ds_bp.blk_fill == 0 && dmu_objset_userused_enabled(os)) { uint64_t count; ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || count == 0); ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || count == 0); } } rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); rw_exit(&dd->dd_pool->dp_config_rwlock); if (err) goto out; /* * Blow away the dsl_dir + head dataset. */ dsl_dataset_make_exclusive(ds, tag); /* * If we're removing a clone, we might also need to remove its * origin. */ do { dsda.need_prep = B_FALSE; if (dsl_dir_is_clone(dd)) { err = dsl_dataset_origin_rm_prep(&dsda, tag); if (err) { dsl_dir_close(dd, FTAG); goto out; } } dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); dsl_sync_task_create(dstg, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, &dsda, tag, 0); dsl_sync_task_create(dstg, dsl_dir_destroy_check, dsl_dir_destroy_sync, &dummy_ds, FTAG, 0); err = dsl_sync_task_group_wait(dstg); dsl_sync_task_group_destroy(dstg); /* * We could be racing against 'zfs release' or 'zfs destroy -d' * on the origin snap, in which case we can get EBUSY if we * needed to destroy the origin snap but were not ready to * do so. */ if (dsda.need_prep) { ASSERT(err == EBUSY); ASSERT(dsl_dir_is_clone(dd)); ASSERT(dsda.rm_origin == NULL); } } while (dsda.need_prep); if (dsda.rm_origin != NULL) dsl_dataset_disown(dsda.rm_origin, tag); /* if it is successful, dsl_dir_destroy_sync will close the dd */ if (err) dsl_dir_close(dd, FTAG); out: dsl_dataset_disown(ds, tag); return (err); } blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { return (&ds->ds_phys->ds_bp); } void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); /* If it's the meta-objset, set dp_meta_rootbp */ if (ds == NULL) { tx->tx_pool->dp_meta_rootbp = *bp; } else { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_bp = *bp; } } spa_t * dsl_dataset_get_spa(dsl_dataset_t *ds) { return (ds->ds_dir->dd_pool->dp_spa); } void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp; if (ds == NULL) /* this is the meta-objset */ return; ASSERT(ds->ds_objset != NULL); if (ds->ds_phys->ds_next_snap_obj != 0) panic("dirtying snapshot!"); dp = ds->ds_dir->dd_pool; if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, ds); } } boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds) { for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, ds, t)) return (B_TRUE); } return (B_FALSE); } /* * The unique space in the head dataset can be calculated by subtracting * the space used in the most recent snapshot, that is still being used * in this file system, from the space currently in use. To figure out * the space in the most recent snapshot still in use, we need to take * the total space used in the snapshot and subtract out the space that * has been freed up since the snapshot was taken. */ static void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) { uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; ASSERT(!dsl_dataset_is_snapshot(ds)); if (ds->ds_phys->ds_prev_snap_obj != 0) mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes; else mrs_used = 0; dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); ASSERT3U(dlused, <=, mrs_used); ds->ds_phys->ds_unique_bytes = ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused); if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } struct killarg { dsl_dataset_t *ds; dmu_tx_t *tx; }; /* ARGSUSED */ static int kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; if (bp == NULL) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { ASSERT(zilog != NULL); /* * It's a block in the intent log. It has no * accounting, so just free it. */ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { ASSERT(zilog == NULL); ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } return (0); } /* ARGSUSED */ static int dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t count; int err; /* * Can't delete a head dataset if there are snapshots of it. * (Except if the only snapshots are from the branch we cloned * from.) */ if (ds->ds_prev != NULL && ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) return (EBUSY); /* * This is really a dsl_dir thing, but check it here so that * we'll be less likely to leave this dataset inconsistent & * nearly destroyed. */ err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); if (err) return (err); if (count != 0) return (EEXIST); return (0); } /* ARGSUSED */ static void dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_pool_t *dp = ds->ds_dir->dd_pool; /* Mark it as inconsistent on-disk, in case we crash */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, "dataset = %llu", ds->ds_object); } static int dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, dmu_tx_t *tx) { dsl_dataset_t *ds = dsda->ds; dsl_dataset_t *ds_prev = ds->ds_prev; if (dsl_dataset_might_destroy_origin(ds_prev)) { struct dsl_ds_destroyarg ndsda = {0}; /* * If we're not prepared to remove the origin, don't remove * the clone either. */ if (dsda->rm_origin == NULL) { dsda->need_prep = B_TRUE; return (EBUSY); } ndsda.ds = ds_prev; ndsda.is_origin_rm = B_TRUE; return (dsl_dataset_destroy_check(&ndsda, tag, tx)); } /* * If we're not going to remove the origin after all, * undo the open context setup. */ if (dsda->rm_origin != NULL) { dsl_dataset_disown(dsda->rm_origin, tag); dsda->rm_origin = NULL; } return (0); } /* * If you add new checks here, you may need to add * additional checks to the "temporary" case in * snapshot_check() in dmu_objset.c. */ /* ARGSUSED */ int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { struct dsl_ds_destroyarg *dsda = arg1; dsl_dataset_t *ds = dsda->ds; /* we have an owner hold, so noone else can destroy us */ ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); /* * Only allow deferred destroy on pools that support it. * NOTE: deferred destroy is only supported on snapshots. */ if (dsda->defer) { if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) return (ENOTSUP); ASSERT(dsl_dataset_is_snapshot(ds)); return (0); } /* * Can't delete a head dataset if there are snapshots of it. * (Except if the only snapshots are from the branch we cloned * from.) */ if (ds->ds_prev != NULL && ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) return (EBUSY); /* * If we made changes this txg, traverse_dsl_dataset won't find * them. Try again. */ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) return (EAGAIN); if (dsl_dataset_is_snapshot(ds)) { /* * If this snapshot has an elevated user reference count, * we can't destroy it yet. */ if (ds->ds_userrefs > 0 && !dsda->releasing) return (EBUSY); mutex_enter(&ds->ds_lock); /* * Can't delete a branch point. However, if we're destroying * a clone and removing its origin due to it having a user * hold count of 0 and having been marked for deferred destroy, * it's OK for the origin to have a single clone. */ if (ds->ds_phys->ds_num_children > (dsda->is_origin_rm ? 2 : 1)) { mutex_exit(&ds->ds_lock); return (EEXIST); } mutex_exit(&ds->ds_lock); } else if (dsl_dir_is_clone(ds->ds_dir)) { return (dsl_dataset_origin_check(dsda, arg2, tx)); } /* XXX we should do some i/o error checking... */ return (0); } struct refsarg { kmutex_t lock; boolean_t gone; kcondvar_t cv; }; /* ARGSUSED */ static void dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) { struct refsarg *arg = argv; mutex_enter(&arg->lock); arg->gone = TRUE; cv_signal(&arg->cv); mutex_exit(&arg->lock); } static void dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) { struct refsarg arg; bzero(&arg, sizeof(arg)); mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); arg.gone = FALSE; (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, dsl_dataset_refs_gone); dmu_buf_rele(ds->ds_dbuf, tag); mutex_enter(&arg.lock); while (!arg.gone) cv_wait(&arg.cv, &arg.lock); ASSERT(arg.gone); mutex_exit(&arg.lock); ds->ds_dbuf = NULL; ds->ds_phys = NULL; mutex_destroy(&arg.lock); cv_destroy(&arg.cv); } static void remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t count; int err; ASSERT(ds->ds_phys->ds_num_children >= 2); err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); /* * The err should not be ENOENT, but a bug in a previous version * of the code could cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a missing entry. * If we knew that the pool was created after * SPA_VERSION_NEXT_CLONES, we could assert that it isn't * ENOENT. However, at least we can check that we don't have * too many entries in the next_clones_obj even after failing to * remove this one. */ if (err != ENOENT) { VERIFY0(err); } ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count)); ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); } static void dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; zap_cursor_t zc; zap_attribute_t za; /* * If it is the old version, dd_clones doesn't exist so we can't * find the clones, but deadlist_remove_key() is a no-op so it * doesn't matter. */ if (ds->ds_dir->dd_phys->dd_clones == 0) return; for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, za.za_first_integer, FTAG, &clone)); if (clone->ds_dir->dd_origin_txg > mintxg) { dsl_deadlist_remove_key(&clone->ds_deadlist, mintxg, tx); dsl_dataset_remove_clones_key(clone, mintxg, tx); } dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(&zc); } struct process_old_arg { dsl_dataset_t *ds; dsl_dataset_t *ds_prev; boolean_t after_branch_point; zio_t *pio; uint64_t used, comp, uncomp; }; static int process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { struct process_old_arg *poa = arg; dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); if (poa->ds_prev && !poa->after_branch_point && bp->blk_birth > poa->ds_prev->ds_phys->ds_prev_snap_txg) { poa->ds_prev->ds_phys->ds_unique_bytes += bp_get_dsize_sync(dp->dp_spa, bp); } } else { poa->used += bp_get_dsize_sync(dp->dp_spa, bp); poa->comp += BP_GET_PSIZE(bp); poa->uncomp += BP_GET_UCSIZE(bp); dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); } return (0); } static void process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) { struct process_old_arg poa = { 0 }; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; ASSERT(ds->ds_deadlist.dl_oldfmt); ASSERT(ds_next->ds_deadlist.dl_oldfmt); poa.ds = ds; poa.ds_prev = ds_prev; poa.after_branch_point = after_branch_point; poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, process_old_cb, &poa, tx)); VERIFY0(zio_wait(poa.pio)); ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); /* change snapused */ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -poa.used, -poa.comp, -poa.uncomp, tx); /* swap next's deadlist to our deadlist */ dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_close(&ds_next->ds_deadlist); SWITCH64(ds_next->ds_phys->ds_deadlist_obj, ds->ds_phys->ds_deadlist_obj); dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); dsl_deadlist_open(&ds_next->ds_deadlist, mos, ds_next->ds_phys->ds_deadlist_obj); } static int old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) { int err; struct killarg ka; /* * Free everything that we point to (that's born after * the previous snapshot, if we are a clone) * * NB: this should be very quick, because we already * freed all the objects in open context. */ ka.ds = ds; ka.tx = tx; err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, kill_blkptr, &ka); ASSERT0(err); ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); return (err); } void dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_destroyarg *dsda = arg1; dsl_dataset_t *ds = dsda->ds; int err; int after_branch_point = FALSE; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; dsl_dataset_t *ds_prev = NULL; boolean_t wont_destroy; uint64_t obj; wont_destroy = (dsda->defer && (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); ASSERT(ds->ds_owner || wont_destroy); ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); if (wont_destroy) { ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; return; } /* signal any waiters that this dataset is going away */ mutex_enter(&ds->ds_lock); ds->ds_owner = dsl_reaper; cv_broadcast(&ds->ds_exclusive_cv); mutex_exit(&ds->ds_lock); /* Remove our reservation */ if (ds->ds_reserved != 0) { dsl_prop_setarg_t psa; uint64_t value = 0; dsl_prop_setarg_init_uint64(&psa, "refreservation", (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), &value); psa.psa_effective_value = 0; /* predict default value */ dsl_dataset_set_reservation_sync(ds, &psa, tx); ASSERT0(ds->ds_reserved); } ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); dsl_scan_ds_destroyed(ds, tx); obj = ds->ds_object; if (ds->ds_phys->ds_prev_snap_obj != 0) { if (ds->ds_prev) { ds_prev = ds->ds_prev; } else { VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); } after_branch_point = (ds_prev->ds_phys->ds_next_snap_obj != obj); dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && ds_prev->ds_phys->ds_next_clones_obj != 0) { remove_from_next_clones(ds_prev, obj, tx); if (ds->ds_phys->ds_next_snap_obj != 0) { VERIFY(0 == zap_add_int(mos, ds_prev->ds_phys->ds_next_clones_obj, ds->ds_phys->ds_next_snap_obj, tx)); } } if (after_branch_point && ds->ds_phys->ds_next_snap_obj == 0) { /* This clone is toast. */ ASSERT(ds_prev->ds_phys->ds_num_children > 1); ds_prev->ds_phys->ds_num_children--; /* * If the clone's origin has no other clones, no * user holds, and has been marked for deferred * deletion, then we should have done the necessary * destroy setup for it. */ if (ds_prev->ds_phys->ds_num_children == 1 && ds_prev->ds_userrefs == 0 && DS_IS_DEFER_DESTROY(ds_prev)) { ASSERT3P(dsda->rm_origin, !=, NULL); } else { ASSERT3P(dsda->rm_origin, ==, NULL); } } else if (!after_branch_point) { ds_prev->ds_phys->ds_next_snap_obj = ds->ds_phys->ds_next_snap_obj; } } if (dsl_dataset_is_snapshot(ds)) { dsl_dataset_t *ds_next; uint64_t old_unique; uint64_t used = 0, comp = 0, uncomp = 0; VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); old_unique = ds_next->ds_phys->ds_unique_bytes; dmu_buf_will_dirty(ds_next->ds_dbuf, tx); ds_next->ds_phys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; ds_next->ds_phys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); if (ds_next->ds_deadlist.dl_oldfmt) { process_old_deadlist(ds, ds_prev, ds_next, after_branch_point, tx); } else { /* Adjust prev's unique space. */ if (ds_prev && !after_branch_point) { dsl_deadlist_space_range(&ds_next->ds_deadlist, ds_prev->ds_phys->ds_prev_snap_txg, ds->ds_phys->ds_prev_snap_txg, &used, &comp, &uncomp); ds_prev->ds_phys->ds_unique_bytes += used; } /* Adjust snapused. */ dsl_deadlist_space_range(&ds_next->ds_deadlist, ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &used, &comp, &uncomp); dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -used, -comp, -uncomp, tx); /* Move blocks to be freed to pool's free list. */ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, tx); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); /* Merge our deadlist into next's and free it. */ dsl_deadlist_merge(&ds_next->ds_deadlist, ds->ds_phys->ds_deadlist_obj, tx); } dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); /* Collapse range in clone heads */ dsl_dataset_remove_clones_key(ds, ds->ds_phys->ds_creation_txg, tx); if (dsl_dataset_is_snapshot(ds_next)) { dsl_dataset_t *ds_nextnext; /* * Update next's unique to include blocks which * were previously shared by only this snapshot * and it. Those blocks will be born after the * prev snap and before this snap, and will have * died after the next snap and before the one * after that (ie. be on the snap after next's * deadlist). */ VERIFY(0 == dsl_dataset_hold_obj(dp, ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext)); dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, ds->ds_phys->ds_prev_snap_txg, ds->ds_phys->ds_creation_txg, &used, &comp, &uncomp); ds_next->ds_phys->ds_unique_bytes += used; dsl_dataset_rele(ds_nextnext, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); /* Collapse range in this head. */ dsl_dataset_t *hds; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds)); dsl_deadlist_remove_key(&hds->ds_deadlist, ds->ds_phys->ds_creation_txg, tx); dsl_dataset_rele(hds, FTAG); } else { ASSERT3P(ds_next->ds_prev, ==, ds); dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); ds_next->ds_prev = NULL; if (ds_prev) { VERIFY(0 == dsl_dataset_get_ref(dp, ds->ds_phys->ds_prev_snap_obj, ds_next, &ds_next->ds_prev)); } dsl_dataset_recalc_head_uniq(ds_next); /* * Reduce the amount of our unconsmed refreservation * being charged to our parent by the amount of * new unique data we have gained. */ if (old_unique < ds_next->ds_reserved) { int64_t mrsdelta; uint64_t new_unique = ds_next->ds_phys->ds_unique_bytes; ASSERT(old_unique <= new_unique); mrsdelta = MIN(new_unique - old_unique, ds_next->ds_reserved - old_unique); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); } } dsl_dataset_rele(ds_next, FTAG); } else { zfeature_info_t *async_destroy = &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; objset_t *os; /* * There's no next snapshot, so this is a head dataset. * Destroy the deadlist. Unless it's a clone, the * deadlist should be empty. (If it's a clone, it's * safe to ignore the deadlist contents.) */ dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = 0; VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { err = old_synchronous_dataset_destroy(ds, tx); } else { /* * Move the bptree into the pool's list of trees to * clean up and update space accounting information. */ uint64_t used, comp, uncomp; zil_destroy_sync(dmu_objset_zil(os), tx); if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { spa_feature_incr(dp->dp_spa, async_destroy, tx); dp->dp_bptree_obj = bptree_alloc(mos, tx); VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, &dp->dp_bptree_obj, tx) == 0); } used = ds->ds_dir->dd_phys->dd_used_bytes; comp = ds->ds_dir->dd_phys->dd_compressed_bytes; uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == used); bptree_add(mos, dp->dp_bptree_obj, &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, used, comp, uncomp, tx); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, -used, -comp, -uncomp, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); } if (ds->ds_prev != NULL) { if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { VERIFY3U(0, ==, zap_remove_int(mos, ds->ds_prev->ds_dir->dd_phys->dd_clones, ds->ds_object, tx)); } dsl_dataset_rele(ds->ds_prev, ds); ds->ds_prev = ds_prev = NULL; } } /* * This must be done after the dsl_traverse(), because it will * re-open the objset. */ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { /* Erase the link in the dir */ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); ASSERT(err == 0); } else { /* remove from snapshot namespace */ dsl_dataset_t *ds_head; ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); VERIFY(0 == dsl_dataset_get_snapname(ds)); #ifdef ZFS_DEBUG { uint64_t val; err = dsl_dataset_snap_lookup(ds_head, ds->ds_snapname, &val); ASSERT0(err); ASSERT3U(val, ==, obj); } #endif err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); ASSERT(err == 0); dsl_dataset_rele(ds_head, FTAG); } if (ds_prev && ds->ds_prev != ds_prev) dsl_dataset_rele(ds_prev, FTAG); spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx, "dataset = %llu", ds->ds_object); if (ds->ds_phys->ds_next_clones_obj != 0) { uint64_t count; ASSERT(0 == zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count) && count == 0); VERIFY(0 == dmu_object_free(mos, ds->ds_phys->ds_next_clones_obj, tx)); } if (ds->ds_phys->ds_props_obj != 0) VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); if (ds->ds_phys->ds_userrefs_obj != 0) VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); dsl_dir_close(ds->ds_dir, ds); ds->ds_dir = NULL; dsl_dataset_drain_refs(ds, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); if (dsda->rm_origin) { /* * Remove the origin of the clone we just destroyed. */ struct dsl_ds_destroyarg ndsda = {0}; ndsda.ds = dsda->rm_origin; dsl_dataset_destroy_sync(&ndsda, tag, tx); } } static int dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) { uint64_t asize; if (!dmu_tx_is_syncing(tx)) return (0); /* * If there's an fs-only reservation, any blocks that might become * owned by the snapshot dataset must be accommodated by space * outside of the reservation. */ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); /* * Propogate any reserved space for this snapshot to other * snapshot checks in this sync group. */ if (asize > 0) dsl_dir_willuse_space(ds->ds_dir, asize, tx); return (0); } int dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; const char *snapname = arg2; int err; uint64_t value; /* * We don't allow multiple snapshots of the same txg. If there * is already one, try again. */ if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) return (EAGAIN); /* * Check for conflicting name snapshot name. */ err = dsl_dataset_snap_lookup(ds, snapname, &value); if (err == 0) return (EEXIST); if (err != ENOENT) return (err); /* * Check that the dataset's name is not too long. Name consists * of the dataset's length + 1 for the @-sign + snapshot name's length */ if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) return (ENAMETOOLONG); err = dsl_dataset_snapshot_reserve_space(ds, tx); if (err) return (err); ds->ds_trysnap_txg = tx->tx_txg; return (0); } void dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; const char *snapname = arg2; dsl_pool_t *dp = ds->ds_dir->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; int err; ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); /* * The origin's ds_creation_txg has to be < TXG_INITIAL */ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) crtxg = 1; else crtxg = tx->tx_txg; dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); do { (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); } while (dsphys->ds_guid == 0); dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; dsphys->ds_next_snap_obj = ds->ds_object; dsphys->ds_num_children = 1; dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes; dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; dsphys->ds_flags = ds->ds_phys->ds_flags; dsphys->ds_bp = ds->ds_phys->ds_bp; dmu_buf_rele(dbuf, FTAG); ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); if (ds->ds_prev) { uint64_t next_clones_obj = ds->ds_prev->ds_phys->ds_next_clones_obj; ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object || ds->ds_prev->ds_phys->ds_num_children > 1); if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ds->ds_prev->ds_phys->ds_creation_txg); ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { remove_from_next_clones(ds->ds_prev, dsphys->ds_next_snap_obj, tx); VERIFY3U(0, ==, zap_add_int(mos, next_clones_obj, dsobj, tx)); } } /* * If we have a reference-reservation on this dataset, we will * need to increase the amount of refreservation being charged * since our unique space is going to zero. */ if (ds->ds_reserved) { int64_t delta; ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); } dmu_buf_will_dirty(ds->ds_dbuf, tx); zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", ds->ds_dir->dd_myname, snapname, dsobj, ds->ds_phys->ds_prev_snap_txg); ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); dsl_deadlist_add_key(&ds->ds_deadlist, ds->ds_phys->ds_prev_snap_txg, tx); ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); ds->ds_phys->ds_prev_snap_obj = dsobj; ds->ds_phys->ds_prev_snap_txg = crtxg; ds->ds_phys->ds_unique_bytes = 0; if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &dsobj, tx); ASSERT(err == 0); if (ds->ds_prev) dsl_dataset_drop_ref(ds->ds_prev, ds); VERIFY(0 == dsl_dataset_get_ref(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); dsl_scan_ds_snapshotted(ds, tx); dsl_dir_snap_cmtime_update(ds->ds_dir); spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, "dataset = %llu", dsobj); } void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); ASSERT(ds->ds_phys->ds_next_snap_obj == 0); /* * in case we had to change ds_fsid_guid when we opened it, * sync it out now. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; dmu_objset_sync(ds->ds_objset, zio, tx); } static void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) { uint64_t count = 0; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; zap_cursor_t zc; zap_attribute_t za; nvlist_t *propval; nvlist_t *val; rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); /* * There may me missing entries in ds_next_clones_obj * due to a bug in a previous version of the code. * Only trust it if it has the right number of entries. */ if (ds->ds_phys->ds_next_clones_obj != 0) { ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, &count)); } if (count != ds->ds_phys->ds_num_children - 1) { goto fail; } for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; char buf[ZFS_MAXNAMELEN]; /* * Even though we hold the dp_config_rwlock, the dataset * may fail to open, returning ENOENT. If there is a * thread concurrently attempting to destroy this * dataset, it will have the ds_rwlock held for * RW_WRITER. Our call to dsl_dataset_hold_obj() -> * dsl_dataset_hold_ref() will fail its * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the * dp_config_rwlock, and wait for the destroy progress * and signal ds_exclusive_cv. If the destroy was * successful, we will see that * DSL_DATASET_IS_DESTROYED(), and return ENOENT. */ if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, za.za_first_integer, FTAG, &clone) != 0) continue; dsl_dir_name(clone->ds_dir, buf); VERIFY(nvlist_add_boolean(val, buf) == 0); dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(&zc); VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval) == 0); fail: nvlist_free(val); nvlist_free(propval); rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { uint64_t refd, avail, uobjs, aobjs, ratio; dsl_dir_stats(ds->ds_dir, nv); dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, ds->ds_phys->ds_creation_time); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, ds->ds_phys->ds_creation_txg); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, ds->ds_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, ds->ds_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ds->ds_phys->ds_guid); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, ds->ds_phys->ds_unique_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, ds->ds_object); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, DS_IS_DEFER_DESTROY(ds) ? 1 : 0); if (ds->ds_phys->ds_prev_snap_obj != 0) { uint64_t written, comp, uncomp; dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_dataset_t *prev; rw_enter(&dp->dp_config_rwlock, RW_READER); int err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); rw_exit(&dp->dp_config_rwlock); if (err == 0) { err = dsl_dataset_space_written(prev, ds, &written, &comp, &uncomp); dsl_dataset_rele(prev, FTAG); if (err == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, written); } } } ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : (ds->ds_phys->ds_uncompressed_bytes * 100 / ds->ds_phys->ds_compressed_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); if (ds->ds_phys->ds_next_snap_obj) { /* * This is a snapshot; override the dd's space used with * our unique space and compression ratio. */ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, ds->ds_phys->ds_unique_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); get_clones_stat(ds, nv); } } void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; stat->dds_guid = ds->ds_phys->ds_guid; if (ds->ds_phys->ds_next_snap_obj) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; } else { stat->dds_is_snapshot = B_FALSE; stat->dds_num_clones = 0; } /* clone origin is really a dsl_dir thing... */ rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); if (dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_t *ods; VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); dsl_dataset_name(ods, stat->dds_origin); dsl_dataset_drop_ref(ods, FTAG); } else { stat->dds_origin[0] = '\0'; } rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds) { return (ds->ds_fsid_guid); } void dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { *refdbytesp = ds->ds_phys->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; if (ds->ds_quota != 0) { /* * Adjust available bytes according to refquota */ if (*refdbytesp < ds->ds_quota) *availbytesp = MIN(*availbytesp, ds->ds_quota - *refdbytesp); else *availbytesp = 0; } *usedobjsp = ds->ds_phys->ds_bp.blk_fill; *availobjsp = DN_MAX_OBJECT - *usedobjsp; } boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) { dsl_pool_t *dp = ds->ds_dir->dd_pool; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); if (ds->ds_prev == NULL) return (B_FALSE); if (ds->ds_phys->ds_bp.blk_birth > ds->ds_prev->ds_phys->ds_creation_txg) { objset_t *os, *os_prev; /* * It may be that only the ZIL differs, because it was * reset in the head. Don't count that as being * modified. */ if (dmu_objset_from_ds(ds, &os) != 0) return (B_TRUE); if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) return (B_TRUE); return (bcmp(&os->os_phys->os_meta_dnode, &os_prev->os_phys->os_meta_dnode, sizeof (os->os_phys->os_meta_dnode)) != 0); } return (B_FALSE); } /* ARGSUSED */ static int dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; char *newsnapname = arg2; dsl_dir_t *dd = ds->ds_dir; dsl_dataset_t *hds; uint64_t val; int err; err = dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); if (err) return (err); /* new name better not be in use */ err = dsl_dataset_snap_lookup(hds, newsnapname, &val); dsl_dataset_rele(hds, FTAG); if (err == 0) err = EEXIST; else if (err == ENOENT) err = 0; /* dataset name + 1 for the "@" + the new snapshot name must fit */ if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) err = ENAMETOOLONG; return (err); } static void dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) { char oldname[MAXPATHLEN], newname[MAXPATHLEN]; dsl_dataset_t *ds = arg1; const char *newsnapname = arg2; dsl_dir_t *dd = ds->ds_dir; objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dataset_t *hds; int err; ASSERT(ds->ds_phys->ds_next_snap_obj != 0); VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); VERIFY(0 == dsl_dataset_get_snapname(ds)); err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); ASSERT0(err); dsl_dataset_name(ds, oldname); mutex_enter(&ds->ds_lock); (void) strcpy(ds->ds_snapname, newsnapname); mutex_exit(&ds->ds_lock); err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx); ASSERT0(err); dsl_dataset_name(ds, newname); #ifdef _KERNEL zvol_rename_minors(oldname, newname); #endif spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, "dataset = %llu", ds->ds_object); dsl_dataset_rele(hds, FTAG); } struct renamesnaparg { dsl_sync_task_group_t *dstg; char failed[MAXPATHLEN]; char *oldsnap; char *newsnap; int error; }; static int dsl_snapshot_rename_one(const char *name, void *arg) { struct renamesnaparg *ra = arg; dsl_dataset_t *ds = NULL; char *snapname; int err; snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); /* * For recursive snapshot renames the parent won't be changing * so we just pass name for both the to/from argument. */ err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); if (err != 0) { strfree(snapname); return (err == ENOENT ? 0 : err); } #ifdef _KERNEL /* * For all filesystems undergoing rename, we'll need to unmount it. */ (void) zfs_unmount_snap(snapname, NULL); #endif err = dsl_dataset_hold(snapname, ra->dstg, &ds); strfree(snapname); if (err != 0) return (err == ENOENT ? 0 : err); dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); /* First successful rename clears the error. */ ra->error = 0; return (0); } static int dsl_recursive_rename(char *oldname, const char *newname) { int err; struct renamesnaparg *ra; dsl_sync_task_t *dst; spa_t *spa; char *cp, *fsname = spa_strdup(oldname); int len = strlen(oldname) + 1; /* truncate the snapshot name to get the fsname */ cp = strchr(fsname, '@'); *cp = '\0'; err = spa_open(fsname, &spa, FTAG); if (err) { kmem_free(fsname, len); return (err); } ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ra->oldsnap = strchr(oldname, '@') + 1; ra->newsnap = strchr(newname, '@') + 1; *ra->failed = '\0'; ra->error = ENOENT; err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, DS_FIND_CHILDREN); kmem_free(fsname, len); if (err == 0) err = ra->error; if (err == 0) err = dsl_sync_task_group_wait(ra->dstg); for (dst = list_head(&ra->dstg->dstg_tasks); dst; dst = list_next(&ra->dstg->dstg_tasks, dst)) { dsl_dataset_t *ds = dst->dst_arg1; if (dst->dst_err) { dsl_dir_name(ds->ds_dir, ra->failed); (void) strlcat(ra->failed, "@", sizeof (ra->failed)); (void) strlcat(ra->failed, ra->newsnap, sizeof (ra->failed)); } dsl_dataset_rele(ds, ra->dstg); } if (err) (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); dsl_sync_task_group_destroy(ra->dstg); kmem_free(ra, sizeof (struct renamesnaparg)); spa_close(spa, FTAG); return (err); } static int dsl_valid_rename(const char *oldname, void *arg) { int delta = *(int *)arg; if (strlen(oldname) + delta >= MAXNAMELEN) return (ENAMETOOLONG); return (0); } #pragma weak dmu_objset_rename = dsl_dataset_rename int dsl_dataset_rename(char *oldname, const char *newname, int flags) { dsl_dir_t *dd; dsl_dataset_t *ds; const char *tail; int err; err = dsl_dir_open(oldname, FTAG, &dd, &tail); if (err) return (err); if (tail == NULL) { int delta = strlen(newname) - strlen(oldname); /* if we're growing, validate child name lengths */ if (delta > 0) err = dmu_objset_find(oldname, dsl_valid_rename, &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (err == 0) err = dsl_dir_rename(dd, newname, flags); dsl_dir_close(dd, FTAG); return (err); } if (tail[0] != '@') { /* the name ended in a nonexistent component */ dsl_dir_close(dd, FTAG); return (ENOENT); } dsl_dir_close(dd, FTAG); /* new name must be snapshot in same filesystem */ tail = strchr(newname, '@'); if (tail == NULL) return (EINVAL); tail++; if (strncmp(oldname, newname, tail - newname) != 0) return (EXDEV); if (flags & ZFS_RENAME_RECURSIVE) { err = dsl_recursive_rename(oldname, newname); } else { err = dsl_dataset_hold(oldname, FTAG, &ds); if (err) return (err); err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); dsl_dataset_rele(ds, FTAG); } return (err); } struct promotenode { list_node_t link; dsl_dataset_t *ds; }; struct promotearg { list_t shared_snaps, origin_snaps, clone_snaps; dsl_dataset_t *origin_origin; uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; char *err_ds; }; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); static boolean_t snaplist_unstable(list_t *l); static int dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; struct promotenode *snap = list_head(&pa->shared_snaps); dsl_dataset_t *origin_ds = snap->ds; int err; uint64_t unused; /* Check that it is a real clone */ if (!dsl_dir_is_clone(hds->ds_dir)) return (EINVAL); /* Since this is so expensive, don't do the preliminary check */ if (!dmu_tx_is_syncing(tx)) return (0); if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) return (EXDEV); /* compute origin's new unique space */ snap = list_tail(&pa->clone_snaps); ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); dsl_deadlist_space_range(&snap->ds->ds_deadlist, origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique, &unused, &unused); /* * Walk the snapshots that we are moving * * Compute space to transfer. Consider the incremental changes * to used for each snapshot: * (my used) = (prev's used) + (blocks born) - (blocks killed) * So each snapshot gave birth to: * (blocks born) = (my used) - (prev's used) + (blocks killed) * So a sequence would look like: * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) * Which simplifies to: * uN + kN + kN-1 + ... + k1 + k0 * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ pa->used = origin_ds->ds_phys->ds_referenced_bytes; pa->comp = origin_ds->ds_phys->ds_compressed_bytes; pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; for (snap = list_head(&pa->shared_snaps); snap; snap = list_next(&pa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; dsl_dataset_t *ds = snap->ds; /* Check that the snapshot name does not conflict */ VERIFY(0 == dsl_dataset_get_snapname(ds)); err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); if (err == 0) { err = EEXIST; goto out; } if (err != ENOENT) goto out; /* The very first snapshot does not have a deadlist */ if (ds->ds_phys->ds_prev_snap_obj == 0) continue; dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); pa->used += dlused; pa->comp += dlcomp; pa->uncomp += dluncomp; } /* * If we are a clone of a clone then we never reached ORIGIN, * so we need to subtract out the clone origin's used space. */ if (pa->origin_origin) { pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; } /* Check that there is enough space here */ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, pa->used); if (err) return (err); /* * Compute the amounts of space that will be used by snapshots * after the promotion (for both origin and clone). For each, * it is the amount of space that will be on all of their * deadlists (that was not born before their new origin). */ if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { uint64_t space; /* * Note, typically this will not be a clone of a clone, * so dd_origin_txg will be < TXG_INITIAL, so * these snaplist_space() -> dsl_deadlist_space_range() * calls will be fast because they do not have to * iterate over all bps. */ snap = list_head(&pa->origin_snaps); err = snaplist_space(&pa->shared_snaps, snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); if (err) return (err); err = snaplist_space(&pa->clone_snaps, snap->ds->ds_dir->dd_origin_txg, &space); if (err) return (err); pa->cloneusedsnap += space; } if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { err = snaplist_space(&pa->origin_snaps, origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); if (err) return (err); } return (0); out: pa->err_ds = snap->ds->ds_snapname; return (err); } static void dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; struct promotenode *snap = list_head(&pa->shared_snaps); dsl_dataset_t *origin_ds = snap->ds; dsl_dataset_t *origin_head; dsl_dir_t *dd = hds->ds_dir; dsl_pool_t *dp = hds->ds_dir->dd_pool; dsl_dir_t *odd = NULL; uint64_t oldnext_obj; int64_t delta; ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); snap = list_head(&pa->origin_snaps); origin_head = snap->ds; /* * We need to explicitly open odd, since origin_ds's dd will be * changing. */ VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, NULL, FTAG, &odd)); /* change origin's next snap */ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; snap = list_tail(&pa->clone_snaps); ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; /* change the origin's next clone */ if (origin_ds->ds_phys->ds_next_clones_obj) { remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, origin_ds->ds_phys->ds_next_clones_obj, oldnext_obj, tx)); } /* change origin */ dmu_buf_will_dirty(dd->dd_dbuf, tx); ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; dmu_buf_will_dirty(odd->dd_dbuf, tx); odd->dd_phys->dd_origin_obj = origin_ds->ds_object; origin_head->ds_dir->dd_origin_txg = origin_ds->ds_phys->ds_creation_txg; /* change dd_clone entries */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, odd->dd_phys->dd_clones, hds->ds_object, tx)); VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, pa->origin_origin->ds_dir->dd_phys->dd_clones, hds->ds_object, tx)); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, pa->origin_origin->ds_dir->dd_phys->dd_clones, origin_head->ds_object, tx)); if (dd->dd_phys->dd_clones == 0) { dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, dd->dd_phys->dd_clones, origin_head->ds_object, tx)); } /* move snapshots to this dir */ for (snap = list_head(&pa->shared_snaps); snap; snap = list_next(&pa->shared_snaps, snap)) { dsl_dataset_t *ds = snap->ds; /* unregister props as dsl_dir is changing */ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } /* move snap name entry */ VERIFY(0 == dsl_dataset_get_snapname(ds)); VERIFY(0 == dsl_dataset_snap_remove(origin_head, ds->ds_snapname, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); ds->ds_phys->ds_dir_obj = dd->dd_object; ASSERT3P(ds->ds_dir, ==, odd); dsl_dir_close(ds->ds_dir, ds); VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); /* move any clone references */ if (ds->ds_phys->ds_next_clones_obj && spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, dp->dp_meta_objset, ds->ds_phys->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *cnds; uint64_t o; if (za.za_first_integer == oldnext_obj) { /* * We've already moved the * origin's reference. */ continue; } VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &cnds)); o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; VERIFY3U(zap_remove_int(dp->dp_meta_objset, odd->dd_phys->dd_clones, o, tx), ==, 0); VERIFY3U(zap_add_int(dp->dp_meta_objset, dd->dd_phys->dd_clones, o, tx), ==, 0); dsl_dataset_rele(cnds, FTAG); } zap_cursor_fini(&zc); } ASSERT0(dsl_prop_numcb(ds)); } /* * Change space accounting. * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either * both be valid, or both be 0 (resulting in delta == 0). This * is true for each of {clone,origin} independently. */ delta = pa->cloneusedsnap - dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, >=, 0); ASSERT3U(pa->used, >=, delta); dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(dd, DD_USED_HEAD, pa->used - delta, pa->comp, pa->uncomp, tx); delta = pa->originusedsnap - odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, <=, 0); ASSERT3U(pa->used, >=, -delta); dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(odd, DD_USED_HEAD, -pa->used - delta, -pa->comp, -pa->uncomp, tx); origin_ds->ds_phys->ds_unique_bytes = pa->unique; /* log history record */ spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, "dataset = %llu", hds->ds_object); dsl_dir_close(odd, FTAG); } static char *snaplist_tag = "snaplist"; /* * Make a list of dsl_dataset_t's for the snapshots between first_obj * (exclusive) and last_obj (inclusive). The list will be in reverse * order (last_obj will be the list_head()). If first_obj == 0, do all * snapshots back to this dataset's origin. */ static int snaplist_make(dsl_pool_t *dp, boolean_t own, uint64_t first_obj, uint64_t last_obj, list_t *l) { uint64_t obj = last_obj; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); list_create(l, sizeof (struct promotenode), offsetof(struct promotenode, link)); while (obj != first_obj) { dsl_dataset_t *ds; struct promotenode *snap; int err; if (own) { err = dsl_dataset_own_obj(dp, obj, 0, snaplist_tag, &ds); if (err == 0) dsl_dataset_make_exclusive(ds, snaplist_tag); } else { err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); } if (err == ENOENT) { /* lost race with snapshot destroy */ struct promotenode *last = list_tail(l); ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); obj = last->ds->ds_phys->ds_prev_snap_obj; continue; } else if (err) { return (err); } if (first_obj == 0) first_obj = ds->ds_dir->dd_phys->dd_origin_obj; snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); snap->ds = ds; list_insert_tail(l, snap); obj = ds->ds_phys->ds_prev_snap_obj; } return (0); } static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) { struct promotenode *snap; *spacep = 0; for (snap = list_head(l); snap; snap = list_next(l, snap)) { uint64_t used, comp, uncomp; dsl_deadlist_space_range(&snap->ds->ds_deadlist, mintxg, UINT64_MAX, &used, &comp, &uncomp); *spacep += used; } return (0); } static void snaplist_destroy(list_t *l, boolean_t own) { struct promotenode *snap; if (!l || !list_link_active(&l->list_head)) return; while ((snap = list_tail(l)) != NULL) { list_remove(l, snap); if (own) dsl_dataset_disown(snap->ds, snaplist_tag); else dsl_dataset_rele(snap->ds, snaplist_tag); kmem_free(snap, sizeof (struct promotenode)); } list_destroy(l); } /* * Promote a clone. Nomenclature note: * "clone" or "cds": the original clone which is being promoted * "origin" or "ods": the snapshot which is originally clone's origin * "origin head" or "ohds": the dataset which is the head * (filesystem/volume) for the origin * "origin origin": the origin of the origin's filesystem (typically * NULL, indicating that the clone is not a clone of a clone). */ int dsl_dataset_promote(const char *name, char *conflsnap) { dsl_dataset_t *ds; dsl_dir_t *dd; dsl_pool_t *dp; dmu_object_info_t doi; struct promotearg pa = { 0 }; struct promotenode *snap; int err; err = dsl_dataset_hold(name, FTAG, &ds); if (err) return (err); dd = ds->ds_dir; dp = dd->dd_pool; err = dmu_object_info(dp->dp_meta_objset, ds->ds_phys->ds_snapnames_zapobj, &doi); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { dsl_dataset_rele(ds, FTAG); return (EINVAL); } /* * We are going to inherit all the snapshots taken before our * origin (i.e., our new origin will be our parent's origin). * Take ownership of them so that we can rename them into our * namespace. */ rw_enter(&dp->dp_config_rwlock, RW_READER); err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, &pa.shared_snaps); if (err != 0) goto out; err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); if (err != 0) goto out; snap = list_head(&pa.shared_snaps); ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); if (err != 0) goto out; if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { err = dsl_dataset_hold_obj(dp, snap->ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &pa.origin_origin); if (err != 0) goto out; } out: rw_exit(&dp->dp_config_rwlock); /* * Add in 128x the snapnames zapobj size, since we will be moving * a bunch of snapnames to the promoted ds, and dirtying their * bonus buffers. */ if (err == 0) { err = dsl_sync_task_do(dp, dsl_dataset_promote_check, dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blocks_512); if (err && pa.err_ds && conflsnap) (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); } snaplist_destroy(&pa.shared_snaps, B_TRUE); snaplist_destroy(&pa.clone_snaps, B_FALSE); snaplist_destroy(&pa.origin_snaps, B_FALSE); if (pa.origin_origin) dsl_dataset_rele(pa.origin_origin, FTAG); dsl_dataset_rele(ds, FTAG); return (err); } struct cloneswaparg { dsl_dataset_t *cds; /* clone dataset */ dsl_dataset_t *ohds; /* origin's head dataset */ boolean_t force; int64_t unused_refres_delta; /* change in unconsumed refreservation */ }; /* ARGSUSED */ static int dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) { struct cloneswaparg *csa = arg1; /* they should both be heads */ if (dsl_dataset_is_snapshot(csa->cds) || dsl_dataset_is_snapshot(csa->ohds)) return (EINVAL); /* the branch point should be just before them */ if (csa->cds->ds_prev != csa->ohds->ds_prev) return (EINVAL); /* cds should be the clone (unless they are unrelated) */ if (csa->cds->ds_prev != NULL && csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && csa->ohds->ds_object != csa->cds->ds_prev->ds_phys->ds_next_snap_obj) return (EINVAL); /* the clone should be a child of the origin */ if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) return (EINVAL); /* ohds shouldn't be modified unless 'force' */ if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) return (ETXTBSY); /* adjust amount of any unconsumed refreservation */ csa->unused_refres_delta = (int64_t)MIN(csa->ohds->ds_reserved, csa->ohds->ds_phys->ds_unique_bytes) - (int64_t)MIN(csa->ohds->ds_reserved, csa->cds->ds_phys->ds_unique_bytes); if (csa->unused_refres_delta > 0 && csa->unused_refres_delta > dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); if (csa->ohds->ds_quota != 0 && csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) return (EDQUOT); return (0); } /* ARGSUSED */ static void dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) { struct cloneswaparg *csa = arg1; dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; ASSERT(csa->cds->ds_reserved == 0); ASSERT(csa->ohds->ds_quota == 0 || csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); if (csa->cds->ds_objset != NULL) { dmu_objset_evict(csa->cds->ds_objset); csa->cds->ds_objset = NULL; } if (csa->ohds->ds_objset != NULL) { dmu_objset_evict(csa->ohds->ds_objset); csa->ohds->ds_objset = NULL; } /* * Reset origin's unique bytes, if it exists. */ if (csa->cds->ds_prev) { dsl_dataset_t *origin = csa->cds->ds_prev; uint64_t comp, uncomp; dmu_buf_will_dirty(origin->ds_dbuf, tx); dsl_deadlist_space_range(&csa->cds->ds_deadlist, origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); } /* swap blkptrs */ { blkptr_t tmp; tmp = csa->ohds->ds_phys->ds_bp; csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; csa->cds->ds_phys->ds_bp = tmp; } /* set dd_*_bytes */ { int64_t dused, dcomp, duncomp; uint64_t cdl_used, cdl_comp, cdl_uncomp; uint64_t odl_used, odl_comp, odl_uncomp; ASSERT3U(csa->cds->ds_dir->dd_phys-> dd_used_breakdown[DD_USED_SNAP], ==, 0); dsl_deadlist_space(&csa->cds->ds_deadlist, &cdl_used, &cdl_comp, &cdl_uncomp); dsl_deadlist_space(&csa->ohds->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp); dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + cdl_uncomp - (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, dused, dcomp, duncomp, tx); dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, -dused, -dcomp, -duncomp, tx); /* * The difference in the space used by snapshots is the * difference in snapshot space due to the head's * deadlist (since that's the only thing that's * changing that affects the snapused). */ dsl_deadlist_space_range(&csa->cds->ds_deadlist, csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used, &cdl_comp, &cdl_uncomp); dsl_deadlist_space_range(&csa->ohds->ds_deadlist, csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used, &odl_comp, &odl_uncomp); dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, DD_USED_HEAD, DD_USED_SNAP, tx); } /* swap ds_*_bytes */ SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, csa->cds->ds_phys->ds_referenced_bytes); SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, csa->cds->ds_phys->ds_compressed_bytes); SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, csa->cds->ds_phys->ds_uncompressed_bytes); SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, csa->cds->ds_phys->ds_unique_bytes); /* apply any parent delta for change in unconsumed refreservation */ dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, csa->unused_refres_delta, 0, 0, tx); /* * Swap deadlists. */ dsl_deadlist_close(&csa->cds->ds_deadlist); dsl_deadlist_close(&csa->ohds->ds_deadlist); SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, csa->cds->ds_phys->ds_deadlist_obj); dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, csa->cds->ds_phys->ds_deadlist_obj); dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, csa->ohds->ds_phys->ds_deadlist_obj); dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); } /* * Swap 'clone' with its origin head datasets. Used at the end of "zfs * recv" into an existing fs to swizzle the file system to the new * version, and by "zfs rollback". Can also be used to swap two * independent head datasets if neither has any snapshots. */ int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force) { struct cloneswaparg csa; int error; ASSERT(clone->ds_owner); ASSERT(origin_head->ds_owner); retry: /* * Need exclusive access for the swap. If we're swapping these * datasets back after an error, we already hold the locks. */ if (!RW_WRITE_HELD(&clone->ds_rwlock)) rw_enter(&clone->ds_rwlock, RW_WRITER); if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { rw_exit(&clone->ds_rwlock); rw_enter(&origin_head->ds_rwlock, RW_WRITER); if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { rw_exit(&origin_head->ds_rwlock); goto retry; } } csa.cds = clone; csa.ohds = origin_head; csa.force = force; error = dsl_sync_task_do(clone->ds_dir->dd_pool, dsl_dataset_clone_swap_check, dsl_dataset_clone_swap_sync, &csa, NULL, 9); return (error); } /* * Given a pool name and a dataset object number in that pool, * return the name of that dataset. */ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) { spa_t *spa; dsl_pool_t *dp; dsl_dataset_t *ds; int error; if ((error = spa_open(pname, &spa, FTAG)) != 0) return (error); dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); } rw_exit(&dp->dp_config_rwlock); spa_close(spa, FTAG); return (error); } int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) { int error = 0; ASSERT3S(asize, >, 0); /* * *ref_rsrv is the portion of asize that will come from any * unconsumed refreservation space. */ *ref_rsrv = 0; mutex_enter(&ds->ds_lock); /* * Make a space adjustment for reserved bytes. */ if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { ASSERT3U(*used, >=, ds->ds_reserved - ds->ds_phys->ds_unique_bytes); *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); *ref_rsrv = asize - MIN(asize, parent_delta(ds, asize + inflight)); } if (!check_quota || ds->ds_quota == 0) { mutex_exit(&ds->ds_lock); return (0); } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). */ if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) { if (inflight > 0 || ds->ds_phys->ds_referenced_bytes < ds->ds_quota) error = ERESTART; else error = EDQUOT; } mutex_exit(&ds->ds_lock); return (error); } /* ARGSUSED */ static int dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; int err; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) return (ENOTSUP); if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) return (err); if (psa->psa_effective_value == 0) return (0); if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || psa->psa_effective_value < ds->ds_reserved) return (ENOSPC); return (0); } extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); void dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value = psa->psa_effective_value; dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); if (ds->ds_quota != effective_value) { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_quota = effective_value; } } int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) { dsl_dataset_t *ds; dsl_prop_setarg_t psa; int err; dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); /* * If someone removes a file, then tries to set the quota, we * want to make sure the file freeing takes effect. */ txg_wait_open(ds->ds_dir->dd_pool, 0); err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, ds, &psa, 0); dsl_dataset_rele(ds, FTAG); return (err); } static int dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value; uint64_t unique; int err; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFRESERVATION) return (ENOTSUP); if (dsl_dataset_is_snapshot(ds)) return (EINVAL); if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) return (err); effective_value = psa->psa_effective_value; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. */ if (!dmu_tx_is_syncing(tx)) return (0); mutex_enter(&ds->ds_lock); if (!DS_UNIQUE_IS_ACCURATE(ds)) dsl_dataset_recalc_head_uniq(ds); unique = ds->ds_phys->ds_unique_bytes; mutex_exit(&ds->ds_lock); if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { uint64_t delta = MAX(unique, effective_value) - MAX(unique, ds->ds_reserved); if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); if (ds->ds_quota > 0 && effective_value > ds->ds_quota) return (ENOSPC); } return (0); } static void dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; dsl_prop_setarg_t *psa = arg2; uint64_t effective_value = psa->psa_effective_value; uint64_t unique; int64_t delta; dsl_prop_set_sync(ds, psa, tx); DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); unique = ds->ds_phys->ds_unique_bytes; delta = MAX(0, (int64_t)(effective_value - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); ds->ds_reserved = effective_value; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); } int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, uint64_t reservation) { dsl_dataset_t *ds; dsl_prop_setarg_t psa; int err; dsl_prop_setarg_init_uint64(&psa, "refreservation", source, &reservation); err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_set_reservation_check, dsl_dataset_set_reservation_sync, ds, &psa, 0); dsl_dataset_rele(ds, FTAG); return (err); } typedef struct zfs_hold_cleanup_arg { dsl_pool_t *dp; uint64_t dsobj; char htag[MAXNAMELEN]; } zfs_hold_cleanup_arg_t; static void dsl_dataset_user_release_onexit(void *arg) { zfs_hold_cleanup_arg_t *ca = arg; (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, B_TRUE); kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); } void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, minor_t minor) { zfs_hold_cleanup_arg_t *ca; ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); ca->dp = ds->ds_dir->dd_pool; ca->dsobj = ds->ds_object; (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); VERIFY3U(0, ==, zfs_onexit_add_cb(minor, dsl_dataset_user_release_onexit, ca, NULL)); } /* * If you add new checks here, you may need to add * additional checks to the "temporary" case in * snapshot_check() in dmu_objset.c. */ static int dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct dsl_ds_holdarg *ha = arg2; char *htag = ha->htag; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int error = 0; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) return (ENOTSUP); if (!dsl_dataset_is_snapshot(ds)) return (EINVAL); /* tags must be unique */ mutex_enter(&ds->ds_lock); if (ds->ds_phys->ds_userrefs_obj) { error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, 8, 1, tx); if (error == 0) error = EEXIST; else if (error == ENOENT) error = 0; } mutex_exit(&ds->ds_lock); if (error == 0 && ha->temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) error = E2BIG; return (error); } void dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct dsl_ds_holdarg *ha = arg2; char *htag = ha->htag; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; uint64_t now = gethrestime_sec(); uint64_t zapobj; mutex_enter(&ds->ds_lock); if (ds->ds_phys->ds_userrefs_obj == 0) { /* * This is the first user hold for this dataset. Create * the userrefs zap object. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); zapobj = ds->ds_phys->ds_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); } else { zapobj = ds->ds_phys->ds_userrefs_obj; } ds->ds_userrefs++; mutex_exit(&ds->ds_lock); VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); if (ha->temphold) { VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, htag, &now, tx)); } spa_history_log_internal(LOG_DS_USER_HOLD, dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, (int)ha->temphold, ds->ds_object); } static int dsl_dataset_user_hold_one(const char *dsname, void *arg) { struct dsl_ds_holdarg *ha = arg; dsl_dataset_t *ds; int error; char *name; /* alloc a buffer to hold dsname@snapname plus terminating NULL */ name = kmem_asprintf("%s@%s", dsname, ha->snapname); error = dsl_dataset_hold(name, ha->dstg, &ds); strfree(name); if (error == 0) { ha->gotone = B_TRUE; dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, ds, ha, 0); } else if (error == ENOENT && ha->recursive) { error = 0; } else { (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); } return (error); } int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, boolean_t temphold) { struct dsl_ds_holdarg *ha; int error; ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); ha->htag = htag; ha->temphold = temphold; error = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, ds, ha, 0); kmem_free(ha, sizeof (struct dsl_ds_holdarg)); return (error); } int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, boolean_t recursive, boolean_t temphold, int cleanup_fd) { struct dsl_ds_holdarg *ha; dsl_sync_task_t *dst; spa_t *spa; int error; minor_t minor = 0; if (cleanup_fd != -1) { /* Currently we only support cleanup-on-exit of tempholds. */ if (!temphold) return (EINVAL); error = zfs_onexit_fd_hold(cleanup_fd, &minor); if (error) return (error); } ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); error = spa_open(dsname, &spa, FTAG); if (error) { kmem_free(ha, sizeof (struct dsl_ds_holdarg)); if (cleanup_fd != -1) zfs_onexit_fd_rele(cleanup_fd); return (error); } ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ha->htag = htag; ha->snapname = snapname; ha->recursive = recursive; ha->temphold = temphold; if (recursive) { error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, ha, DS_FIND_CHILDREN); } else { error = dsl_dataset_user_hold_one(dsname, ha); } if (error == 0) error = dsl_sync_task_group_wait(ha->dstg); for (dst = list_head(&ha->dstg->dstg_tasks); dst; dst = list_next(&ha->dstg->dstg_tasks, dst)) { dsl_dataset_t *ds = dst->dst_arg1; if (dst->dst_err) { dsl_dataset_name(ds, ha->failed); *strchr(ha->failed, '@') = '\0'; } else if (error == 0 && minor != 0 && temphold) { /* * If this hold is to be released upon process exit, * register that action now. */ dsl_register_onexit_hold_cleanup(ds, htag, minor); } dsl_dataset_rele(ds, ha->dstg); } if (error == 0 && recursive && !ha->gotone) error = ENOENT; if (error) (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); dsl_sync_task_group_destroy(ha->dstg); kmem_free(ha, sizeof (struct dsl_ds_holdarg)); spa_close(spa, FTAG); if (cleanup_fd != -1) zfs_onexit_fd_rele(cleanup_fd); return (error); } struct dsl_ds_releasearg { dsl_dataset_t *ds; const char *htag; boolean_t own; /* do we own or just hold ds? */ }; static int dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, boolean_t *might_destroy) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t zapobj; uint64_t tmp; int error; *might_destroy = B_FALSE; mutex_enter(&ds->ds_lock); zapobj = ds->ds_phys->ds_userrefs_obj; if (zapobj == 0) { /* The tag can't possibly exist */ mutex_exit(&ds->ds_lock); return (ESRCH); } /* Make sure the tag exists */ error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); if (error) { mutex_exit(&ds->ds_lock); if (error == ENOENT) error = ESRCH; return (error); } if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && DS_IS_DEFER_DESTROY(ds)) *might_destroy = B_TRUE; mutex_exit(&ds->ds_lock); return (0); } static int dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_releasearg *ra = arg1; dsl_dataset_t *ds = ra->ds; boolean_t might_destroy; int error; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) return (ENOTSUP); error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); if (error) return (error); if (might_destroy) { struct dsl_ds_destroyarg dsda = {0}; if (dmu_tx_is_syncing(tx)) { /* * If we're not prepared to remove the snapshot, * we can't allow the release to happen right now. */ if (!ra->own) return (EBUSY); } dsda.ds = ds; dsda.releasing = B_TRUE; return (dsl_dataset_destroy_check(&dsda, tag, tx)); } return (0); } static void dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) { struct dsl_ds_releasearg *ra = arg1; dsl_dataset_t *ds = ra->ds; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj; uint64_t dsobj = ds->ds_object; uint64_t refs; int error; mutex_enter(&ds->ds_lock); ds->ds_userrefs--; refs = ds->ds_userrefs; mutex_exit(&ds->ds_lock); error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); VERIFY(error == 0 || error == ENOENT); zapobj = ds->ds_phys->ds_userrefs_obj; VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); spa_history_log_internal(LOG_DS_USER_RELEASE, dp->dp_spa, tx, "<%s> %lld dataset = %llu", ra->htag, (longlong_t)refs, dsobj); if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && DS_IS_DEFER_DESTROY(ds)) { struct dsl_ds_destroyarg dsda = {0}; ASSERT(ra->own); dsda.ds = ds; dsda.releasing = B_TRUE; /* We already did the destroy_check */ dsl_dataset_destroy_sync(&dsda, tag, tx); } } static int dsl_dataset_user_release_one(const char *dsname, void *arg) { struct dsl_ds_holdarg *ha = arg; struct dsl_ds_releasearg *ra; dsl_dataset_t *ds; int error; void *dtag = ha->dstg; char *name; boolean_t own = B_FALSE; boolean_t might_destroy; /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ name = kmem_asprintf("%s@%s", dsname, ha->snapname); error = dsl_dataset_hold(name, dtag, &ds); strfree(name); if (error == ENOENT && ha->recursive) return (0); (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); if (error) return (error); ha->gotone = B_TRUE; ASSERT(dsl_dataset_is_snapshot(ds)); error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); if (error) { dsl_dataset_rele(ds, dtag); return (error); } if (might_destroy) { #ifdef _KERNEL name = kmem_asprintf("%s@%s", dsname, ha->snapname); error = zfs_unmount_snap(name, NULL); strfree(name); if (error) { dsl_dataset_rele(ds, dtag); return (error); } #endif if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { dsl_dataset_rele(ds, dtag); return (EBUSY); } else { own = B_TRUE; dsl_dataset_make_exclusive(ds, dtag); } } ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); ra->ds = ds; ra->htag = ha->htag; ra->own = own; dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, dsl_dataset_user_release_sync, ra, dtag, 0); return (0); } int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, boolean_t recursive) { struct dsl_ds_holdarg *ha; dsl_sync_task_t *dst; spa_t *spa; int error; top: ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); error = spa_open(dsname, &spa, FTAG); if (error) { kmem_free(ha, sizeof (struct dsl_ds_holdarg)); return (error); } ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); ha->htag = htag; ha->snapname = snapname; ha->recursive = recursive; if (recursive) { error = dmu_objset_find(dsname, dsl_dataset_user_release_one, ha, DS_FIND_CHILDREN); } else { error = dsl_dataset_user_release_one(dsname, ha); } if (error == 0) error = dsl_sync_task_group_wait(ha->dstg); for (dst = list_head(&ha->dstg->dstg_tasks); dst; dst = list_next(&ha->dstg->dstg_tasks, dst)) { struct dsl_ds_releasearg *ra = dst->dst_arg1; dsl_dataset_t *ds = ra->ds; if (dst->dst_err) dsl_dataset_name(ds, ha->failed); if (ra->own) dsl_dataset_disown(ds, ha->dstg); else dsl_dataset_rele(ds, ha->dstg); kmem_free(ra, sizeof (struct dsl_ds_releasearg)); } if (error == 0 && recursive && !ha->gotone) error = ENOENT; if (error && error != EBUSY) (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); dsl_sync_task_group_destroy(ha->dstg); kmem_free(ha, sizeof (struct dsl_ds_holdarg)); spa_close(spa, FTAG); /* * We can get EBUSY if we were racing with deferred destroy and * dsl_dataset_user_release_check() hadn't done the necessary * open context setup. We can also get EBUSY if we're racing * with destroy and that thread is the ds_owner. Either way * the busy condition should be transient, and we should retry * the release operation. */ if (error == EBUSY) goto top; return (error); } /* * Called at spa_load time (with retry == B_FALSE) to release a stale * temporary user hold. Also called by the onexit code (with retry == B_TRUE). */ int dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, boolean_t retry) { dsl_dataset_t *ds; char *snap; char *name; int namelen; int error; do { rw_enter(&dp->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); rw_exit(&dp->dp_config_rwlock); if (error) return (error); namelen = dsl_dataset_namelen(ds)+1; name = kmem_alloc(namelen, KM_SLEEP); dsl_dataset_name(ds, name); dsl_dataset_rele(ds, FTAG); snap = strchr(name, '@'); *snap = '\0'; ++snap; error = dsl_dataset_user_release(name, snap, htag, B_FALSE); kmem_free(name, namelen); /* * The object can't have been destroyed because we have a hold, * but it might have been renamed, resulting in ENOENT. Retry * if we've been requested to do so. * * It would be nice if we could use the dsobj all the way * through and avoid ENOENT entirely. But we might need to * unmount the snapshot, and there's currently no way to lookup * a vfsp using a ZFS object id. */ } while ((error == ENOENT) && retry); return (error); } int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) { dsl_dataset_t *ds; int err; err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); if (ds->ds_phys->ds_userrefs_obj != 0) { zap_attribute_t *za; zap_cursor_t zc; za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_phys->ds_userrefs_obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, za->za_first_integer)); } zap_cursor_fini(&zc); kmem_free(za, sizeof (zap_attribute_t)); } dsl_dataset_rele(ds, FTAG); return (0); } /* * Note, this function is used as the callback for dmu_objset_find(). We * always return 0 so that we will continue to find and process * inconsistent datasets, even if we encounter an error trying to * process one of them. */ /* ARGSUSED */ int dsl_destroy_inconsistent(const char *dsname, void *arg) { dsl_dataset_t *ds; if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { if (DS_IS_INCONSISTENT(ds)) (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); else dsl_dataset_disown(ds, FTAG); } return (0); } /* * Return (in *usedp) the amount of space written in new that is not * present in oldsnap. New may be a snapshot or the head. Old must be * a snapshot before new, in new's filesystem (or its origin). If not then * fail and return EINVAL. * * The written space is calculated by considering two components: First, we * ignore any freed space, and calculate the written as new's used space * minus old's used space. Next, we add in the amount of space that was freed * between the two snapshots, thus reducing new's used space relative to old's. * Specifically, this is the space that was born before old->ds_creation_txg, * and freed before new (ie. on new's deadlist or a previous deadlist). * * space freed [---------------------] * snapshots ---O-------O--------O-------O------ * oldsnap new */ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { int err = 0; uint64_t snapobj; dsl_pool_t *dp = new->ds_dir->dd_pool; *usedp = 0; *usedp += new->ds_phys->ds_referenced_bytes; *usedp -= oldsnap->ds_phys->ds_referenced_bytes; *compp = 0; *compp += new->ds_phys->ds_compressed_bytes; *compp -= oldsnap->ds_phys->ds_compressed_bytes; *uncompp = 0; *uncompp += new->ds_phys->ds_uncompressed_bytes; *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes; rw_enter(&dp->dp_config_rwlock, RW_READER); snapobj = new->ds_object; while (snapobj != oldsnap->ds_object) { dsl_dataset_t *snap; uint64_t used, comp, uncomp; if (snapobj == new->ds_object) { snap = new; } else { err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); if (err != 0) break; } if (snap->ds_phys->ds_prev_snap_txg == oldsnap->ds_phys->ds_creation_txg) { /* * The blocks in the deadlist can not be born after * ds_prev_snap_txg, so get the whole deadlist space, * which is more efficient (especially for old-format * deadlists). Unfortunately the deadlist code * doesn't have enough information to make this * optimization itself. */ dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp); } else { dsl_deadlist_space_range(&snap->ds_deadlist, 0, oldsnap->ds_phys->ds_creation_txg, &used, &comp, &uncomp); } *usedp += used; *compp += comp; *uncompp += uncomp; /* * If we get to the beginning of the chain of snapshots * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap * was not a snapshot of/before new. */ snapobj = snap->ds_phys->ds_prev_snap_obj; if (snap != new) dsl_dataset_rele(snap, FTAG); if (snapobj == 0) { err = EINVAL; break; } } rw_exit(&dp->dp_config_rwlock); return (err); } /* * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, * lastsnap, and all snapshots in between are deleted. * * blocks that would be freed [---------------------------] * snapshots ---O-------O--------O-------O--------O * firstsnap lastsnap * * This is the set of blocks that were born after the snap before firstsnap, * (birth > firstsnap->prev_snap_txg) and died before the snap after the * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). * We calculate this by iterating over the relevant deadlists (from the snap * after lastsnap, backward to the snap after firstsnap), summing up the * space on the deadlist that was born after the snap before firstsnap. */ int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *lastsnap, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { int err = 0; uint64_t snapobj; dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; ASSERT(dsl_dataset_is_snapshot(firstsnap)); ASSERT(dsl_dataset_is_snapshot(lastsnap)); /* * Check that the snapshots are in the same dsl_dir, and firstsnap * is before lastsnap. */ if (firstsnap->ds_dir != lastsnap->ds_dir || firstsnap->ds_phys->ds_creation_txg > lastsnap->ds_phys->ds_creation_txg) return (EINVAL); *usedp = *compp = *uncompp = 0; rw_enter(&dp->dp_config_rwlock, RW_READER); snapobj = lastsnap->ds_phys->ds_next_snap_obj; while (snapobj != firstsnap->ds_object) { dsl_dataset_t *ds; uint64_t used, comp, uncomp; err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); if (err != 0) break; dsl_deadlist_space_range(&ds->ds_deadlist, firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX, &used, &comp, &uncomp); *usedp += used; *compp += comp; *uncompp += uncomp; snapobj = ds->ds_phys->ds_prev_snap_obj; ASSERT3U(snapobj, !=, 0); dsl_dataset_rele(ds, FTAG); } rw_exit(&dp->dp_config_rwlock); return (err); } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 247192) @@ -1,1750 +1,1751 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); static scan_cb_t dsl_scan_defrag_cb; static scan_cb_t dsl_scan_scrub_cb; static scan_cb_t dsl_scan_remove_cb; static dsl_syncfunc_t dsl_scan_cancel_sync; static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ SYSCTL_DECL(_vfs_zfs); TUNABLE_INT("vfs.zfs.top_maxinflight", &zfs_top_maxinflight); SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RW, &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev"); TUNABLE_INT("vfs.zfs.resilver_delay", &zfs_resilver_delay); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RW, &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); TUNABLE_INT("vfs.zfs.scrub_delay", &zfs_scrub_delay); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RW, &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); TUNABLE_INT("vfs.zfs.scan_idle", &zfs_scan_idle); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RW, &zfs_scan_idle, 0, "Idle scan window in clock ticks"); TUNABLE_INT("vfs.zfs.scan_min_time_ms", &zfs_scan_min_time_ms); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RW, &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg"); TUNABLE_INT("vfs.zfs.free_min_time_ms", &zfs_free_min_time_ms); SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RW, &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); TUNABLE_INT("vfs.zfs.resilver_min_time_ms", &zfs_resilver_min_time_ms); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RW, &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); TUNABLE_INT("vfs.zfs.no_scrub_io", &zfs_no_scrub_io); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RW, &zfs_no_scrub_io, 0, "Disable scrub I/O"); TUNABLE_INT("vfs.zfs.no_scrub_prefetch", &zfs_no_scrub_prefetch); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RW, &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) extern int zfs_txg_timeout; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { int err; dsl_scan_t *scn; spa_t *spa = dp->dp_spa; uint64_t f; scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); scn->scn_dp = dp; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { /* * There was an old-style scrub in progress. Restart a * new-style scrub from the beginning. */ scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress; " "restarting new-style scrub in txg %llu", scn->scn_restart_txg); /* * Load the queue obj from the old location so that it * can be freed by dsl_scan_done(). */ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_queue", sizeof (uint64_t), 1, &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); if (err == ENOENT) return (0); else if (err) return (err); if (scn->scn_phys.scn_state == DSS_SCANNING && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old * pool, and the pool was accessed by old * software. Restart from the beginning, since * the old software may have changed the pool in * the meantime. */ scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", scn->scn_restart_txg); } } spa_scan_stat_init(spa); return (0); } void dsl_scan_fini(dsl_pool_t *dp) { if (dp->dp_scan) { kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } /* ARGSUSED */ static int dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_scan_t *scn = arg1; if (scn->scn_phys.scn_state == DSS_SCANNING) return (EBUSY); return (0); } /* ARGSUSED */ static void dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_scan_t *scn = arg1; pool_scan_func_t *funcp = arg2; dmu_object_type_t ot = 0; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; scn->scn_phys.scn_max_txg = tx->tx_txg; scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; scn->scn_restart_txg = 0; spa_scan_stat_init(spa); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; /* rewrite all disk labels */ vdev_config_dirty(spa->spa_root_vdev); if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); } else { spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); } spa->spa_scrub_started = B_TRUE; /* * If this is an incremental scrub, limit the DDT scrub phase * to just the auto-ditto class (for correctness); the rest * of the scrub should go faster using top-down pruning. */ if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; } /* back to the generic stuff */ if (dp->dp_blkstats == NULL) { dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); } bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); dsl_scan_sync_state(scn, tx); spa_history_log_internal(LOG_POOL_SCAN, spa, tx, "func=%u mintxg=%llu maxtxg=%llu", *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); } /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { static const char *old_names[] = { "scrub_bookmark", "scrub_ddt_bookmark", "scrub_ddt_class_max", "scrub_queue", "scrub_min_txg", "scrub_max_txg", "scrub_func", "scrub_errors", NULL }; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; int i; /* Remove any remnants of an old-style scrub. */ for (i = 0; old_names[i]; i++) { (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); } if (scn->scn_phys.scn_queue_obj != 0) { VERIFY(0 == dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } /* * If we were "restarted" from a stopped state, don't bother * with anything else. */ if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (complete) scn->scn_phys.scn_state = DSS_FINISHED; else scn->scn_phys.scn_state = DSS_CANCELED; spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx, "complete=%u", complete); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > 0) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } mutex_exit(&spa->spa_scrub_lock); spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; /* * If the scrub/resilver completed, update all DTLs to * reflect this. Whether it succeeded or not, vacate * all temporary scrub DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); if (complete) { spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); } spa_errlog_rotate(spa); /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); } scn->scn_phys.scn_end_time = gethrestime_sec(); } /* ARGSUSED */ static int dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_scan_t *scn = arg1; if (scn->scn_phys.scn_state != DSS_SCANNING) return (ENOENT); return (0); } /* ARGSUSED */ static void dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_scan_t *scn = arg1; dsl_scan_done(scn, B_FALSE, tx); dsl_scan_sync_state(scn, tx); } int dsl_scan_cancel(dsl_pool_t *dp) { boolean_t complete = B_FALSE; int err; err = dsl_sync_task_do(dp, dsl_scan_cancel_check, dsl_scan_cancel_sync, dp->dp_scan, &complete, 3); return (err); } static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { zio_free(dp->dp_spa, txg, bp); } void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), pio->io_flags)); } static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; if (dsl_dataset_is_snapshot(ds)) return (MIN(smt, ds->ds_phys->ds_creation_txg)); return (smt); } static void dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) { VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); } static boolean_t dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) { uint64_t elapsed_nanosecs; unsigned int mintime; /* we never skip user/group accounting objects */ if (zb && (int64_t)zb->zb_object < 0) return (B_FALSE); if (scn->scn_pausing) return (B_TRUE); /* we're already pausing */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ if (zb && zb->zb_level != 0) return (B_FALSE); mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scan_min_time_ms; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)) { if (zb) { dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; } dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); scn->scn_pausing = B_TRUE; return (B_TRUE); } return (B_FALSE); } typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; } zil_scan_arg_t; /* ARGSUSED */ static int dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; zbookmark_t zb; if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * One block ("stubby") can be allocated a long time ago; we * want to visit that one because it has been allocated * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); return (0); } /* ARGSUSED */ static int dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; zbookmark_t zb; if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * birth can be < claim_txg if this record's txg is * already txg sync'ed (but this log block contains * other records that are not synced) */ if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); } return (0); } static void dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ if (claim_txg == 0 && spa_writeable(dp->dp_spa)) return; zilog = zil_alloc(dp->dp_meta_objset, zh); (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, claim_txg); zil_free(zilog); } /* ARGSUSED */ static void dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, uint64_t object, uint64_t blkid) { zbookmark_t czb; uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; if (zfs_no_scrub_prefetch) return; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) return; SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); } static boolean_t dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, const zbookmark_t *zb) { /* * We never skip over user/group accounting objects (obj<0) */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* * If we found the block we're trying to resume from, or * we went past it to a different object, zero it out to * indicate that it's OK to start checking for pausing * again. */ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { dprintf("resuming at %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); } } return (B_FALSE); } /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. */ static int dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp) { dsl_pool_t *dp = scn->scn_dp; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; if (BP_GET_LEVEL(bp) > 0) { uint32_t flags = ARC_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset, zb->zb_object, zb->zb_blkid * epb + i); } for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) { zbookmark_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_visitbp(cbp, &czb, dnp, *bufp, ds, scn, ostype, tx); } } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) { uint32_t flags = ARC_WAIT; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; dnode_phys_t *cdnp; int i, j; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { for (j = 0; j < cdnp->dn_nblkptr; j++) { blkptr_t *cbp = &cdnp->dn_blkptr[j]; dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset, zb->zb_blkid * epb + i, j); } } for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, *bufp, zb->zb_blkid * epb + i, tx); } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } osp = (*bufp)->b_data; dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx); if (OBJSET_BUF_HAS_USERUSED(*bufp)) { /* * We also always visit user/group accounting * objects, and never skip them, even if we are * pausing. This is necessary so that the space * deltas from this txg get integrated. */ dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_groupused_dnode, *bufp, DMU_GROUPUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_userused_dnode, *bufp, DMU_USERUSED_OBJECT, tx); } } return (0); } static void dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx) { int j; for (j = 0; j < dnp->dn_nblkptr; j++) { zbookmark_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, dnp->dn_nlevels - 1, j); dsl_scan_visitbp(&dnp->dn_blkptr[j], &czb, dnp, buf, ds, scn, ostype, tx); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zbookmark_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); dsl_scan_visitbp(&dnp->dn_spill, &czb, dnp, buf, ds, scn, ostype, tx); } } /* * The arguments are in this order because mdb can only print the * first 5; we want them to be useful. */ static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; arc_buf_t *buf = NULL; blkptr_t bp_toread = *bp; /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ if (dsl_scan_check_pause(scn, zb)) return; if (dsl_scan_check_resume(scn, dnp, zb)) return; if (bp->blk_birth == 0) return; scn->scn_visited_this_txg++; dprintf_bp(bp, "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p", ds, ds ? ds->ds_object : 0, zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, pbuf, bp); if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return; if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx, &buf) != 0) return; /* * If dsl_scan_ddt() has aready visited this block, it will have * already done any translations or scrubbing, so don't call the * callback again. */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { ASSERT(buf == NULL); return; } /* * If this block is from the future (after cur_max_txg), then we * are doing this on behalf of a deleted snapshot, and we will * revisit the future block on the next pass of this dataset. * Don't scan it now unless we need to because something * under it was modified. */ if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) { scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); } if (buf) (void) arc_buf_remove_ref(buf, &buf); } static void dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { zbookmark_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); dsl_scan_visitbp(bp, &zb, NULL, NULL, ds, scn, DMU_OST_NONE, tx); dprintf_ds(ds, "finished scan%s", ""); } void dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { if (dsl_dataset_is_snapshot(ds)) { /* Note, scn_cur_{min,max}_txg stays the same. */ scn->scn_phys.scn_bookmark.zb_objset = ds->ds_phys->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)ds->ds_phys->ds_next_snap_obj); scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; } else { SET_BOOKMARK(&scn->scn_phys.scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object); } } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); if (dsl_dataset_is_snapshot(ds)) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was * deleted too. */ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("destroying ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)ds->ds_phys->ds_next_snap_obj); } else { zfs_dbgmsg("destroying ds %llu; in queue; removing", (u_longlong_t)ds->ds_object); } } else { zfs_dbgmsg("destroying ds %llu; ignoring", (u_longlong_t)ds->ds_object); } /* * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here. */ dsl_scan_sync_state(scn, tx); } void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = ds->ds_phys->ds_prev_snap_obj; zfs_dbgmsg("snapshotting ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("snapshotting ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)ds->ds_phys->ds_prev_snap_obj); } dsl_scan_sync_state(scn, tx); } void dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) { dsl_pool_t *dp = ds1->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg) == 0) { int err; ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); err = zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); VERIFY(err == 0 || err == EEXIST); if (err == EEXIST) { /* Both were there to begin with */ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); } zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg); ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } dsl_scan_sync_state(scn, tx); } struct enqueue_clones_arg { dmu_tx_t *tx; uint64_t originobj; }; /* ARGSUSED */ static int enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) { struct enqueue_clones_arg *eca = arg; dsl_dataset_t *ds; int err; dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); if (err) return (err); if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); dsl_dataset_rele(ds, FTAG); if (err) return (err); ds = prev; } VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (dmu_objset_from_ds(ds, &os)) goto out; /* * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same * BP as in the head), they must be ignored. So we traverse the * ZIL here, rather than in scan_recurse(), because the regular * snapshot block-sharing rules don't apply to it. */ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds)) dsl_scan_zil(dp, &os->os_zil_header); /* * Iterate over the bps in this ds. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx); char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " "pausing=%u", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_cur_max_txg, (int)scn->scn_pausing); kmem_free(dsname, ZFS_MAXNAMELEN); if (scn->scn_pausing) goto out; /* * We've finished this pass over this dataset. */ /* * If we did not completely visit this dataset, do another pass. */ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass; visiting again"); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, scn->scn_phys.scn_cur_max_txg, tx) == 0); goto out; } /* * Add descendent datasets to work queue. */ if (ds->ds_phys->ds_next_snap_obj != 0) { VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj, ds->ds_phys->ds_creation_txg, tx) == 0); } if (ds->ds_phys->ds_num_children > 1) { boolean_t usenext = B_FALSE; if (ds->ds_phys->ds_next_clones_obj != 0) { uint64_t count; /* * A bug in a previous version of the code could * cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a * missing entry. Therefore we can only use the * next_clones_obj when its count is correct. */ int err = zap_count(dp->dp_meta_objset, ds->ds_phys->ds_next_clones_obj, &count); if (err == 0 && count == ds->ds_phys->ds_num_children - 1) usenext = B_TRUE; } if (usenext) { VERIFY(zap_join_key(dp->dp_meta_objset, ds->ds_phys->ds_next_clones_obj, scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_creation_txg, tx) == 0); } else { struct enqueue_clones_arg eca; eca.tx = tx; eca.originobj = ds->ds_object; (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); } } out: dsl_dataset_rele(ds, FTAG); } /* ARGSUSED */ static int enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds; int err; dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); if (err) return (err); while (ds->ds_phys->ds_prev_snap_obj != 0) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } /* * If this is a clone, we don't need to worry about it for now. */ if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(prev, FTAG); return (0); } dsl_dataset_rele(ds, FTAG); ds = prev; } VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } /* * Scrub/dedup interaction. * * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * * We leverage the fact that the dde's replication class (enum ddt_class) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. * * To prevent excess scrubbing, the scrub begins by walking the DDT * to find all blocks with refcnt > 1, and scrubs each of these once. * Since there are two replication classes which contain blocks with * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. * * There would be nothing more to say if a block's refcnt couldn't change * during a scrub, but of course it can so we must account for changes * in a block's replication class. * * Here's an example of what can occur: * * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 * when visited during the top-down scrub phase, it will be scrubbed twice. * This negates our scrub optimization, but is otherwise harmless. * * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 * on each visit during the top-down scrub phase, it will never be scrubbed. * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 * while a scrub is in progress, it scrubs the block right then. */ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; ddt_entry_t dde = { 0 }; int error; uint64_t n = 0; while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { ddt_t *ddt; if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) break; dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", (longlong_t)ddb->ddb_class, (longlong_t)ddb->ddb_type, (longlong_t)ddb->ddb_checksum, (longlong_t)ddb->ddb_cursor); /* There should be no pending changes to the dedup table */ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); n++; if (dsl_scan_check_pause(scn, NULL)) break; } zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u", (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_pausing); ASSERT(error == 0 || error == ENOENT); ASSERT(error != ENOENT || ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); } /* ARGSUSED */ void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx) { const ddt_key_t *ddk = &dde->dde_key; ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; zbookmark_t zb = { 0 }; if (scn->scn_phys.scn_state != DSS_SCANNING) return; for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg) continue; ddt_bp_create(checksum, ddk, ddp, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); } } static void dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; zap_cursor_t zc; zap_attribute_t za; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_ddt(scn, tx); if (scn->scn_pausing) return; } if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { /* First do the MOS & ORIGIN */ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_pausing) return; if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); } ASSERT(!scn->scn_pausing); } else if (scn->scn_phys.scn_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { /* * If we were paused, continue from here. Note if the * ds we were paused on was deleted, the zb_objset may * be -1, so we will skip this and find a new objset * below. */ dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); if (scn->scn_pausing) return; } /* * In case we were paused right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t)); /* keep pulling things out of the zap-object-as-queue */ while (zap_cursor_init(&zc, dp->dp_meta_objset, scn->scn_phys.scn_queue_obj), zap_cursor_retrieve(&zc, &za) == 0) { dsl_dataset_t *ds; uint64_t dsobj; dsobj = strtonum(za.za_name, NULL); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsobj, tx)); /* Set up min/max txg */ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (za.za_first_integer != 0) { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, za.za_first_integer); } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, ds->ds_phys->ds_prev_snap_txg); } scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); dsl_dataset_rele(ds, FTAG); dsl_scan_visitds(scn, dsobj, tx); zap_cursor_fini(&zc); if (scn->scn_pausing) return; } zap_cursor_fini(&zc); } static boolean_t dsl_scan_free_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } static int dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_scan_t *scn = arg; if (!scn->scn_is_bptree || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { if (dsl_scan_free_should_pause(scn)) return (ERESTART); } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); scn->scn_visited_this_txg++; return (0); } boolean_t dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); if (scn->scn_phys.scn_state == DSS_SCANNING) return (B_TRUE); if (spa_feature_is_active(spa, &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { return (B_TRUE); } if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } return (used != 0); } void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; int err; /* * Check for scn_restart_txg before checking spa_load_state, so * that we can restart an old-style scan while the pool is being * imported (see dsl_scan_init). */ if (scn->scn_restart_txg != 0 && scn->scn_restart_txg <= tx->tx_txg) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_done(scn, B_FALSE, tx); if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", func, tx->tx_txg); dsl_scan_setup_sync(scn, &func, tx); } if (!dsl_scan_active(scn) || spa_sync_pass(dp->dp_spa) > 1) return; scn->scn_visited_this_txg = 0; scn->scn_pausing = B_FALSE; scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; /* * First process the free list. If we pause the free, don't do * any scanning. This ensures that there is no free list when * we are scanning, so the scan code doesn't have to worry about * traversing it. */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); if (err == 0 && spa_feature_is_active(spa, &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { scn->scn_is_bptree = B_TRUE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); if (err != 0) return; /* disable async destroy feature */ spa_feature_decr(spa, &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx); ASSERT(!spa_feature_is_active(spa, &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])); VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, tx)); VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset, dp->dp_bptree_obj, tx)); dp->dp_bptree_obj = 0; } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " "free_bpobj/bptree txg %llu", (longlong_t)scn->scn_visited_this_txg, (longlong_t) (gethrtime() - scn->scn_sync_start_time) / MICROSEC, (longlong_t)tx->tx_txg); scn->scn_visited_this_txg = 0; /* * Re-sync the ddt so that we can further modify * it when doing bprewrite. */ ddt_sync(spa, tx->tx_txg); } if (err == ERESTART) return; } if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { zfs_dbgmsg("doing scan sync txg %llu; " "ddt bm=%llu/%llu/%llu/%llx", (longlong_t)tx->tx_txg, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); } else { zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", (longlong_t)tx->tx_txg, (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, (longlong_t)scn->scn_phys.scn_bookmark.zb_object, (longlong_t)scn->scn_phys.scn_bookmark.zb_level, (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); } scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); dsl_scan_visit(scn, tx); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; zfs_dbgmsg("visited %llu blocks in %llums", (longlong_t)scn->scn_visited_this_txg, (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC); if (!scn->scn_pausing) { /* finished with scan. */ zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg); dsl_scan_done(scn, B_TRUE, tx); } if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > 0) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } mutex_exit(&spa->spa_scrub_lock); } dsl_scan_sync_state(scn, tx); } /* * This will start a new scan, or restart an existing one. */ void dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; dmu_tx_commit(tx); } else { dp->dp_scan->scn_restart_txg = txg; } zfs_dbgmsg("restarting resilver txg=%llu", txg); } boolean_t dsl_scan_resilvering(dsl_pool_t *dp) { return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); } /* * scrub consumers */ static void count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. */ if (zab == NULL) return; for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; if (t & DMU_OT_NEWTYPE) t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_2_of_2_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal == 1) zb->zb_ditto_2_of_3_samevdev++; else if (equal == 3) zb->zb_ditto_3_of_3_samevdev++; break; } } } static void dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; cv_broadcast(&spa->spa_scrub_io_cv); if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; } mutex_exit(&spa->spa_scrub_lock); } static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_t *zb) { dsl_scan_t *scn = dp->dp_scan; size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); boolean_t needs_io; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int zio_priority; unsigned int scan_delay = 0; if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) return (0); count_block(dp->dp_blkstats, bp); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; zio_priority = ZIO_PRIORITY_SCRUB; needs_io = B_TRUE; scan_delay = zfs_scrub_delay; - } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + } else { + ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; zio_priority = ZIO_PRIORITY_RESILVER; needs_io = B_FALSE; scan_delay = zfs_resilver_delay; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; for (int d = 0; d < BP_GET_NDVAS(bp); d++) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[d])); /* * Keep track of how much data we've examined so that * zpool(1M) status can make useful progress reports. */ scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); /* if it's a resilver, this may not be in the target range */ if (!needs_io) { if (DVA_GET_GANG(&bp->blk_dva[d])) { /* * Gang members may be spread across multiple * vdevs, so the best estimate we have is the * scrub range, which has already been checked. * XXX -- it would be better to change our * allocation policy to ensure that all * gang members reside on the same vdev. */ needs_io = B_TRUE; } else { needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1); } } } if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * MAX(zfs_top_maxinflight, 1); void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); /* * If we're seeing recent (zfs_scan_idle) "important" I/Os * then throttle our workload to limit the impact of a scan. */ if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) delay(MAX((int)scan_delay, 0)); zio_nowait(zio_read(NULL, spa, bp, data, size, dsl_scan_scrub_done, NULL, zio_priority, zio_flags, zb)); } /* do not relocate this block */ return (0); } int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) { spa_t *spa = dp->dp_spa; /* * Purge all vdev caches and probe all devices. We do this here * rather than in sync context because this requires a writer lock * on the spa_config lock, which we can't do from sync context. The * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ spa_vdev_state_enter(spa, SCL_NONE); spa->spa_scrub_reopen = B_TRUE; vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); return (dsl_sync_task_do(dp, dsl_scan_setup_check, dsl_scan_setup_sync, dp->dp_scan, &func, 0)); } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c (revision 247192) @@ -1,124 +1,127 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * We keep our own copy of this algorithm for 3 main reasons: * 1. If we didn't, anyone modifying common/os/compress.c would * directly break our on disk format * 2. Our version of lzjb does not have a number of checks that the * common/os version needs and uses * 3. We initialize the lempel to ensure deterministic results, * so that identical blocks can always be deduplicated. * In particular, we are adding the "feature" that compress() can * take a destination buffer size and returns the compressed length, or the * source length if compression would overflow the destination buffer. */ #include #include +#include #define MATCH_BITS 6 #define MATCH_MIN 3 #define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) #define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) #define LEMPEL_SIZE 1024 /*ARGSUSED*/ size_t lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { uchar_t *src = s_start; uchar_t *dst = d_start; - uchar_t *cpy, *copymap; + uchar_t *cpy; + uchar_t *copymap = NULL; int copymask = 1 << (NBBY - 1); int mlen, offset, hash; uint16_t *hp; uint16_t lempel[LEMPEL_SIZE] = { 0 }; while (src < (uchar_t *)s_start + s_len) { if ((copymask <<= 1) == (1 << NBBY)) { if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) return (s_len); copymask = 1; copymap = dst; *dst++ = 0; } if (src > (uchar_t *)s_start + s_len - MATCH_MAX) { *dst++ = *src++; continue; } hash = (src[0] << 16) + (src[1] << 8) + src[2]; hash += hash >> 9; hash += hash >> 5; hp = &lempel[hash & (LEMPEL_SIZE - 1)]; offset = (intptr_t)(src - *hp) & OFFSET_MASK; *hp = (uint16_t)(uintptr_t)src; cpy = src - offset; if (cpy >= (uchar_t *)s_start && cpy != src && src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) { *copymap |= copymask; for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++) if (src[mlen] != cpy[mlen]) break; *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | (offset >> NBBY); *dst++ = (uchar_t)offset; src += mlen; } else { *dst++ = *src++; } } return (dst - (uchar_t *)d_start); } /*ARGSUSED*/ int lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { uchar_t *src = s_start; uchar_t *dst = d_start; uchar_t *d_end = (uchar_t *)d_start + d_len; - uchar_t *cpy, copymap; + uchar_t *cpy; + uchar_t copymap = 0; int copymask = 1 << (NBBY - 1); while (dst < d_end) { if ((copymask <<= 1) == (1 << NBBY)) { copymask = 1; copymap = *src++; } if (copymap & copymask) { int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN; int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK; src += 2; if ((cpy = dst - offset) < (uchar_t *)d_start) return (-1); while (--mlen >= 0 && dst < d_end) *dst++ = *cpy++; } else { *dst++ = *src++; } } return (0); } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c (revision 247192) @@ -1,223 +1,223 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #ifdef ZFS_DEBUG #ifdef _KERNEL int reference_tracking_enable = FALSE; /* runs out of memory too easily */ #else int reference_tracking_enable = TRUE; #endif int reference_history = 4; /* tunable */ static kmem_cache_t *reference_cache; static kmem_cache_t *reference_history_cache; void refcount_sysinit(void) { reference_cache = kmem_cache_create("reference_cache", sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); reference_history_cache = kmem_cache_create("reference_history_cache", sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void refcount_fini(void) { kmem_cache_destroy(reference_cache); kmem_cache_destroy(reference_history_cache); } void refcount_create(refcount_t *rc) { mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&rc->rc_list, sizeof (reference_t), offsetof(reference_t, ref_link)); list_create(&rc->rc_removed, sizeof (reference_t), offsetof(reference_t, ref_link)); rc->rc_count = 0; rc->rc_removed_count = 0; } void refcount_destroy_many(refcount_t *rc, uint64_t number) { reference_t *ref; ASSERT(rc->rc_count == number); while (ref = list_head(&rc->rc_list)) { list_remove(&rc->rc_list, ref); kmem_cache_free(reference_cache, ref); } list_destroy(&rc->rc_list); while (ref = list_head(&rc->rc_removed)) { list_remove(&rc->rc_removed, ref); kmem_cache_free(reference_history_cache, ref->ref_removed); kmem_cache_free(reference_cache, ref); } list_destroy(&rc->rc_removed); mutex_destroy(&rc->rc_mtx); } void refcount_destroy(refcount_t *rc) { refcount_destroy_many(rc, 0); } int refcount_is_zero(refcount_t *rc) { ASSERT(rc->rc_count >= 0); return (rc->rc_count == 0); } int64_t refcount_count(refcount_t *rc) { ASSERT(rc->rc_count >= 0); return (rc->rc_count); } int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder) { - reference_t *ref; + reference_t *ref = NULL; int64_t count; if (reference_tracking_enable) { ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; ref->ref_number = number; } mutex_enter(&rc->rc_mtx); ASSERT(rc->rc_count >= 0); if (reference_tracking_enable) list_insert_head(&rc->rc_list, ref); rc->rc_count += number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); return (count); } int64_t refcount_add(refcount_t *rc, void *holder) { return (refcount_add_many(rc, 1, holder)); } int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) { reference_t *ref; int64_t count; mutex_enter(&rc->rc_mtx); ASSERT(rc->rc_count >= number); if (!reference_tracking_enable) { rc->rc_count -= number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); return (count); } for (ref = list_head(&rc->rc_list); ref; ref = list_next(&rc->rc_list, ref)) { if (ref->ref_holder == holder && ref->ref_number == number) { list_remove(&rc->rc_list, ref); if (reference_history > 0) { ref->ref_removed = kmem_cache_alloc(reference_history_cache, KM_SLEEP); list_insert_head(&rc->rc_removed, ref); rc->rc_removed_count++; if (rc->rc_removed_count >= reference_history) { ref = list_tail(&rc->rc_removed); list_remove(&rc->rc_removed, ref); kmem_cache_free(reference_history_cache, ref->ref_removed); kmem_cache_free(reference_cache, ref); rc->rc_removed_count--; } } else { kmem_cache_free(reference_cache, ref); } rc->rc_count -= number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); return (count); } } panic("No such hold %p on refcount %llx", holder, (u_longlong_t)(uintptr_t)rc); return (-1); } int64_t refcount_remove(refcount_t *rc, void *holder) { return (refcount_remove_many(rc, 1, holder)); } void refcount_transfer(refcount_t *dst, refcount_t *src) { int64_t count, removed_count; list_t list, removed; list_create(&list, sizeof (reference_t), offsetof(reference_t, ref_link)); list_create(&removed, sizeof (reference_t), offsetof(reference_t, ref_link)); mutex_enter(&src->rc_mtx); count = src->rc_count; removed_count = src->rc_removed_count; src->rc_count = 0; src->rc_removed_count = 0; list_move_tail(&list, &src->rc_list); list_move_tail(&removed, &src->rc_removed); mutex_exit(&src->rc_mtx); mutex_enter(&dst->rc_mtx); dst->rc_count += count; dst->rc_removed_count += removed_count; list_move_tail(&dst->rc_list, &list); list_move_tail(&dst->rc_removed, &removed); mutex_exit(&dst->rc_mtx); list_destroy(&list); list_destroy(&removed); } #endif /* ZFS_DEBUG */ Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c (revision 247192) @@ -1,2002 +1,2004 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 iXsystems, Inc * Copyright (c) 2012 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ZFS System attributes: * * A generic mechanism to allow for arbitrary attributes * to be stored in a dnode. The data will be stored in the bonus buffer of * the dnode and if necessary a special "spill" block will be used to handle * overflow situations. The spill block will be sized to fit the data * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the * spill block is stored at the end of the current bonus buffer. Any * attributes that would be in the way of the blkptr_t will be relocated * into the spill block. * * Attribute registration: * * Stored persistently on a per dataset basis * a mapping between attribute "string" names and their actual attribute * numeric values, length, and byteswap function. The names are only used * during registration. All attributes are known by their unique attribute * id value. If an attribute can have a variable size then the value * 0 will be used to indicate this. * * Attribute Layout: * * Attribute layouts are a way to compactly store multiple attributes, but * without taking the overhead associated with managing each attribute * individually. Since you will typically have the same set of attributes * stored in the same order a single table will be used to represent that * layout. The ZPL for example will usually have only about 10 different * layouts (regular files, device files, symlinks, * regular files + scanstamp, files/dir with extended attributes, and then * you have the possibility of all of those minus ACL, because it would * be kicked out into the spill block) * * Layouts are simply an array of the attributes and their * ordering i.e. [0, 1, 4, 5, 2] * * Each distinct layout is given a unique layout number and that is whats * stored in the header at the beginning of the SA data buffer. * * A layout only covers a single dbuf (bonus or spill). If a set of * attributes is split up between the bonus buffer and a spill buffer then * two different layouts will be used. This allows us to byteswap the * spill without looking at the bonus buffer and keeps the on disk format of * the bonus and spill buffer the same. * * Adding a single attribute will cause the entire set of attributes to * be rewritten and could result in a new layout number being constructed * as part of the rewrite if no such layout exists for the new set of * attribues. The new attribute will be appended to the end of the already * existing attributes. * * Both the attribute registration and attribute layout information are * stored in normal ZAP attributes. Their should be a small number of * known layouts and the set of attributes is assumed to typically be quite * small. * * The registered attributes and layout "table" information is maintained * in core and a special "sa_os_t" is attached to the objset_t. * * A special interface is provided to allow for quickly applying * a large set of attributes at once. sa_replace_all_by_template() is * used to set an array of attributes. This is used by the ZPL when * creating a brand new file. The template that is passed into the function * specifies the attribute, size for variable length attributes, location of * data and special "data locator" function if the data isn't in a contiguous * location. * * Byteswap implications: * Since the SA attributes are not entirely self describing we can't do * the normal byteswap processing. The special ZAP layout attribute and * attribute registration attributes define the byteswap function and the * size of the attributes, unless it is variable sized. * The normal ZFS byteswapping infrastructure assumes you don't need * to read any objects in order to do the necessary byteswapping. Whereas * SA attributes can only be properly byteswapped if the dataset is opened * and the layout/attribute ZAP attributes are available. Because of this * the SA attributes will be byteswapped when they are first accessed by * the SA code that will read the SA data. */ typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, uint16_t length, int length_idx, boolean_t, void *userp); static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data); static void sa_idx_tab_rele(objset_t *os, void *arg); static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, int buflen); static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx); arc_byteswap_func_t *sa_bswap_table[] = { byteswap_uint64_array, byteswap_uint32_array, byteswap_uint16_array, byteswap_uint8_array, zfs_acl_byteswap, }; #define SA_COPY_DATA(f, s, t, l) \ { \ if (f == NULL) { \ if (l == 8) { \ *(uint64_t *)t = *(uint64_t *)s; \ } else if (l == 16) { \ *(uint64_t *)t = *(uint64_t *)s; \ *(uint64_t *)((uintptr_t)t + 8) = \ *(uint64_t *)((uintptr_t)s + 8); \ } else { \ bcopy(s, t, l); \ } \ } else \ sa_copy_data(f, s, t, l); \ } /* * This table is fixed and cannot be changed. Its purpose is to * allow the SA code to work with both old/new ZPL file systems. * It contains the list of legacy attributes. These attributes aren't * stored in the "attribute" registry zap objects, since older ZPL file systems * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will * use this static table. */ sa_attr_reg_t sa_legacy_attrs[] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, }; /* * ZPL legacy layout * This is only used for objects of type DMU_OT_ZNODE */ sa_attr_type_t sa_legacy_zpl_layout[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; /* * Special dummy layout used for buffers with no attributes. */ sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; static int sa_legacy_attr_count = 16; static kmem_cache_t *sa_cache = NULL; /*ARGSUSED*/ static int sa_cache_constructor(void *buf, void *unused, int kmflag) { sa_handle_t *hdl = buf; hdl->sa_bonus_tab = NULL; hdl->sa_spill_tab = NULL; hdl->sa_os = NULL; hdl->sa_userp = NULL; hdl->sa_bonus = NULL; hdl->sa_spill = NULL; mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } /*ARGSUSED*/ static void sa_cache_destructor(void *buf, void *unused) { sa_handle_t *hdl = buf; mutex_destroy(&hdl->sa_lock); } void sa_cache_init(void) { sa_cache = kmem_cache_create("sa_cache", sizeof (sa_handle_t), 0, sa_cache_constructor, sa_cache_destructor, NULL, NULL, NULL, 0); } void sa_cache_fini(void) { if (sa_cache) kmem_cache_destroy(sa_cache); } static int layout_num_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = arg1; const sa_lot_t *node2 = arg2; if (node1->lot_num > node2->lot_num) return (1); else if (node1->lot_num < node2->lot_num) return (-1); return (0); } static int layout_hash_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = arg1; const sa_lot_t *node2 = arg2; if (node1->lot_hash > node2->lot_hash) return (1); if (node1->lot_hash < node2->lot_hash) return (-1); if (node1->lot_instance > node2->lot_instance) return (1); if (node1->lot_instance < node2->lot_instance) return (-1); return (0); } boolean_t sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) { int i; if (count != tbf->lot_attr_count) return (1); for (i = 0; i != count; i++) { if (attrs[i] != tbf->lot_attrs[i]) return (1); } return (0); } #define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) static uint64_t sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) { int i; uint64_t crc = -1ULL; for (i = 0; i != attr_count; i++) crc ^= SA_ATTR_HASH(attrs[i]); return (crc); } static int sa_get_spill(sa_handle_t *hdl) { int rc; if (hdl->sa_spill == NULL) { if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, &hdl->sa_spill)) == 0) VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } else { rc = 0; } return (rc); } /* * Main attribute lookup/update function * returns 0 for success or non zero for failures * * Operates on bulk array, first failure will abort further processing */ int sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, sa_data_op_t data_op, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; int i; int error = 0; sa_buf_type_t buftypes; buftypes = 0; ASSERT(count > 0); for (i = 0; i != count; i++) { ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); bulk[i].sa_addr = NULL; /* First check the bonus buffer */ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_bonus_tab, SA_GET_HDR(hdl, SA_BONUS), bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); if (tx && !(buftypes & SA_BONUS)) { dmu_buf_will_dirty(hdl->sa_bonus, tx); buftypes |= SA_BONUS; } } if (bulk[i].sa_addr == NULL && ((error = sa_get_spill(hdl)) == 0)) { if (TOC_ATTR_PRESENT( hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_spill_tab, SA_GET_HDR(hdl, SA_SPILL), bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); if (tx && !(buftypes & SA_SPILL) && bulk[i].sa_size == bulk[i].sa_length) { dmu_buf_will_dirty(hdl->sa_spill, tx); buftypes |= SA_SPILL; } } } if (error && error != ENOENT) { return ((error == ECKSUM) ? EIO : error); } switch (data_op) { case SA_LOOKUP: if (bulk[i].sa_addr == NULL) return (ENOENT); if (bulk[i].sa_data) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_addr, bulk[i].sa_data, bulk[i].sa_size); } continue; case SA_UPDATE: /* existing rewrite of attr */ if (bulk[i].sa_addr && bulk[i].sa_size == bulk[i].sa_length) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_addr, bulk[i].sa_length); continue; } else if (bulk[i].sa_addr) { /* attr size change */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_REPLACE, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } else { /* adding new attribute */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_ADD, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } if (error) return (error); break; } } return (error); } static sa_lot_t * sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; int i; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; if (zapadd) { char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { sa->sa_layout_attr_obj = zap_create_link(os, DMU_OT_SA_ATTR_LAYOUTS, sa->sa_master_obj, SA_LAYOUTS, tx); } (void) snprintf(attr_name, sizeof (attr_name), "%d", (int)lot_num); VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, attr_name, 2, attr_count, attrs, tx)); } list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), offsetof(sa_idx_tab_t, sa_next)); for (i = 0; i != attr_count; i++) { if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) tb->lot_var_sizes++; } avl_add(&sa->sa_layout_num_tree, tb); /* verify we don't have a hash collision */ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { for (; findtb && findtb->lot_hash == hash; findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { if (findtb->lot_instance != tb->lot_instance) break; tb->lot_instance++; } } avl_add(&sa->sa_layout_hash_tree, tb); return (tb); } static void sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, int count, dmu_tx_t *tx, sa_lot_t **lot) { sa_lot_t *tb, tbsearch; avl_index_t loc; sa_os_t *sa = os->os_sa; boolean_t found = B_FALSE; mutex_enter(&sa->sa_lock); tbsearch.lot_hash = hash; tbsearch.lot_instance = 0; tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); if (tb) { for (; tb && tb->lot_hash == hash; tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { if (sa_layout_equal(tb, attrs, count) == 0) { found = B_TRUE; break; } } } if (!found) { tb = sa_add_layout_entry(os, attrs, count, avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); } mutex_exit(&sa->sa_lock); *lot = tb; } static int sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) { int error; uint32_t blocksize; if (size == 0) { blocksize = SPA_MINBLOCKSIZE; } else if (size > SPA_MAXBLOCKSIZE) { ASSERT(0); return (EFBIG); } else { blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); } error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); ASSERT(error == 0); return (error); } static void sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) { if (func == NULL) { bcopy(datastart, target, buflen); } else { boolean_t start; int bytes; void *dataptr; void *saptr = target; uint32_t length; start = B_TRUE; bytes = 0; while (bytes < buflen) { func(&dataptr, &length, buflen, start, datastart); bcopy(dataptr, saptr, length); saptr = (void *)((caddr_t)saptr + length); bytes += length; start = B_FALSE; } } } /* * Determine several different sizes * first the sa header size * the number of bytes to be stored * if spill would occur the index in the attribute array is returned * * the boolean will_spill will be set when spilling is necessary. It * is only set when the buftype is SA_BONUS */ static int sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total, boolean_t *will_spill) { int var_size = 0; int i; int j = -1; int full_space; int hdrsize; boolean_t done = B_FALSE; if (buftype == SA_BONUS && sa->sa_force_spill) { *total = 0; *index = 0; *will_spill = B_TRUE; return (0); } *index = -1; *total = 0; if (buftype == SA_BONUS) *will_spill = B_FALSE; hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : sizeof (sa_hdr_phys_t); full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size; ASSERT(IS_P2ALIGNED(full_space, 8)); for (i = 0; i != attr_count; i++) { boolean_t is_var_sz; *total = P2ROUNDUP(*total, 8); *total += attr_desc[i].sa_length; if (done) goto next; is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); if (is_var_sz) { var_size++; } if (is_var_sz && var_size > 1) { if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + *total < full_space) { /* * Account for header space used by array of * optional sizes of variable-length attributes. * Record the index in case this increase needs * to be reversed due to spill-over. */ hdrsize += sizeof (uint16_t); j = i; } else { done = B_TRUE; *index = i; if (buftype == SA_BONUS) *will_spill = B_TRUE; continue; } } /* * find index of where spill *could* occur. * Then continue to count of remainder attribute * space. The sum is used later for sizing bonus * and spill buffer. */ if (buftype == SA_BONUS && *index == -1 && (*total + P2ROUNDUP(hdrsize, 8)) > (full_space - sizeof (blkptr_t))) { *index = i; done = B_TRUE; } next: if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && buftype == SA_BONUS) *will_spill = B_TRUE; } /* * j holds the index of the last variable-sized attribute for * which hdrsize was increased. Reverse the increase if that * attribute will be relocated to the spill block. */ if (*will_spill && j == *index) hdrsize -= sizeof (uint16_t); hdrsize = P2ROUNDUP(hdrsize, 8); return (hdrsize); } #define BUF_SPACE_NEEDED(total, header) (total + header) /* * Find layout that corresponds to ordering of attributes * If not found a new layout number is created and added to * persistent layout tables. */ static int sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; uint64_t hash; sa_buf_type_t buftype; sa_hdr_phys_t *sahdr; void *data_start; int buf_space; sa_attr_type_t *attrs, *attrs_start; int i, lot_count; - int hdrsize, spillhdrsize; + int hdrsize; + int spillhdrsize = 0; int used; dmu_object_type_t bonustype; sa_lot_t *lot; int len_idx; int spill_used; boolean_t spilling; dmu_buf_will_dirty(hdl->sa_bonus, tx); bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); /* first determine bonus header size and sum of all attributes */ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, SA_BONUS, &i, &used, &spilling); if (used > SPA_MAXBLOCKSIZE) return (EFBIG); VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) : used + hdrsize, tx)); ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || bonustype == DMU_OT_SA); /* setup and size spill buffer when needed */ if (spilling) { boolean_t dummy; if (hdl->sa_spill == NULL) { VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, &hdl->sa_spill) == 0); } dmu_buf_will_dirty(hdl->sa_spill, tx); spillhdrsize = sa_find_sizes(sa, &attr_desc[i], attr_count - i, hdl->sa_spill, SA_SPILL, &i, &spill_used, &dummy); if (spill_used > SPA_MAXBLOCKSIZE) return (EFBIG); buf_space = hdl->sa_spill->db_size - spillhdrsize; if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > hdl->sa_spill->db_size) VERIFY(0 == sa_resize_spill(hdl, BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); } /* setup starting pointers to lay down data */ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; buftype = SA_BONUS; if (spilling) buf_space = (sa->sa_force_spill) ? 0 : SA_BLKPTR_SPACE - hdrsize; else buf_space = hdl->sa_bonus->db_size - hdrsize; attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); lot_count = 0; for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { uint16_t length; ASSERT(IS_P2ALIGNED(data_start, 8)); ASSERT(IS_P2ALIGNED(buf_space, 8)); attrs[i] = attr_desc[i].sa_attr; length = SA_REGISTERED_LEN(sa, attrs[i]); if (length == 0) length = attr_desc[i].sa_length; else VERIFY(length == attr_desc[i].sa_length); if (buf_space < length) { /* switch to spill buffer */ VERIFY(spilling); VERIFY(bonustype == DMU_OT_SA); if (buftype == SA_BONUS && !sa->sa_force_spill) { sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); SA_SET_HDR(sahdr, lot->lot_num, hdrsize); } buftype = SA_SPILL; hash = -1ULL; len_idx = 0; sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; sahdr->sa_magic = SA_MAGIC; data_start = (void *)((uintptr_t)sahdr + spillhdrsize); attrs_start = &attrs[i]; buf_space = hdl->sa_spill->db_size - spillhdrsize; lot_count = 0; } hash ^= SA_ATTR_HASH(attrs[i]); attr_desc[i].sa_addr = data_start; attr_desc[i].sa_size = length; SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, data_start, length); if (sa->sa_attr_table[attrs[i]].sa_length == 0) { sahdr->sa_lengths[len_idx++] = length; } VERIFY((uintptr_t)data_start % 8 == 0); data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + length), 8); buf_space -= P2ROUNDUP(length, 8); lot_count++; } sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); /* * Verify that old znodes always have layout number 0. * Must be DMU_OT_SA for arbitrary layouts */ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || (bonustype == DMU_OT_SA && lot->lot_num > 1)); if (bonustype == DMU_OT_SA) { SA_SET_HDR(sahdr, lot->lot_num, buftype == SA_BONUS ? hdrsize : spillhdrsize); } kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); if (hdl->sa_bonus_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); hdl->sa_bonus_tab = NULL; } if (!sa->sa_force_spill) VERIFY(0 == sa_build_index(hdl, SA_BONUS)); if (hdl->sa_spill) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); if (!spilling) { /* * remove spill block that is no longer needed. */ dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; hdl->sa_spill_tab = NULL; VERIFY(0 == dmu_rm_spill(hdl->sa_os, sa_handle_object(hdl), tx)); } else { VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } } return (0); } static void sa_free_attr_table(sa_os_t *sa) { int i; if (sa->sa_attr_table == NULL) return; for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_name) kmem_free(sa->sa_attr_table[i].sa_name, strlen(sa->sa_attr_table[i].sa_name) + 1); } kmem_free(sa->sa_attr_table, sizeof (sa_attr_table_t) * sa->sa_num_attrs); sa->sa_attr_table = NULL; } static int sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) { sa_os_t *sa = os->os_sa; uint64_t sa_attr_count = 0; - uint64_t sa_reg_count; + uint64_t sa_reg_count = 0; int error = 0; uint64_t attr_value; sa_attr_table_t *tb; zap_cursor_t zc; zap_attribute_t za; int registered_count = 0; int i; dmu_objset_type_t ostype = dmu_objset_type(os); sa->sa_user_table = kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); if (sa->sa_reg_attr_obj != 0) { error = zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count); /* * Make sure we retrieved a count and that it isn't zero */ if (error || (error == 0 && sa_attr_count == 0)) { if (error == 0) error = EINVAL; goto bail; } sa_reg_count = sa_attr_count; } if (ostype == DMU_OST_ZFS && sa_attr_count == 0) sa_attr_count += sa_legacy_attr_count; /* Allocate attribute numbers for attributes that aren't registered */ for (i = 0; i != count; i++) { boolean_t found = B_FALSE; int j; if (ostype == DMU_OST_ZFS) { for (j = 0; j != sa_legacy_attr_count; j++) { if (strcmp(reg_attrs[i].sa_name, sa_legacy_attrs[j].sa_name) == 0) { sa->sa_user_table[i] = sa_legacy_attrs[j].sa_attr; found = B_TRUE; } } } if (found) continue; if (sa->sa_reg_attr_obj) error = zap_lookup(os, sa->sa_reg_attr_obj, reg_attrs[i].sa_name, 8, 1, &attr_value); else error = ENOENT; switch (error) { case ENOENT: sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; sa_attr_count++; break; case 0: sa->sa_user_table[i] = ATTR_NUM(attr_value); break; default: goto bail; } } sa->sa_num_attrs = sa_attr_count; tb = sa->sa_attr_table = kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); /* * Attribute table is constructed from requested attribute list, * previously foreign registered attributes, and also the legacy * ZPL set of attributes. */ if (sa->sa_reg_attr_obj) { for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t value; value = za.za_first_integer; registered_count++; tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); tb[ATTR_NUM(value)].sa_registered = B_TRUE; if (tb[ATTR_NUM(value)].sa_name) { continue; } tb[ATTR_NUM(value)].sa_name = kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, strlen(za.za_name) +1); } zap_cursor_fini(&zc); /* * Make sure we processed the correct number of registered * attributes */ if (registered_count != sa_reg_count) { ASSERT(error != 0); goto bail; } } if (ostype == DMU_OST_ZFS) { for (i = 0; i != sa_legacy_attr_count; i++) { if (tb[i].sa_name) continue; tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; tb[i].sa_length = sa_legacy_attrs[i].sa_length; tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; tb[i].sa_registered = B_FALSE; tb[i].sa_name = kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, KM_SLEEP); (void) strlcpy(tb[i].sa_name, sa_legacy_attrs[i].sa_name, strlen(sa_legacy_attrs[i].sa_name) + 1); } } for (i = 0; i != count; i++) { sa_attr_type_t attr_id; attr_id = sa->sa_user_table[i]; if (tb[attr_id].sa_name) continue; tb[attr_id].sa_length = reg_attrs[i].sa_length; tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; tb[attr_id].sa_attr = attr_id; tb[attr_id].sa_name = kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, strlen(reg_attrs[i].sa_name) + 1); } sa->sa_need_attr_registration = (sa_attr_count != registered_count); return (0); bail: kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); sa->sa_user_table = NULL; sa_free_attr_table(sa); return ((error != 0) ? error : EINVAL); } int sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, sa_attr_type_t **user_table) { zap_cursor_t zc; zap_attribute_t za; sa_os_t *sa; dmu_objset_type_t ostype = dmu_objset_type(os); sa_attr_type_t *tb; int error; mutex_enter(&os->os_lock); if (os->os_sa) { mutex_enter(&os->os_sa->sa_lock); mutex_exit(&os->os_lock); tb = os->os_sa->sa_user_table; mutex_exit(&os->os_sa->sa_lock); *user_table = tb; return (0); } sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); sa->sa_master_obj = sa_obj; os->os_sa = sa; mutex_enter(&sa->sa_lock); mutex_exit(&os->os_lock); avl_create(&sa->sa_layout_num_tree, layout_num_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); if (sa_obj) { error = zap_lookup(os, sa_obj, SA_LAYOUTS, 8, 1, &sa->sa_layout_attr_obj); if (error != 0 && error != ENOENT) goto fail; error = zap_lookup(os, sa_obj, SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj); if (error != 0 && error != ENOENT) goto fail; } if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) goto fail; if (sa->sa_layout_attr_obj != 0) { uint64_t layout_count; error = zap_count(os, sa->sa_layout_attr_obj, &layout_count); /* * Layout number count should be > 0 */ if (error || (error == 0 && layout_count == 0)) { if (error == 0) error = EINVAL; goto fail; } for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { sa_attr_type_t *lot_attrs; uint64_t lot_num; lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * za.za_num_integers, KM_SLEEP); if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, za.za_name, 2, za.za_num_integers, lot_attrs))) != 0) { kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); break; } VERIFY(ddi_strtoull(za.za_name, NULL, 10, (unsigned long long *)&lot_num) == 0); (void) sa_add_layout_entry(os, lot_attrs, za.za_num_integers, lot_num, sa_layout_info_hash(lot_attrs, za.za_num_integers), B_FALSE, NULL); kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); } zap_cursor_fini(&zc); /* * Make sure layout count matches number of entries added * to AVL tree */ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { ASSERT(error != 0); goto fail; } } /* Add special layout number for old ZNODES */ if (ostype == DMU_OST_ZFS) { (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, sa_legacy_attr_count, 0, sa_layout_info_hash(sa_legacy_zpl_layout, sa_legacy_attr_count), B_FALSE, NULL); (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, 0, B_FALSE, NULL); } *user_table = os->os_sa->sa_user_table; mutex_exit(&sa->sa_lock); return (0); fail: os->os_sa = NULL; sa_free_attr_table(sa); if (sa->sa_user_table) kmem_free(sa->sa_user_table, sa->sa_user_table_sz); mutex_exit(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); return ((error == ECKSUM) ? EIO : error); } void sa_tear_down(objset_t *os) { sa_os_t *sa = os->os_sa; sa_lot_t *layout; void *cookie; kmem_free(sa->sa_user_table, sa->sa_user_table_sz); /* Free up attr table */ sa_free_attr_table(sa); cookie = NULL; while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) { sa_idx_tab_t *tab; while (tab = list_head(&layout->lot_idx_tab)) { ASSERT(refcount_count(&tab->sa_refcount)); sa_idx_tab_rele(os, tab); } } cookie = NULL; while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) { kmem_free(layout->lot_attrs, sizeof (sa_attr_type_t) * layout->lot_attr_count); kmem_free(layout, sizeof (sa_lot_t)); } avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); kmem_free(sa, sizeof (sa_os_t)); os->os_sa = NULL; } void sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t var_length, void *userp) { sa_idx_tab_t *idx_tab = userp; if (var_length) { ASSERT(idx_tab->sa_variable_lengths); idx_tab->sa_variable_lengths[length_idx] = length; } TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); } static void sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, sa_iterfunc_t func, sa_lot_t *tab, void *userp) { void *data_start; sa_lot_t *tb = tab; sa_lot_t search; avl_index_t loc; sa_os_t *sa = os->os_sa; int i; uint16_t *length_start = NULL; uint8_t length_idx = 0; if (tab == NULL) { search.lot_num = SA_LAYOUT_NUM(hdr, type); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); ASSERT(tb); } if (IS_SA_BONUSTYPE(type)) { data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + offsetof(sa_hdr_phys_t, sa_lengths) + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); length_start = hdr->sa_lengths; } else { data_start = hdr; } for (i = 0; i != tb->lot_attr_count; i++) { int attr_length, reg_length; uint8_t idx_len; reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; if (reg_length) { attr_length = reg_length; idx_len = 0; } else { attr_length = length_start[length_idx]; idx_len = length_idx++; } func(hdr, data_start, tb->lot_attrs[i], attr_length, idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp); data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + attr_length), 8); } } /*ARGSUSED*/ void sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t variable_length, void *userp) { sa_handle_t *hdl = userp; sa_os_t *sa = hdl->sa_os->os_sa; sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); } void sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); dmu_buf_impl_t *db; sa_os_t *sa = hdl->sa_os->os_sa; int num_lengths = 1; int i; ASSERT(MUTEX_HELD(&sa->sa_lock)); if (sa_hdr_phys->sa_magic == SA_MAGIC) return; db = SA_GET_DB(hdl, buftype); if (buftype == SA_SPILL) { arc_release(db->db_buf, NULL); arc_buf_thaw(db->db_buf); } sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); /* * Determine number of variable lenghts in header * The standard 8 byte header has one for free and a * 16 byte header would have 4 + 1; */ if (SA_HDR_SIZE(sa_hdr_phys) > 8) num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; for (i = 0; i != num_lengths; i++) sa_hdr_phys->sa_lengths[i] = BSWAP_16(sa_hdr_phys->sa_lengths[i]); sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, sa_byteswap_cb, NULL, hdl); if (buftype == SA_SPILL) arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); } static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys; dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); sa_os_t *sa = hdl->sa_os->os_sa; sa_idx_tab_t *idx_tab; sa_hdr_phys = SA_GET_HDR(hdl, buftype); mutex_enter(&sa->sa_lock); /* Do we need to byteswap? */ /* only check if not old znode */ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && sa_hdr_phys->sa_magic != 0) { VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC); sa_byteswap(hdl, buftype); } idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); if (buftype == SA_BONUS) hdl->sa_bonus_tab = idx_tab; else hdl->sa_spill_tab = idx_tab; mutex_exit(&sa->sa_lock); return (0); } /*ARGSUSED*/ void sa_evict(dmu_buf_t *db, void *sap) { panic("evicting sa dbuf %p\n", (void *)db); } static void sa_idx_tab_rele(objset_t *os, void *arg) { sa_os_t *sa = os->os_sa; sa_idx_tab_t *idx_tab = arg; if (idx_tab == NULL) return; mutex_enter(&sa->sa_lock); if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); if (idx_tab->sa_variable_lengths) kmem_free(idx_tab->sa_variable_lengths, sizeof (uint16_t) * idx_tab->sa_layout->lot_var_sizes); refcount_destroy(&idx_tab->sa_refcount); kmem_free(idx_tab->sa_idx_tab, sizeof (uint32_t) * sa->sa_num_attrs); kmem_free(idx_tab, sizeof (sa_idx_tab_t)); } mutex_exit(&sa->sa_lock); } static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) { sa_os_t *sa = os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); (void) refcount_add(&idx_tab->sa_refcount, NULL); } void sa_handle_destroy(sa_handle_t *hdl) { mutex_enter(&hdl->sa_lock); (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl, NULL, NULL, NULL); if (hdl->sa_bonus_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); hdl->sa_bonus_tab = NULL; } if (hdl->sa_spill_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); hdl->sa_spill_tab = NULL; } dmu_buf_rele(hdl->sa_bonus, NULL); if (hdl->sa_spill) dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); mutex_exit(&hdl->sa_lock); kmem_cache_free(sa_cache, hdl); } int sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { int error = 0; dmu_object_info_t doi; sa_handle_t *handle; #ifdef ZFS_DEBUG dmu_object_info_from_db(db, &doi); ASSERT(doi.doi_bonus_type == DMU_OT_SA || doi.doi_bonus_type == DMU_OT_ZNODE); #endif /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL; if (handle == NULL) { sa_handle_t *newhandle; handle = kmem_cache_alloc(sa_cache, KM_SLEEP); handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; handle->sa_spill = NULL; error = sa_build_index(handle, SA_BONUS); newhandle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_set_user_ie(db, handle, NULL, sa_evict) : NULL; if (newhandle != NULL) { kmem_cache_free(sa_cache, handle); handle = newhandle; } } *handlepp = handle; return (error); } int sa_handle_get(objset_t *objset, uint64_t objid, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { dmu_buf_t *db; int error; if (error = dmu_bonus_hold(objset, objid, NULL, &db)) return (error); return (sa_handle_get_from_db(objset, db, userp, hdl_type, handlepp)); } int sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) { return (dmu_bonus_hold(objset, obj_num, tag, db)); } void sa_buf_rele(dmu_buf_t *db, void *tag) { dmu_buf_rele(db, tag); } int sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); } int sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) { int error; sa_bulk_attr_t bulk; bulk.sa_attr = attr; bulk.sa_data = buf; bulk.sa_length = buflen; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_lookup_impl(hdl, &bulk, 1); mutex_exit(&hdl->sa_lock); return (error); } #ifdef _KERNEL int sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) { int error; sa_bulk_attr_t bulk; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, uio->uio_resid), UIO_READ, uio); } mutex_exit(&hdl->sa_lock); return (error); } #endif void * sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data) { sa_idx_tab_t *idx_tab; sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data; sa_os_t *sa = os->os_sa; sa_lot_t *tb, search; avl_index_t loc; /* * Deterimine layout number. If SA node and header == 0 then * force the index table to the dummy "1" empty layout. * * The layout number would only be zero for a newly created file * that has not added any attributes yet, or with crypto enabled which * doesn't write any attributes to the bonus buffer. */ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); /* Verify header size is consistent with layout information */ ASSERT(tb); ASSERT(IS_SA_BONUSTYPE(bonustype) && SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) || (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); /* * See if any of the already existing TOC entries can be reused? */ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { boolean_t valid_idx = B_TRUE; int i; if (tb->lot_var_sizes != 0 && idx_tab->sa_variable_lengths != NULL) { for (i = 0; i != tb->lot_var_sizes; i++) { if (hdr->sa_lengths[i] != idx_tab->sa_variable_lengths[i]) { valid_idx = B_FALSE; break; } } } if (valid_idx) { sa_idx_tab_hold(os, idx_tab); return (idx_tab); } } /* No such luck, create a new entry */ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); idx_tab->sa_idx_tab = kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); idx_tab->sa_layout = tb; refcount_create(&idx_tab->sa_refcount); if (tb->lot_var_sizes) idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * tb->lot_var_sizes, KM_SLEEP); sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, tb, idx_tab); sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ sa_idx_tab_hold(os, idx_tab); /* one for layout */ list_insert_tail(&tb->lot_idx_tab, idx_tab); return (idx_tab); } void sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, boolean_t start, void *userdata) { ASSERT(start); *dataptr = userdata; *len = total_len; } static void sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) { uint64_t attr_value = 0; sa_os_t *sa = hdl->sa_os->os_sa; sa_attr_table_t *tb = sa->sa_attr_table; int i; mutex_enter(&sa->sa_lock); if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { mutex_exit(&sa->sa_lock); return; } if (sa->sa_reg_attr_obj == 0) { sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, DMU_OT_SA_ATTR_REGISTRATION, sa->sa_master_obj, SA_REGISTRY, tx); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) continue; ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, tb[i].sa_byteswap); VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, tb[i].sa_name, 8, 1, &attr_value, tx)); tb[i].sa_registered = B_TRUE; } sa->sa_need_attr_registration = B_FALSE; mutex_exit(&sa->sa_lock); } /* * Replace all attributes with attributes specified in template. * If dnode had a spill buffer then those attributes will be * also be replaced, possibly with just an empty spill block * * This interface is intended to only be used for bulk adding of * attributes for a new file. It will also be used by the ZPL * when converting and old formatted znode to native SA support. */ int sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); } int sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_replace_all_by_template_locked(hdl, attr_desc, attr_count, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Add/remove a single attribute or replace a variable-sized attribute value * with a value of a different size, and then rewrite the entire set * of attributes. * Same-length attribute value replacement (including fixed-length attributes) * is handled more efficiently by the upper layers. */ static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; - int bonus_data_size, spill_data_size; + int bonus_data_size = 0; + int spill_data_size = 0; int spill_attr_count = 0; int error; uint16_t length; int i, j, k, length_idx; sa_hdr_phys_t *hdr; sa_idx_tab_t *idx_tab; int attr_count; int count; ASSERT(MUTEX_HELD(&hdl->sa_lock)); /* First make of copy of the old data */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); bcopy(hdl->sa_bonus->db_data, old_data[0], hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { old_data[0] = NULL; } DB_DNODE_EXIT(db); /* Bring spill buffer online if it isn't currently */ if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); bcopy(hdl->sa_spill->db_data, old_data[1], hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; } else if (error && error != ENOENT) { if (old_data[0]) kmem_free(old_data[0], bonus_data_size); return (error); } else { old_data[1] = NULL; } /* build descriptor of all attributes */ attr_count = bonus_attr_count + spill_attr_count; if (action == SA_ADD) attr_count++; else if (action == SA_REMOVE) attr_count--; attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); /* * loop through bonus and spill buffer if it exists, and * build up new attr_descriptor to reset the attributes */ k = j = 0; count = bonus_attr_count; hdr = SA_GET_HDR(hdl, SA_BONUS); idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); for (; k != 2; k++) { /* iterate over each attribute in layout */ for (i = 0, length_idx = 0; i != count; i++) { sa_attr_type_t attr; attr = idx_tab->sa_layout->lot_attrs[i]; if (attr == newattr) { /* duplicate attributes are not allowed */ ASSERT(action == SA_REPLACE || action == SA_REMOVE); /* must be variable-sized to be replaced here */ if (action == SA_REPLACE) { ASSERT(SA_REGISTERED_LEN(sa, attr) == 0); SA_ADD_BULK_ATTR(attr_desc, j, attr, locator, datastart, buflen); } } else { length = SA_REGISTERED_LEN(sa, attr); if (length == 0) { length = hdr->sa_lengths[length_idx]; } SA_ADD_BULK_ATTR(attr_desc, j, attr, NULL, (void *) (TOC_OFF(idx_tab->sa_idx_tab[attr]) + (uintptr_t)old_data[k]), length); } if (SA_REGISTERED_LEN(sa, attr) == 0) length_idx++; } if (k == 0 && hdl->sa_spill) { hdr = SA_GET_HDR(hdl, SA_SPILL); idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); count = spill_attr_count; } else { break; } } if (action == SA_ADD) { length = SA_REGISTERED_LEN(sa, newattr); if (length == 0) { length = buflen; } SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, datastart, buflen); } ASSERT3U(j, ==, attr_count); error = sa_build_layouts(hdl, attr_desc, attr_count, tx); if (old_data[0]) kmem_free(old_data[0], bonus_data_size); if (old_data[1]) kmem_free(old_data[1], spill_data_size); kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); return (error); } static int sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, dmu_tx_t *tx) { int error; sa_os_t *sa = hdl->sa_os->os_sa; dmu_object_type_t bonustype; bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); /* sync out registration table if necessary */ if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) sa->sa_update_cb(hdl, tx); return (error); } /* * update or add new attribute */ int sa_update(sa_handle_t *hdl, sa_attr_type_t type, void *buf, uint32_t buflen, dmu_tx_t *tx) { int error; sa_bulk_attr_t bulk; bulk.sa_attr = type; bulk.sa_data_func = NULL; bulk.sa_length = buflen; bulk.sa_data = buf; mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, &bulk, 1, tx); mutex_exit(&hdl->sa_lock); return (error); } int sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) { int error; sa_bulk_attr_t bulk; bulk.sa_attr = attr; bulk.sa_data = userdata; bulk.sa_data_func = locator; bulk.sa_length = buflen; mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, &bulk, 1, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Return size of an attribute */ int sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) { sa_bulk_attr_t bulk; int error; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { mutex_exit(&hdl->sa_lock); return (error); } *size = bulk.sa_size; mutex_exit(&hdl->sa_lock); return (0); } int sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_lookup_impl(hdl, attrs, count)); } int sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_lookup_locked(hdl, attrs, count); mutex_exit(&hdl->sa_lock); return (error); } int sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, attrs, count, tx); mutex_exit(&hdl->sa_lock); return (error); } int sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, NULL, 0, tx); mutex_exit(&hdl->sa_lock); return (error); } void sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) { dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); } void sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) { dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, blksize, nblocks); } void sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl) { (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus, oldhdl, newhdl, NULL, sa_evict); oldhdl->sa_bonus = NULL; } void sa_set_userp(sa_handle_t *hdl, void *ptr) { hdl->sa_userp = ptr; } dmu_buf_t * sa_get_db(sa_handle_t *hdl) { return ((dmu_buf_t *)hdl->sa_bonus); } void * sa_get_userdata(sa_handle_t *hdl) { return (hdl->sa_userp); } void sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) { ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); os->os_sa->sa_update_cb = func; } void sa_register_update_callback(objset_t *os, sa_update_cb_t *func) { mutex_enter(&os->os_sa->sa_lock); sa_register_update_callback_locked(os, func); mutex_exit(&os->os_sa->sa_lock); } uint64_t sa_handle_object(sa_handle_t *hdl) { return (hdl->sa_bonus->db_object); } boolean_t sa_enabled(objset_t *os) { return (os->os_sa == NULL); } int sa_set_sa_object(objset_t *os, uint64_t sa_object) { sa_os_t *sa = os->os_sa; if (sa->sa_master_obj) return (1); sa->sa_master_obj = sa_object; return (0); } int sa_hdrsize(void *arg) { sa_hdr_phys_t *hdr = arg; return (SA_HDR_SIZE(hdr)); } void sa_handle_lock(sa_handle_t *hdl) { ASSERT(hdl); mutex_enter(&hdl->sa_lock); } void sa_handle_unlock(sa_handle_t *hdl) { ASSERT(hdl); mutex_exit(&hdl->sa_lock); } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 247192) @@ -1,6645 +1,6646 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ /* * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" /* Check hostid on import? */ static int check_hostid = 1; SYSCTL_DECL(_vfs_zfs); TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, "Check hostid on import?"); typedef enum zti_modes { zti_mode_fixed, /* value is # of threads (min 1) */ zti_mode_online_percent, /* value is % of online CPUs */ zti_mode_batch, /* cpu-intensive; value is ignored */ zti_mode_null, /* don't create a taskq */ zti_nmodes } zti_modes_t; #define ZTI_FIX(n) { zti_mode_fixed, (n) } #define ZTI_PCT(n) { zti_mode_online_percent, (n) } #define ZTI_BATCH { zti_mode_batch, 0 } #define ZTI_NULL { zti_mode_null, 0 } #define ZTI_ONE ZTI_FIX(1) typedef struct zio_taskq_info { enum zti_modes zti_mode; uint_t zti_value; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "issue", "issue_high", "intr", "intr_high" }; /* * Define the taskq threads for the following I/O types: * NULL, READ, WRITE, FREE, CLAIM, and IOCTL */ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; static dsl_syncfunc_t spa_sync_version; static dsl_syncfunc_t spa_sync_props; static dsl_checkfunc_t spa_change_guid_check; static dsl_syncfunc_t spa_change_guid_sync; static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ #ifdef PSRSET_BIND id_t zio_taskq_psrset_bind = PS_NONE; #endif #ifdef SYSDC boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ #endif uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ extern int zfs_sync_pass_deferred_free; /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); if (strval != NULL) VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); else VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size; uint64_t alloc; uint64_t space; uint64_t cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(spa_normal_class(spa)); size = metaslab_class_get_space(spa_normal_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); space = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; space += tvd->vdev_max_asize - tvd->vdev_asize; } spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == FREAD), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); } if (pool != NULL) { dsl_dir_t *freedir = pool->dp_free_dir; /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ if (freedir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, freedir->dd_phys->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; int err; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) { mutex_exit(&spa->spa_props_lock); return (0); } /* * Get properties from the MOS pool property object. */ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_pool_t *dp; dsl_dataset_t *ds = NULL; dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); if (err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds)) { rw_exit(&dp->dp_config_rwlock); break; } strval = kmem_alloc( MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); rw_exit(&dp->dp_config_rwlock); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } spa_prop_add_list(*nvp, prop, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); mutex_exit(&spa->spa_props_lock); out: if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; - uint64_t objnum; + uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPROP_INVAL: if (!zpool_prop_feature(propname)) { error = EINVAL; break; } /* * Sanitize the input. */ if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = EINVAL; break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } if (intval != 0) { error = EINVAL; break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = EINVAL; break; } has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = EINVAL; break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = EINVAL; break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = ENOTSUP; break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = ENOTSUP; break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; uint64_t compress; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } if (error = dmu_objset_hold(strval, FTAG, &os)) break; /* Must be ZPL and not gzip compressed. */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = ENOTSUP; } else if ((error = dsl_prop_get_integer(strval, zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL)) == 0 && !BOOTFS_COMPRESS_VALID(compress)) { error = ENOTSUP; } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < ZIO_FAILURE_MODE_WAIT || intval > ZIO_FAILURE_MODE_PANIC)) error = EINVAL; /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = EIO; } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = EINVAL; break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = EINVAL; break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { /* * The kernel doesn't have an easy isprint() * check. For this kernel check, we merely * check ASCII apart from DEL. Fix this if * there is an easy-to-use kernel isprint(). */ if (*check >= 0x7f) { error = EINVAL; break; } check++; } if (strlen(strval) > ZPROP_MAX_COMMENT) error = E2BIG; break; case ZPOOL_PROP_DEDUPDITTO: if (spa_version(spa) < SPA_VERSION_DEDUP) error = ENOTSUP; else error = nvpair_value_uint64(elem, &intval); if (error == 0 && intval != 0 && intval < ZIO_DEDUPDITTO_MIN) error = EINVAL; break; } if (error) break; } if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { uint64_t ver; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. */ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_version, spa, &ver, 6); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, spa, nvp, 6)); } return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } /*ARGSUSED*/ static int spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; uint64_t *newguid = arg2; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (ENXIO); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; uint64_t *newguid = arg2; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); #ifdef __FreeBSD__ /* * TODO: until recent illumos logging changes are merged * log reguid as pool property change */ spa_history_log_internal(LOG_POOL_PROPSET, spa, tx, "guid change old=%llu new=%llu", oldguid, *newguid); #else spa_history_log_internal(spa, "guid change", tx, "old=%lld new=%lld", oldguid, *newguid); #endif } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. */ int spa_change_guid(spa_t *spa) { int error; uint64_t guid; mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check, spa_change_guid_sync, spa, &guid, 5); if (error == 0) { spa_config_sync(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { spa_error_entry_t *sa = (spa_error_entry_t *)a; spa_error_entry_t *sb = (spa_error_entry_t *)b; int ret; ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_t)); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. */ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static taskq_t * spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, uint_t value) { uint_t flags = TASKQ_PREPOPULATE; boolean_t batch = B_FALSE; switch (mode) { case zti_mode_null: return (NULL); /* no taskq needed */ case zti_mode_fixed: ASSERT3U(value, >=, 1); value = MAX(value, 1); break; case zti_mode_batch: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = zio_taskq_batch_pct; break; case zti_mode_online_percent: flags |= TASKQ_THREADS_CPU_PCT; break; default: panic("unrecognized mode for %s taskq (%u:%u) in " "spa_activate()", name, mode, value); break; } #ifdef SYSDC if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; return (taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags)); } #endif return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, spa->spa_proc, flags)); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; char name[32]; (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); spa->spa_zio_taskq[t][q] = spa_taskq_create(spa, name, mode, value); } } } #ifdef _KERNEL #ifdef SPA_PROCESS static void spa_thread(void *arg) { callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); #ifdef PSRSET_BIND /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } #endif #ifdef SYSDC if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } #endif spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif /* SPA_PROCESS */ #endif /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, int mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; #ifdef SPA_PROCESS /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } #endif /* SPA_PROCESS */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ ASSERT(spa->spa_proc == &p0); if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } /* * Start TRIM thread. */ trim_thread_create(spa); list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } /* * Opposite of spa_activate(). */ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); /* * Stop TRIM thread in case spa_unload() wasn't called directly * before spa_deactivate(). */ trim_thread_destroy(spa); txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_state_dirty_list); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { if (spa->spa_zio_taskq[t][q] != NULL) taskq_destroy(spa->spa_zio_taskq[t][q]); spa->spa_zio_taskq[t][q] = NULL; } } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); #ifdef SPA_PROCESS /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } #endif /* SPA_PROCESS */ } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ static int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (EINVAL); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { int i; ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Stop TRIM thread. */ trim_thread_destroy(spa); /* * Stop async tasks. */ spa_async_suspend(spa); /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { (void) zio_wait(spa->spa_async_zio_root); spa->spa_async_zio_root = NULL; } bpobj_close(&spa->spa_deferred_bpobj); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); for (i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } spa_config_exit(spa, SCL_ALL, FTAG); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ static void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. */ for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. */ static void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (sav->sav_config != NULL) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); } else { nl2cache = 0; + newvdevs = NULL; } oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); } } /* * Purge vdevs that were dropped */ for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); if (sav->sav_config == NULL) goto out; sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. */ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); return (error); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); } } /* * Validate the current config against the MOS config */ static boolean_t spa_config_valid(spa_t *spa, nvlist_t *config) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); /* * If we're doing a normal import, then build up any additional * diagnostic information about missing devices in this config. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; vdev_t *mtvd = mrvd->vdev_child[c]; if (tvd->vdev_ops == &vdev_missing_ops && mtvd->vdev_ops != &vdev_missing_ops && mtvd->vdev_islog) child[idx++] = vdev_config_generate(spa, mtvd, B_FALSE, 0); } if (idx) { VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, idx) == 0); VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); for (int i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); } /* * Compare the root vdev tree with the information we have * from the MOS config (mrvd). Check each top-level vdev * with the corresponding MOS config top-level (mtvd). */ for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; vdev_t *mtvd = mrvd->vdev_child[c]; /* * Resolve any "missing" vdevs in the current configuration. * If we find that the MOS config has more accurate information * about the top-level vdev then use that vdev instead. */ if (tvd->vdev_ops == &vdev_missing_ops && mtvd->vdev_ops != &vdev_missing_ops) { if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) continue; /* * Device specific actions. */ if (mtvd->vdev_islog) { spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * XXX - once we have 'readonly' pool * support we should be able to handle * missing data devices by transitioning * the pool to readonly. */ continue; } /* * Swap the missing vdev with the data we were * able to obtain from the MOS config. */ vdev_remove_child(rvd, tvd); vdev_remove_child(mrvd, mtvd); vdev_add_child(rvd, mtvd); vdev_add_child(mrvd, tvd); spa_config_exit(spa, SCL_ALL, FTAG); vdev_load(mtvd); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_reopen(rvd); } else if (mtvd->vdev_islog) { /* * Load the slog device's state from the MOS config * since it's possible that the label does not * contain the most up-to-date information. */ vdev_load_log_state(tvd, mtvd); vdev_reopen(tvd); } } vdev_free(mrvd); spa_config_exit(spa, SCL_ALL, FTAG); /* * Ensure we were able to validate the config. */ return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); } /* * Check for missing log devices */ static int spa_check_logs(spa_t *spa) { switch (spa->spa_log_state) { case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, DS_FIND_CHILDREN)) { spa_set_log_state(spa, SPA_LOG_MISSING); return (1); } break; } return (0); } static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { metaslab_group_passivate(mg); slog_found = B_TRUE; } } return (slog_found); } static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) metaslab_group_activate(mg); } } int spa_offline_log(spa_t *spa) { int error = 0; if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, NULL, DS_FIND_CHILDREN)) == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { int i; for (i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_add_64(&sle->sle_meta_count, 1); else atomic_add_64(&sle->sle_data_count, 1); } zio_data_buf_free(zio->io_data, zio->io_size); } /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { if (bp != NULL) { zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); void *data = zio_data_buf_alloc(size); zio_nowait(zio_read(rio, spa, bp, data, size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); } return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_rewind_policy_t policy; boolean_t verify_ok = B_FALSE; int error; zpool_get_rewind_policy(spa->spa_config, &policy); if (policy.zrp_request & ZPOOL_NEVER_REWIND) return (0); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); (void) zio_wait(rio); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && sle.sle_data_count <= policy.zrp_maxdata) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); VERIFY(nvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (error) { if (error != ENXIO && error != EIO) error = EIO; return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) { return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val)); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (err); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig) { nvlist_t *config = spa->spa_config; char *ereport = FM_EREPORT_ZFS_POOL; char *comment; int error; uint64_t pool_guid; nvlist_t *nvl; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) return (EINVAL); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { error = EEXIST; } else { spa->spa_config_guid = pool_guid; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) { VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, KM_SLEEP) == 0); } nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); } spa->spa_minref = refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; return (error); } /* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ static int spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, char **ereport) { int error = 0; nvlist_t *nvroot = NULL; nvlist_t *label; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; uint64_t obj; boolean_t missing_feat_write = B_FALSE; /* * If this is an untrusted config, access the pool in read-only mode. * This prevents things like resilvering recently removed devices. */ if (!mosconfig) spa->spa_mode = FREAD; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa->spa_load_state = state; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) return (EINVAL); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) return (error); ASSERT(spa->spa_root_vdev == rvd); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } /* * Try to open all vdevs, loading each label in the process. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) return (error); /* * We need to validate the vdev labels against the configuration that * we have in hand, which is dependent on the setting of mosconfig. If * mosconfig is true then we're validating the vdev labels based on * that config. Otherwise, we're validating against the cached config * (zpool.cache) that was read when we loaded the zfs module, and then * later we will recursively call spa_load() and validate against * the vdev config. * * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd, mosconfig); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) return (error); if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) return (ENXIO); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL || nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { VERIFY(nvlist_add_string(unsup_feat, nvpair_name(nvp), "") == 0); } } if (!nvlist_empty(unsup_feat)) { VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); nvlist_free(unsup_feat); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } /* * If the vdev guid sum doesn't match the uberblock, we have an * incomplete configuration. We first check to see if the pool * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). * If it is, defer the vdev_guid_sum check till later so we * can handle missing vdevs. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && rvd->vdev_guid_sum != ub->ub_guid_sum) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. */ spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!feature_is_supported(spa->spa_meta_objset, spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { if (!feature_is_supported(spa->spa_meta_objset, spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj, unsup_feat, enabled_feat)) { missing_feat_write = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. */ if (missing_feat_read || (missing_feat_write && spa_writeable(spa))) { return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } } spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!mosconfig) { uint64_t hostid; nvlist_t *policy = NULL, *nvconfig; if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; VERIFY(nvlist_lookup_string(nvconfig, ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); #ifdef _KERNEL myhostid = zone_get_hostid(NULL); #else /* _KERNEL */ /* * We're emulating the system's hostid in userland, so * we can't use zone_get_hostid(). */ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); #endif /* _KERNEL */ if (check_hostid && hostid != 0 && myhostid != 0 && hostid != myhostid) { nvlist_free(nvconfig); cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%lx). " "See: http://illumos.org/msg/ZFS-8000-EY", spa_name(spa), hostname, (unsigned long)hostid); return (EBADF); } } if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_REWIND_POLICY, &policy) == 0) VERIFY(nvlist_add_nvlist(nvconfig, ZPOOL_REWIND_POLICY, policy) == 0); spa_config_set(spa, nvconfig); spa_unload(spa); spa_deactivate(spa); spa_activate(spa, orig_mode); return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. */ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, &spa->spa_dedup_ditto); spa->spa_autoreplace = (autoreplace != 0); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev state for all toplevel vdevs. */ vdev_load(rvd); /* * Propagate the leaf DTLs we just loaded all the way up the tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); /* * Load the DDTs (dedup tables). */ error = ddt_load(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_update_dspace(spa); /* * Validate the config, using the MOS config to fill in any * information which might be missing. If we fail to validate * the config then declare the pool unfit for use. If we're * assembling a pool from a split, the log is not transferred * over. */ if (type != SPA_IMPORT_ASSEMBLE) { nvlist_t *nvconfig; if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!spa_config_valid(spa, nvconfig)) { nvlist_free(nvconfig); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } nvlist_free(nvconfig); /* * Now that we've validated the config, check the state of the * root vdev. If it can't be opened, it indicates one or * more toplevel vdevs are faulted. */ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) return (ENXIO); if (spa_check_logs(spa)) { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } if (missing_feat_write) { ASSERT(state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (state != SPA_LOAD_TRYIMPORT) { if (error = spa_load_verify(spa)) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; int need_update = B_FALSE; ASSERT(state != SPA_LOAD_TRYIMPORT); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(spa_get_dsl(spa), spa_first_txg(spa)); (void) dmu_objset_find(spa_name(spa), zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by either zil_check_log_chain() * (invoked from spa_check_logs()) or zil_claim() above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || state == SPA_LOAD_IMPORT || state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asychronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); /* * Check all DTLs to see if anything needs resilvering. */ if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(rvd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); /* * Delete any inconsistent datasets. */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); } return (0); } static int spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) { int mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg--; spa_activate(spa, mode); spa_async_suspend(spa); return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). */ static int spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig); if (load_error == 0) return (0); if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state, mosconfig); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; int firstopen = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. */ if (mutex_owner(&spa_namespace_lock) != curthread) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (ENOENT); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_rewind_policy_t policy; firstopen = B_TRUE; zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zrp_request & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, policy.zrp_request); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (ENOENT); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER) { VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); #ifdef __FreeBSD__ #ifdef _KERNEL if (firstopen) zvol_create_minors(pool); #endif #endif } *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); if (nl2cache != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); } } } static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; zap_cursor_t zc; zap_attribute_t za; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features) == 0); nvlist_free(features); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; VERIFY(nvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); if (spa_suspended(spa)) VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (EINVAL); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (ENOTSUP); /* * Set the pending device list so we correctly handle device in-use * checking. */ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = EINVAL; goto out; } /* * The L2ARC currently only supports disk devices in * kernel context. For user-level testing, we allow it. */ #ifdef _KERNEL if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { error = ENOTBLK; vdev_free(vd); goto out; } #endif vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatentating with the * current dev list. */ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs) == 0); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) VERIFY(nvlist_dup(olddevs[i], &newdevs[i], KM_SLEEP) == 0); for (i = 0; i < ndevs; i++) VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], KM_SLEEP) == 0); VERIFY(nvlist_remove(sav->sav_config, config, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, newdevs, ndevs + oldndevs) == 0); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs) == 0); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, const char *history_str, nvlist_t *zplprops) { spa_t *spa; char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; boolean_t has_features; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (EEXIST); } /* * Allocate a new spa_t structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, NULL, altroot); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } has_features = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) has_features = B_TRUE; } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = EINVAL; if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (int c = 0; c < rvd->vdev_children; c++) { vdev_metaslab_set_size(rvd->vdev_child[c]); vdev_expand(rvd->vdev_child[c], txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (spa_version(spa) >= SPA_VERSION_FEATURES) spa_feature_create_zap_objects(spa, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY) spa_history_create_obj(spa, tx); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(spa, props, tx); } dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* * We explicitly wait for the first transaction to complete so that our * bean counters are appropriately updated. */ txg_wait_synced(spa->spa_dsl_pool, txg); spa_config_sync(spa, B_FALSE, B_TRUE); if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); spa_history_log_version(spa, LOG_POOL_CREATE); spa->spa_minref = refcount_count(&spa->spa_refcount); mutex_exit(&spa_namespace_lock); return (0); } #ifdef _KERNEL #if defined(sun) /* * Get the root pool information from the root disk, then import the root pool * during the system boot up time. */ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); static nvlist_t * spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) { nvlist_t *config; nvlist_t *nvtop, *nvroot; uint64_t pgid; if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) return (NULL); /* * Add this top-level vdev to the child array. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); /* * Put this pool's top-level vdevs into a root vdev. */ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &nvtop, 1) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); return (config); } /* * Walk the vdev tree and see if we can find a device with "better" * configuration. A configuration is "better" if the label on that * device has a more recent txg. */ static void spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) { for (int c = 0; c < vd->vdev_children; c++) spa_alt_rootvdev(vd->vdev_child[c], avd, txg); if (vd->vdev_ops->vdev_op_leaf) { nvlist_t *label; uint64_t label_txg; if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, &label) != 0) return; VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg) == 0); /* * Do we have a better boot device? */ if (label_txg > *txg) { *txg = label_txg; *avd = vd; } nvlist_free(label); } } /* * Import a root pool. * * For x86. devpath_list will consist of devid and/or physpath name of * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). * The GRUB "findroot" command will return the vdev we should boot. * * For Sparc, devpath_list consists the physpath name of the booting device * no matter the rootpool is a single device pool or a mirrored pool. * e.g. * "/pci@1f,0/ide@d/disk@0,0:a" */ int spa_import_rootpool(char *devpath, char *devid) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; nvlist_t *config, *nvtop; uint64_t guid, txg; char *pname; int error; /* * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(devpath, devid, &guid); #if defined(_OBP) && defined(_KERNEL) if (config == NULL) { if (strstr(devpath, "/iscsi/ssd") != NULL) { /* iscsi boot */ get_iscsi_bootpath_phy(devpath); config = spa_generate_rootconf(devpath, devid, &guid); } } #endif if (config == NULL) { cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); return (EIO); } VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pname)) != NULL) { /* * Remove the existing root pool from the namespace so that we * can replace it with the correct config we just read in. */ spa_remove(spa); } spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", pname); return (error); } /* * Get the boot vdev. */ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", (u_longlong_t)guid); error = ENOENT; goto out; } /* * Determine if there is a better boot device. */ avd = bvd; spa_alt_rootvdev(rvd, &avd, &txg); if (avd != bvd) { cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " "try booting from '%s'", avd->vdev_path); error = EINVAL; goto out; } /* * If the boot device is part of a spare vdev then ensure that * we're booting off the active spare. */ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && !bvd->vdev_isspare) { cmn_err(CE_NOTE, "The boot device is currently spared. Please " "try booting from '%s'", bvd->vdev_parent-> vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); error = EINVAL; goto out; } error = 0; spa_history_log_version(spa, LOG_POOL_IMPORT); out: spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); nvlist_free(config); return (error); } #else extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, uint64_t *count); static nvlist_t * spa_generate_rootconf(const char *name) { nvlist_t **configs, **tops; nvlist_t *config; nvlist_t *best_cfg, *nvtop, *nvroot; uint64_t *holes; uint64_t best_txg; uint64_t nchildren; uint64_t pgid; uint64_t count; uint64_t i; uint_t nholes; if (vdev_geom_read_pool_label(name, &configs, &count) != 0) return (NULL); ASSERT3U(count, !=, 0); best_txg = 0; for (i = 0; i < count; i++) { uint64_t txg; VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if (txg > best_txg) { best_txg = txg; best_cfg = configs[i]; } } /* * Multi-vdev root pool configuration discovery is not supported yet. */ nchildren = 1; nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); holes = NULL; nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, &holes, &nholes); tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); for (i = 0; i < nchildren; i++) { if (i >= count) break; if (configs[i] == NULL) continue; VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); nvlist_dup(nvtop, &tops[i], KM_SLEEP); } for (i = 0; holes != NULL && i < nholes; i++) { if (i >= nchildren) continue; if (tops[holes[i]] != NULL) continue; nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, holes[i]) == 0); VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 0) == 0); } for (i = 0; i < nchildren; i++) { if (tops[i] != NULL) continue; nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, VDEV_TYPE_MISSING) == 0); VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, i) == 0); VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 0) == 0); } /* * Create pool config based on the best vdev config. */ nvlist_dup(best_cfg, &config, KM_SLEEP); /* * Put this pool's top-level vdevs into a root vdev. */ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) == 0); VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, tops, nchildren) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); /* * Drop vdev config elements that should not be present at pool level. */ nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); for (i = 0; i < count; i++) nvlist_free(configs[i]); kmem_free(configs, count * sizeof(void *)); for (i = 0; i < nchildren; i++) nvlist_free(tops[i]); kmem_free(tops, nchildren * sizeof(void *)); nvlist_free(nvroot); return (config); } int spa_import_rootpool(const char *name) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; nvlist_t *config, *nvtop; uint64_t txg; char *pname; int error; /* * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(name); mutex_enter(&spa_namespace_lock); if (config != NULL) { VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0 && strcmp(name, pname) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if ((spa = spa_lookup(pname)) != NULL) { /* * Remove the existing root pool from the namespace so * that we can replace it with the correct config * we just read in. */ spa_remove(spa); } spa = spa_add(pname, config, NULL); /* * Set spa_ubsync.ub_version as it can be used in vdev_alloc() * via spa_version(). */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; } else if ((spa = spa_lookup(name)) == NULL) { cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", name); return (EIO); } else { VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); } spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", pname); return (error); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); nvlist_free(config); return (0); } #endif /* sun */ #endif /* * Import a non-root pool into the system. */ int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_rewind_policy_t policy; uint64_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (EEXIST); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = FREAD; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_config_sync(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); spa_history_log_version(spa, LOG_POOL_IMPORT); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_rewind_policy(config, &policy); if (policy.zrp_request & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; /* * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig * because the user-supplied config is actually the one to trust when * doing an import. */ if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, policy.zrp_request); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_L2CACHE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); mutex_exit(&spa_namespace_lock); spa_history_log_version(spa, LOG_POOL_IMPORT); #ifdef __FreeBSD__ #ifdef _KERNEL zvol_create_minors(pool); #endif #endif return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; char *poolname; spa_t *spa; uint64_t state; int error; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, FREAD); /* * Pass off the heavy lifting to spa_load(). * Pass TRUE for mosconfig because the user-supplied config * is actually the one to trust when doing an import. */ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname) == 0); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & FWRITE)) return (EROFS); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (ENOENT); } /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); /* * The pool will be in core if it's openable, * in which case we can modify its state. */ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { /* * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. */ txg_wait_synced(spa->spa_dsl_pool, 0); /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (EBUSY); } /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (EXDEV); } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); return (0); } /* * Destroy a storage pool. */ int spa_destroy(char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg, id; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * Transfer each new top-level vdev from vd to rvd. */ for (int c = 0; c < vd->vdev_children; c++) { /* * Set the vdev id to the first hole, if one exists. */ for (id = 0; id < rvd->vdev_children; id++) { if (rvd->vdev_child[id]->vdev_ishole) { vdev_free(rvd->vdev_child[id]); break; } } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. If the path specifies * a device that is not mirrored, we automatically insert the mirror vdev. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * Spares can't replace logs */ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) sprintf(oldvd->vdev_path, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* mark the device being resilvered */ newvd->vdev_resilvering = B_TRUE; /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Restart the resilver */ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); if (spa->spa_bootfs) spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); return (0); } /* * Detach a device from a mirror or replacing vdev. * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; - uint64_t unspare_guid; + uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a spare, then it implies * that the spare should become a real disk, and be removed from the * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0 && pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); cvd->vdev_resilvering = B_FALSE; } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_offline_log(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || vd->vdev_ishole) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = EINVAL; break; } } /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = EINVAL; break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = ENODEV; break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || vml[c]->vdev_ishole || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = EINVAL; break; } if (vdev_dtl_required(vml[c])) { error = EBUSY; break; } /* we need certain info from the top level */ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift) == 0); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children) == 0); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl) == 0); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)) == 0); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); #ifndef sun /* mark that we are creating new spa by splitting */ newspa->spa_splitting_newspa = B_TRUE; #endif /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); #ifndef sun newspa->spa_splitting_newspa = B_FALSE; #endif if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { VERIFY(nvlist_alloc(&newspa->spa_config_splitting, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL) { vdev_split(vml[c]); if (error == 0) spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, "split new pool %s from pool %s", newname, spa_name(spa)); kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } static nvlist_t * spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) { for (int i = 0; i < count; i++) { uint64_t guid; VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (guid == target_guid) return (nvpp[i]); } return (NULL); } static void spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; if (count > 1) newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); for (int i = 0, j = 0; i < count; i++) { if (dev[i] == dev_to_remove) continue; VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); } VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); for (int i = 0; i < count - 1; i++) nvlist_free(newdev[i]); if (count > 1) kmem_free(newdev, (count - 1) * sizeof (void *)); } /* * Evacuate the device. */ static int spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) { uint64_t txg; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); ASSERT(vd == vd->vdev_top); /* * Evacuate the device. We don't hold the config lock as writer * since we need to do I/O but we do keep the * spa_namespace_lock held. Once this completes the device * should no longer have any blocks allocated on it. */ if (vd->vdev_islog) { if (vd->vdev_stat.vs_alloc != 0) error = spa_offline_log(spa); } else { error = ENOTSUP; } if (error) return (error); /* * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ ASSERT0(vd->vdev_stat.vs_alloc); txg = spa_vdev_config_enter(spa); vd->vdev_removing = B_TRUE; vdev_dirty(vd, 0, NULL, txg); vdev_config_dirty(vd); spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); return (0); } /* * Complete the removal by cleaning up the namespace. */ static void spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; uint64_t id = vd->vdev_id; boolean_t last_vdev = (id == (rvd->vdev_children - 1)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vd == vd->vdev_top); /* * Only remove any devices which are empty. */ if (vd->vdev_stat.vs_alloc != 0) return; (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) vdev_state_clean(vd); if (list_link_active(&vd->vdev_config_dirty_node)) vdev_config_clean(vd); vdev_free(vd); if (last_vdev) { vdev_compact_children(rvd); } else { vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); vdev_add_child(rvd, vd); } vdev_config_dirty(rvd); /* * Reassess the health of our root vdev. */ vdev_reopen(rvd); } /* * Remove a device from the pool - * * Removing a device from the vdev namespace requires several steps * and can take a significant amount of time. As a result we use * the spa_vdev_config_[enter/exit] functions which allow us to * grab and release the spa_config_lock while still holding the namespace * lock. During each step the configuration is synced out. */ /* * Remove a device from the pool. Currently, this supports removing only hot * spares, slogs, and level 2 ARC devices. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; metaslab_group_t *mg; nvlist_t **spares, **l2cache, *nv; uint64_t txg = 0; uint_t nspares, nl2cache; int error = 0; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); ASSERT(spa_writeable(spa)); if (!locked) txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_spares.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { /* * Only remove the hot spare if it's not currently in use * in this pool. */ if (vd == NULL || unspare) { spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } else { error = EBUSY; } } else if (spa->spa_l2cache.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { /* * Cache devices can always be removed. */ spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } else if (vd != NULL && vd->vdev_islog) { ASSERT(!locked); ASSERT(vd == vd->vdev_top); /* * XXX - Once we have bp-rewrite this should * become the common case. */ mg = vd->vdev_mg; /* * Stop allocating from this vdev. */ metaslab_group_passivate(mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * Attempt to evacuate the vdev. */ error = spa_vdev_remove_evacuate(spa, vd); txg = spa_vdev_config_enter(spa); /* * If we couldn't evacuate the vdev, unwind. */ if (error) { metaslab_group_activate(mg); return (spa_vdev_exit(spa, NULL, txg, error)); } /* * Clean up the vdev namespace. */ spa_vdev_remove_from_namespace(spa, vd); } else if (vd != NULL) { /* * Normal vdevs cannot be removed (yet). */ error = ENOTSUP; } else { /* * There is no vdev of any kind with the specified guid. */ error = ENOENT; } if (!locked) return (spa_vdev_exit(spa, NULL, txg, error)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * current spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. */ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); } /* * Update the stored path or FRU for this vdev. */ int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (EBUSY); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (ENOTSUP); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } return (dsl_scan(spa->spa_dsl_pool, func)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } for (int c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { sysevent_id_t eid; nvlist_t *attr; char *physpath; if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); nvlist_free(attr); kmem_free(physpath, MAXPATHLEN); } static void spa_async_thread(void *arg) { spa_t *spa = arg; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks = 0; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(LOG_POOL_VDEV_ONLINE, spa, NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), new_space, new_space - old_space); } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE) spa_vdev_resilver_done(spa); /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) dsl_resilver_restart(spa->spa_dsl_pool, 0); /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa->spa_async_tasks && !spa->spa_async_suspended && spa->spa_async_thread == NULL && rootdir != NULL && !vn_is_readonly(rootdir)) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, tx); return (0); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *zio = arg; zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), zio->io_flags)); return (0); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dbuf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = kmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); kmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (sav->sav_count == 0) { VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; if (list_is_empty(&spa->spa_config_dirty_list)) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); if (spa->spa_config_syncing) nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; uint64_t version = *(uint64_t *)arg2; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(version <= SPA_VERSION); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); } /* * Set zpool properties. */ static void spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) { spa_t *spa = arg1; objset_t *mos = spa->spa_meta_objset; nvlist_t *nvp = arg2; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; char *strval, *fname; zpool_prop_t prop; const char *propname; zprop_type_t proptype; zfeature_info_t *feature; switch (prop = zpool_name_to_prop(nvpair_name(elem))) { case ZPROP_INVAL: /* * We checked this earlier in spa_prop_validate(). */ ASSERT(zpool_prop_feature(nvpair_name(elem))); fname = strchr(nvpair_name(elem), '@') + 1; VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature)); spa_feature_enable(spa, feature, tx); break; case ZPOOL_PROP_VERSION: VERIFY(nvpair_value_uint64(elem, &intval) == 0); /* * The version is synced seperatly before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persisitent * properties. */ break; case ZPOOL_PROP_COMMENT: VERIFY(nvpair_value_string(elem, &strval) == 0); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. It's unnecessary * to do this for pool creation since the vdev's * configuratoin has already been dirtied. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); VERIFY(nvpair_value_string(elem, &strval) == 0); VERIFY(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx) == 0); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { VERIFY(nvpair_value_uint64(elem, &intval) == 0); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY(zpool_prop_index_to_string( prop, intval, &unused) == 0); } VERIFY(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx) == 0); } else { ASSERT(0); /* not allowed */ } switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_DEDUPDITTO: spa->spa_dedup_ditto = intval; break; default: break; } } /* log internal history if this is not a zpool create */ if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && tx->tx_txg != TXG_INITIAL) { spa_history_log_internal(LOG_POOL_PROPSET, spa, tx, "%s %lld %s", nvpair_name(elem), intval, spa_name(spa)); } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { dsl_pool_t *dp = spa->spa_dsl_pool; ASSERT(spa->spa_sync_pass == 1); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; int error; VERIFY(spa_writeable(spa)); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while (list_head(&spa->spa_state_dirty_list) != NULL) { /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. */ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); tx = dmu_tx_create_assigned(dp, txg); /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY(0 == zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } /* * If anything has changed in this txg, or if someone is waiting * for this txg to sync (eg, spa_vdev_remove()), push the * deferred frees from the previous txg. If not, leave them * alone so that we don't generate work on an otherwise idle * system. */ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || !txg_list_empty(&dp->dp_dirty_dirs, txg) || !txg_list_empty(&dp->dp_sync_tasks, txg) || ((dsl_scan_active(dp->dp_scan) || txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(defer_bpo, spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } /* * Iterate to convergence. */ do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(free_bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } else { bplist_iterate(free_bpl, bpobj_enqueue_cb, defer_bpo, tx); } ddt_sync(spa, txg); dsl_scan_sync(dp, tx); while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) vdev_sync(vd, txg); if (pass == 1) spa_sync_upgrades(spa, tx); } while (dmu_objset_is_dirty(mos, txg)); /* * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few * random top-level vdevs that are known to be visible in the * config cache (see spa_vdev_add() for a complete description). * If there *are* dirty vdevs, sync the uberblock to all vdevs. */ for (;;) { /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_DVAS_PER_BP]; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; if (vd->vdev_ms_array == 0 || vd->vdev_islog) continue; svd[svdcount++] = vd; if (svdcount == SPA_DVAS_PER_BP) break; } error = vdev_config_sync(svd, svdcount, txg, B_FALSE); if (error != 0) error = vdev_config_sync(svd, svdcount, txg, B_TRUE); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg, B_FALSE); if (error != 0) error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg, B_TRUE); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL); zio_resume_wait(spa); } dmu_tx_commit(tx); /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } spa->spa_ubsync = spa->spa_uberblock; dsl_pool_sync_done(dp, txg); /* * Update usable space statistics. */ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) vdev_sync_done(vd, txg); spa_update_dspace(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); spa->spa_sync_pass = 0; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. */ ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); ASSERT(version >= spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &spareguid) == 0 && spareguid == guid) return (B_TRUE); } return (B_FALSE); } /* * Check if a pool has an active shared spare device. * Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } /* * Post a sysevent corresponding to the given event. The 'name' must be one of * the event definitions in sys/sysevent/eventdefs.h. The payload will be * filled in from the spa and (optionally) the vdev. This doesn't do anything * in the userland libzpool, as we don't want consumers to misinterpret ztest * or zdb as real changes. */ void spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) { #ifdef _KERNEL sysevent_t *ev; sysevent_attr_list_t *attr = NULL; sysevent_value_t value; sysevent_id_t eid; ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", SE_SLEEP); value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = spa_name(spa); if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) goto done; value.value_type = SE_DATA_TYPE_UINT64; value.value.sv_uint64 = spa_guid(spa); if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) goto done; if (vd) { value.value_type = SE_DATA_TYPE_UINT64; value.value.sv_uint64 = vd->vdev_guid; if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, SE_SLEEP) != 0) goto done; if (vd->vdev_path) { value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = vd->vdev_path; if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, &value, SE_SLEEP) != 0) goto done; } } if (sysevent_attach_attributes(ev, attr) != 0) goto done; attr = NULL; (void) log_sysevent(ev, SE_SLEEP, &eid); done: if (attr) sysevent_free_attr(attr); sysevent_free(ev); #endif } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c (revision 247192) @@ -1,2172 +1,2173 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include /* * Virtual device vector for RAID-Z. * * This vdev supports single, double, and triple parity. For single parity, * we use a simple XOR of all the data columns. For double or triple parity, * we use a special case of Reed-Solomon coding. This extends the * technique described in "The mathematics of RAID-6" by H. Peter Anvin by * drawing on the system described in "A Tutorial on Reed-Solomon Coding for * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the * former is also based. The latter is designed to provide higher performance * for writes. * * Note that the Plank paper claimed to support arbitrary N+M, but was then * amended six years later identifying a critical flaw that invalidates its * claims. Nevertheless, the technique can be adapted to work for up to * triple parity. For additional parity, the amendment "Note: Correction to * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding * is viable, but the additional complexity means that write performance will * suffer. * * All of the methods above operate on a Galois field, defined over the * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements * can be expressed with a single byte. Briefly, the operations on the * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B * o multiplication of A by 2 is defined by the following bitwise expression: * (A * 2)_7 = A_6 * (A * 2)_6 = A_5 * (A * 2)_5 = A_4 * (A * 2)_4 = A_3 + A_7 * (A * 2)_3 = A_2 + A_7 * (A * 2)_2 = A_1 + A_7 * (A * 2)_1 = A_0 * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). * As an aside, this multiplication is derived from the error correcting * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field. We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather * than field addition). The inverse of a field element A (A^-1) is therefore * A ^ (255 - 1) = A^254. * * The up-to-three parity columns, P, Q, R over several data columns, * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival * XOR operation, and 2 and 4 can be computed quickly and generate linearly- * independent coefficients. (There are no additional coefficients that have * this property which is why the uncorrected Plank method breaks down.) * * See the reconstruction code below for how P, Q and R can used individually * or in concert to recover missing data columns. */ typedef struct raidz_col { uint64_t rc_devidx; /* child device index for I/O */ uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ void *rc_data; /* I/O data */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ } raidz_col_t; typedef struct raidz_map { uint64_t rm_cols; /* Regular column count */ uint64_t rm_scols; /* Count including skipped columns */ uint64_t rm_bigcols; /* Number of oversized columns */ uint64_t rm_asize; /* Actual total I/O size */ uint64_t rm_missingdata; /* Count of missing data devices */ uint64_t rm_missingparity; /* Count of missing parity devices */ uint64_t rm_firstdatacol; /* First data column/parity count */ uint64_t rm_nskip; /* Skipped sectors for padding */ uint64_t rm_skipstart; /* Column index of padding start */ void *rm_datacopy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ uint8_t rm_ecksuminjected; /* checksum error was injected */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 #define VDEV_RAIDZ_R 2 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) /* * We provide a mechanism to perform the field multiplication operation on a * 64-bit value all at once rather than a byte at a time. This works by * creating a mask from the top bit in each byte and using that to * conditionally apply the XOR of 0x1d. */ #define VDEV_RAIDZ_64MUL_2(x, mask) \ { \ (mask) = (x) & 0x8080808080808080ULL; \ (mask) = ((mask) << 1) - ((mask) >> 7); \ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ ((mask) & 0x1d1d1d1d1d1d1d1d); \ } #define VDEV_RAIDZ_64MUL_4(x, mask) \ { \ VDEV_RAIDZ_64MUL_2((x), mask); \ VDEV_RAIDZ_64MUL_2((x), mask); \ } /* * Force reconstruction to use the general purpose method. */ int vdev_raidz_default_to_general; /* * These two tables represent powers and logs of 2 in the Galois field defined * above. These values were computed by repeatedly multiplying by 2 as above. */ static const uint8_t vdev_raidz_pow2[256] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 }; static const uint8_t vdev_raidz_log2[256] = { 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, }; static void vdev_raidz_generate_parity(raidz_map_t *rm); /* * Multiply a given number by 2 raised to the given power. */ static uint8_t vdev_raidz_exp2(uint_t a, int exp) { if (a == 0) return (0); ASSERT(exp >= 0); ASSERT(vdev_raidz_log2[a] > 0 || a == 1); exp += vdev_raidz_log2[a]; if (exp > 255) exp -= 255; return (vdev_raidz_pow2[exp]); } static void vdev_raidz_map_free(raidz_map_t *rm) { int c; size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { if (rm->rm_col[c].rc_data != NULL) zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, rm->rm_col[c].rc_size); } size = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; if (rm->rm_datacopy != NULL) zio_buf_free(rm->rm_datacopy, size); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } static void vdev_raidz_map_free_vsd(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; ASSERT0(rm->rm_freed); rm->rm_freed = 1; if (rm->rm_reports == 0) vdev_raidz_map_free(rm); } /*ARGSUSED*/ static void vdev_raidz_cksum_free(void *arg, size_t ignored) { raidz_map_t *rm = arg; ASSERT3U(rm->rm_reports, >, 0); if (--rm->rm_reports == 0 && rm->rm_freed != 0) vdev_raidz_map_free(rm); } static void vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) { raidz_map_t *rm = zcr->zcr_cbdata; size_t c = zcr->zcr_cbinfo; size_t x; const char *good = NULL; const char *bad = rm->rm_col[c].rc_data; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); return; } if (c < rm->rm_firstdatacol) { /* * The first time through, calculate the parity blocks for * the good data (this relies on the fact that the good * data never changes for a given logical ZIO) */ if (rm->rm_col[0].rc_gdata == NULL) { char *bad_parity[VDEV_RAIDZ_MAXPARITY]; char *buf; /* * Set up the rm_col[]s to generate the parity for * good_data, first saving the parity bufs and * replacing them with buffers to hold the result. */ for (x = 0; x < rm->rm_firstdatacol; x++) { bad_parity[x] = rm->rm_col[x].rc_data; rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = zio_buf_alloc(rm->rm_col[x].rc_size); } /* fill in the data columns from good_data */ buf = (char *)good_data; for (; x < rm->rm_cols; x++) { rm->rm_col[x].rc_data = buf; buf += rm->rm_col[x].rc_size; } /* * Construct the parity from the good data. */ vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ for (x = 0; x < rm->rm_firstdatacol; x++) rm->rm_col[x].rc_data = bad_parity[x]; buf = rm->rm_datacopy; for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { rm->rm_col[x].rc_data = buf; buf += rm->rm_col[x].rc_size; } } ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); good = rm->rm_col[c].rc_gdata; } else { /* adjust good_data to point at the start of our column */ good = good_data; for (x = rm->rm_firstdatacol; x < c; x++) good += rm->rm_col[x].rc_size; } /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); } /* * Invoked indirectly by zfs_ereport_start_checksum(), called * below when our read operation fails completely. The main point * is to keep a copy of everything we read from disk, so that at * vdev_raidz_cksum_finish() time we can compare it with the good data. */ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; caddr_t buf; raidz_map_t *rm = zio->io_vsd; size_t size; /* set up the report and bump the refcount */ zcr->zcr_cbdata = rm; zcr->zcr_cbinfo = c; zcr->zcr_finish = vdev_raidz_cksum_finish; zcr->zcr_free = vdev_raidz_cksum_free; rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); if (rm->rm_datacopy != NULL) return; /* * It's the first time we're called for this raidz_map_t, so we need * to copy the data aside; there's no guarantee that our zio's buffer * won't be re-used for something else. * * Our parity data is already in separate buffers, so there's no need * to copy them. */ size = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; buf = rm->rm_datacopy = zio_buf_alloc(size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; bcopy(col->rc_data, buf, col->rc_size); col->rc_data = buf; buf += col->rc_size; } ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { vdev_raidz_map_free_vsd, vdev_raidz_cksum_report }; static raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; uint64_t b = zio->io_offset >> unit_shift; uint64_t s = zio->io_size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; q = s / (dcols - nparity); r = s - q * (dcols - nparity); bc = (r == 0 ? 0 : r + nparity); tot = s + nparity * (q + (r == 0 ? 0 : 1)); if (q == 0) { acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { acols = dcols; scols = dcols; } ASSERT3U(acols, <=, scols); rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); rm->rm_cols = acols; rm->rm_scols = scols; rm->rm_bigcols = bc; rm->rm_skipstart = bc; rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; rm->rm_datacopy = NULL; rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; asize = 0; for (c = 0; c < scols; c++) { col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << unit_shift; } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; rm->rm_col[c].rc_data = NULL; rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_skipped = 0; if (c >= acols) rm->rm_col[c].rc_size = 0; else if (c < bc) rm->rm_col[c].rc_size = (q + 1) << unit_shift; else rm->rm_col[c].rc_size = q << unit_shift; asize += rm->rm_col[c].rc_size; } ASSERT3U(asize, ==, tot << unit_shift); rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); ASSERT3U(rm->rm_nskip, <=, nparity); if (zio->io_type != ZIO_TYPE_FREE) { for (c = 0; c < rm->rm_firstdatacol; c++) { rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); } rm->rm_col[c].rc_data = zio->io_data; for (c = c + 1; c < acols; c++) { rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + rm->rm_col[c - 1].rc_size; } } /* * If all data stored spans all columns, there's a danger that parity * will always be on the same device and, since parity isn't read * during normal operation, that that device's I/O bandwidth won't be * used effectively. We therefore switch the parity every 1MB. * * ... at least that was, ostensibly, the theory. As a practical * matter unless we juggle the parity between all devices evenly, we * won't see any benefit. Further, occasional writes that aren't a * multiple of the LCM of the number of children and the minimum * stripe width are sufficient to avoid pessimal behavior. * Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. * * If we intend to skip a sector in the zeroth column for padding * we must make sure to note this swap. We will never intend to * skip the first column since at least one data and one parity * column must appear in each row. */ ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; rm->rm_col[1].rc_devidx = devidx; rm->rm_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { uint64_t *p, *src, pcount, ccount, i; int c; pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; ccount = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { ASSERT(ccount == pcount); for (i = 0; i < ccount; i++, src++, p++) { *p = *src; } } else { ASSERT(ccount <= pcount); for (i = 0; i < ccount; i++, src++, p++) { *p ^= *src; } } } } static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { uint64_t *p, *q, *src, pcnt, ccnt, mask, i; int c; pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); for (i = 0; i < ccnt; i++, src++, p++, q++) { *p = *src; *q = *src; } for (; i < pcnt; i++, src++, p++, q++) { *p = 0; *q = 0; } } else { ASSERT(ccnt <= pcnt); /* * Apply the algorithm described above by multiplying * the previous result and adding in the new value. */ for (i = 0; i < ccnt; i++, src++, p++, q++) { *p ^= *src; VDEV_RAIDZ_64MUL_2(*q, mask); *q ^= *src; } /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ for (; i < pcnt; i++, q++) { VDEV_RAIDZ_64MUL_2(*q, mask); } } } } static void vdev_raidz_generate_parity_pqr(raidz_map_t *rm) { uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; int c; pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_R].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; r = rm->rm_col[VDEV_RAIDZ_R].rc_data; ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { *p = *src; *q = *src; *r = *src; } for (; i < pcnt; i++, src++, p++, q++, r++) { *p = 0; *q = 0; *r = 0; } } else { ASSERT(ccnt <= pcnt); /* * Apply the algorithm described above by multiplying * the previous result and adding in the new value. */ for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { *p ^= *src; VDEV_RAIDZ_64MUL_2(*q, mask); *q ^= *src; VDEV_RAIDZ_64MUL_4(*r, mask); *r ^= *src; } /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ for (; i < pcnt; i++, q++, r++) { VDEV_RAIDZ_64MUL_2(*q, mask); VDEV_RAIDZ_64MUL_4(*r, mask); } } } } /* * Generate RAID parity in the first virtual columns according to the number of * parity columns available. */ static void vdev_raidz_generate_parity(raidz_map_t *rm) { switch (rm->rm_firstdatacol) { case 1: vdev_raidz_generate_parity_p(rm); break; case 2: vdev_raidz_generate_parity_pq(rm); break; case 3: vdev_raidz_generate_parity_pqr(rm); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); } } static int vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, i; int x = tgts[0]; int c; ASSERT(ntgts == 1); ASSERT(x >= rm->rm_firstdatacol); ASSERT(x < rm->rm_cols); xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); ASSERT(xcount > 0); src = rm->rm_col[VDEV_RAIDZ_P].rc_data; dst = rm->rm_col[x].rc_data; for (i = 0; i < xcount; i++, dst++, src++) { *dst = *src; } for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; dst = rm->rm_col[x].rc_data; if (c == x) continue; ccount = rm->rm_col[c].rc_size / sizeof (src[0]); count = MIN(ccount, xcount); for (i = 0; i < count; i++, dst++, src++) { *dst ^= *src; } } return (1 << VDEV_RAIDZ_P); } static int vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, mask, i; uint8_t *b; int x = tgts[0]; int c, j, exp; ASSERT(ntgts == 1); xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; dst = rm->rm_col[x].rc_data; if (c == x) ccount = 0; else ccount = rm->rm_col[c].rc_size / sizeof (src[0]); count = MIN(ccount, xcount); if (c == rm->rm_firstdatacol) { for (i = 0; i < count; i++, dst++, src++) { *dst = *src; } for (; i < xcount; i++, dst++) { *dst = 0; } } else { for (i = 0; i < count; i++, dst++, src++) { VDEV_RAIDZ_64MUL_2(*dst, mask); *dst ^= *src; } for (; i < xcount; i++, dst++) { VDEV_RAIDZ_64MUL_2(*dst, mask); } } } src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; dst = rm->rm_col[x].rc_data; exp = 255 - (rm->rm_cols - 1 - x); for (i = 0; i < xcount; i++, dst++, src++) { *dst ^= *src; for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { *b = vdev_raidz_exp2(*b, exp); } } return (1 << VDEV_RAIDZ_Q); } static int vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; void *pdata, *qdata; uint64_t xsize, ysize, i; int x = tgts[0]; int y = tgts[1]; ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rm->rm_firstdatacol); ASSERT(y < rm->rm_cols); ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as * though columns x and y were full of zeros -- Pxy and Qxy. We want to * reuse the parity generation mechanism without trashing the actual * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. */ pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; xsize = rm->rm_col[x].rc_size; ysize = rm->rm_col[y].rc_size; rm->rm_col[VDEV_RAIDZ_P].rc_data = zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); rm->rm_col[VDEV_RAIDZ_Q].rc_data = zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); rm->rm_col[x].rc_size = 0; rm->rm_col[y].rc_size = 0; vdev_raidz_generate_parity_pq(rm); rm->rm_col[x].rc_size = xsize; rm->rm_col[y].rc_size = ysize; p = pdata; q = qdata; pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; xd = rm->rm_col[x].rc_data; yd = rm->rm_col[y].rc_data; /* * We now have: * Pxy = P + D_x + D_y * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y * * We can then solve for D_x: * D_x = A * (P + Pxy) + B * (Q + Qxy) * where * A = 2^(x - y) * (2^(x - y) + 1)^-1 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 * * With D_x in hand, we can easily solve for D_y: * D_y = P + Pxy + D_x */ a = vdev_raidz_pow2[255 + x - y]; b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ vdev_raidz_exp2(*q ^ *qxy, bexp); if (i < ysize) *yd = *p ^ *pxy ^ *xd; } zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, rm->rm_col[VDEV_RAIDZ_P].rc_size); zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, rm->rm_col[VDEV_RAIDZ_Q].rc_size); /* * Restore the saved parity data. */ rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } /* BEGIN CSTYLED */ /* * In the general case of reconstruction, we must solve the system of linear * equations defined by the coeffecients used to generate parity as well as * the contents of the data and parity disks. This can be expressed with * vectors for the original data (D) and the actual data (d) and parity (p) * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): * * __ __ __ __ * | | __ __ | p_0 | * | V | | D_0 | | p_m-1 | * | | x | : | = | d_0 | * | I | | D_n-1 | | : | * | | ~~ ~~ | d_n-1 | * ~~ ~~ ~~ ~~ * * I is simply a square identity matrix of size n, and V is a vandermonde * matrix defined by the coeffecients we chose for the various parity columns * (1, 2, 4). Note that these values were chosen both for simplicity, speedy * computation as well as linear separability. * * __ __ __ __ * | 1 .. 1 1 1 | | p_0 | * | 2^n-1 .. 4 2 1 | __ __ | : | * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | * | 1 .. 0 0 0 | | D_1 | | d_0 | * | 0 .. 0 0 0 | x | D_2 | = | d_1 | * | : : : : | | : | | d_2 | * | 0 .. 1 0 0 | | D_n-1 | | : | * | 0 .. 0 1 0 | ~~ ~~ | : | * | 0 .. 0 0 1 | | d_n-1 | * ~~ ~~ ~~ ~~ * * Note that I, V, d, and p are known. To compute D, we must invert the * matrix and use the known data and parity values to reconstruct the unknown * data values. We begin by removing the rows in V|I and d|p that correspond * to failed or missing columns; we then make V|I square (n x n) and d|p * sized n by removing rows corresponding to unused parity from the bottom up * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' * using Gauss-Jordan elimination. In the example below we use m=3 parity * columns, n=8 data columns, with errors in d_1, d_2, and p_1: * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks * | 19 205 116 29 64 16 4 1 | / / * | 1 0 0 0 0 0 0 0 | / / * | 0 1 0 0 0 0 0 0 | <--' / * (V|I) = | 0 0 1 0 0 0 0 0 | <---' * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 | * (V|I)' = | 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We * have carefully chosen the seed values 1, 2, and 4 to ensure that this * matrix is not singular. * __ __ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 0 0 1 0 0 0 0 0 | * | 167 100 5 41 159 169 217 208 | * | 166 100 4 40 158 168 216 209 | * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values * of the missing data. * * As is apparent from the example above, the only non-trivial rows in the * inverse matrix correspond to the data disks that we're trying to * reconstruct. Indeed, those are the only rows we need as the others would * only be useful for reconstructing data known or assumed to be valid. For * that reason, we only build the coefficients in the rows that correspond to * targeted columns. */ /* END CSTYLED */ static void vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); /* * Fill in the missing rows of interest. */ for (i = 0; i < nmap; i++) { ASSERT3S(0, <=, map[i]); ASSERT3S(map[i], <=, 2); pow = map[i] * n; if (pow > 255) pow -= 255; ASSERT(pow <= 255); for (j = 0; j < n; j++) { pow -= map[i]; if (pow < 0) pow += 255; rows[i][j] = vdev_raidz_pow2[pow]; } } } static void vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; uint8_t log; /* * Assert that the first nmissing entries from the array of used * columns correspond to parity columns and that subsequent entries * correspond to data columns. */ for (i = 0; i < nmissing; i++) { ASSERT3S(used[i], <, rm->rm_firstdatacol); } for (; i < n; i++) { ASSERT3S(used[i], >=, rm->rm_firstdatacol); } /* * First initialize the storage where we'll compute the inverse rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { invrows[i][j] = (i == j) ? 1 : 0; } } /* * Subtract all trivial rows from the rows of consequence. */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { ASSERT3U(used[j], >=, rm->rm_firstdatacol); jj = used[j] - rm->rm_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; } } /* * For each of the rows of interest, we must normalize it and subtract * a multiple of it from the other rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < missing[i]; j++) { ASSERT0(rows[i][j]); } ASSERT3U(rows[i][missing[i]], !=, 0); /* * Compute the inverse of the first element and multiply each * element in the row by that value. */ log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; for (j = 0; j < n; j++) { rows[i][j] = vdev_raidz_exp2(rows[i][j], log); invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); } for (ii = 0; ii < nmissing; ii++) { if (i == ii) continue; ASSERT3U(rows[ii][missing[i]], !=, 0); log = vdev_raidz_log2[rows[ii][missing[i]]]; for (j = 0; j < n; j++) { rows[ii][j] ^= vdev_raidz_exp2(rows[i][j], log); invrows[ii][j] ^= vdev_raidz_exp2(invrows[i][j], log); } } } /* * Verify that the data that is left in the rows are properly part of * an identity matrix. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { if (j == missing[i]) { ASSERT3U(rows[i][j], ==, 1); } else { ASSERT0(rows[i][j]); } } } } static void vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; uint8_t *src; uint64_t ccount; uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; - uint8_t log, val; + uint8_t log = 0; + uint8_t val; int ll; uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; psize = sizeof (invlog[0][0]) * n * nmissing; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing; i++) { invlog[i] = pp; pp += n; } for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { ASSERT3U(invrows[i][j], !=, 0); invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; } } for (i = 0; i < n; i++) { c = used[i]; ASSERT3U(c, <, rm->rm_cols); src = rm->rm_col[c].rc_data; ccount = rm->rm_col[c].rc_size; for (j = 0; j < nmissing; j++) { cc = missing[j] + rm->rm_firstdatacol; ASSERT3U(cc, >=, rm->rm_firstdatacol); ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, !=, c); dst[j] = rm->rm_col[cc].rc_data; dcount[j] = rm->rm_col[cc].rc_size; } ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; for (cc = 0; cc < nmissing; cc++) { if (x >= dcount[cc]) continue; if (*src == 0) { val = 0; } else { if ((ll = log + invlog[cc][i]) >= 255) ll -= 255; val = vdev_raidz_pow2[ll]; } if (i == 0) dst[cc][x] = val; else dst[cc][x] ^= val; } } } kmem_free(p, psize); } static int vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; int code = 0; n = rm->rm_cols - rm->rm_firstdatacol; /* * Figure out which data columns are missing. */ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { if (tgts[t] >= rm->rm_firstdatacol) { missing_rows[nmissing_rows++] = tgts[t] - rm->rm_firstdatacol; } } /* * Figure out which parity columns to use to help generate the missing * data columns. */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); ASSERT(c < rm->rm_firstdatacol); /* * Skip any targeted parity columns. */ if (c == tgts[tt]) { tt++; continue; } code |= 1 << c; parity_map[i] = c; i++; } ASSERT(code != 0); ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * nmissing_rows * n + sizeof (used[0]) * n; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing_rows; i++) { rows[i] = pp; pp += n; invrows[i] = pp; pp += n; } used = pp; for (i = 0; i < nmissing_rows; i++) { used[i] = parity_map[i]; } for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { if (tt < nmissing_rows && c == missing_rows[tt] + rm->rm_firstdatacol) { tt++; continue; } ASSERT3S(i, <, n); used[i] = c; i++; } /* * Initialize the interesting rows of the matrix. */ vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. */ vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); return (code); } static int vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; int i, c; int code; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; /* * The tgts list must already be sorted. */ for (i = 1; i < nt; i++) { ASSERT(t[i] > t[i - 1]); } nbadparity = rm->rm_firstdatacol; nbaddata = rm->rm_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rm->rm_cols; c++) { if (c < rm->rm_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; } else if (rm->rm_col[c].rc_error != 0) { tgts[ntgts++] = c; } else if (c >= rm->rm_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; nbadparity--; } } ASSERT(ntgts >= nt); ASSERT(nbaddata >= 0); ASSERT(nbaddata + nbadparity == ntgts); dt = &tgts[nbadparity]; /* * See if we can use any of our optimized reconstruction routines. */ if (!vdev_raidz_default_to_general) { switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) return (vdev_raidz_reconstruct_p(rm, dt, 1)); ASSERT(rm->rm_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) return (vdev_raidz_reconstruct_q(rm, dt, 1)); ASSERT(rm->rm_firstdatacol > 2); break; case 2: ASSERT(rm->rm_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) return (vdev_raidz_reconstruct_pq(rm, dt, 2)); ASSERT(rm->rm_firstdatacol > 2); break; } } code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); return (code); } static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *ashift) { vdev_t *cvd; uint64_t nparity = vd->vdev_nparity; int c; int lasterror = 0; int numerrors = 0; ASSERT(nparity > 0); if (nparity > VDEV_RAIDZ_MAXPARITY || vd->vdev_children < nparity + 1) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; numerrors++; continue; } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *ashift = MAX(*ashift, cvd->vdev_ashift); } *asize *= vd->vdev_children; *max_asize *= vd->vdev_children; if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } return (0); } static void vdev_raidz_close(vdev_t *vd) { int c; for (c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t cols = vd->vdev_children; uint64_t nparity = vd->vdev_nparity; asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; return (asize); } static void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; rc->rc_error = zio->io_error; rc->rc_tried = 1; rc->rc_skipped = 0; } static int vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_t *cvd; raidz_map_t *rm; raidz_col_t *rc; int c, i; rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_FREE) { for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } return (ZIO_PIPELINE_CONTINUE); } if (zio->io_type == ZIO_TYPE_WRITE) { vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } /* * Generate optional I/Os for any skipped sectors to improve * aggregation contiguity. */ for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { ASSERT(c <= rm->rm_scols); if (c == rm->rm_scols) c = 0; rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset + rc->rc_size, NULL, 1 << tvd->vdev_ashift, zio->io_type, zio->io_priority, ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } return (ZIO_PIPELINE_CONTINUE); } ASSERT(zio->io_type == ZIO_TYPE_READ); /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. */ for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_readable(cvd)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else rm->rm_missingparity++; rc->rc_error = ENXIO; rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else rm->rm_missingparity++; rc->rc_error = ESTALE; rc->rc_skipped = 1; continue; } if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } } return (ZIO_PIPELINE_CONTINUE); } /* * Report a checksum error for a child of a RAID-Z device. */ static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; zfs_ereport_post_checksum(zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, &zbc); } } /* * We keep track of whether or not there were any injected errors, so that * any ereports we generate can note it. */ static int raidz_checksum_verify(zio_t *zio) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; int ret = zio_checksum_error(zio, &zbc); if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; return (ret); } /* * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the * number such failures. */ static int raidz_parity_verify(zio_t *zio, raidz_map_t *rm) { void *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; raidz_col_t *rc; for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = zio_buf_alloc(rc->rc_size); bcopy(rc->rc_data, orig[c], rc->rc_size); } vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = ECKSUM; ret++; } zio_buf_free(orig[c], rc->rc_size); } return (ret); } /* * Keep statistics on all the ways that we used parity to correct data. */ static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; static int vdev_raidz_worst_error(raidz_map_t *rm) { int error = 0; for (int c = 0; c < rm->rm_cols; c++) error = zio_worst_error(error, rm->rm_col[c].rc_error); return (error); } /* * Iterate over all combinations of bad data and attempt a reconstruction. * Note that the algorithm below is non-optimal because it doesn't take into * account how reconstruction is actually performed. For example, with * triple-parity RAID-Z the reconstruction procedure is the same if column 4 * is targeted as invalid as if columns 1 and 4 are targeted since in both * cases we'd only use parity information in column 0. */ static int vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) { raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; void *orig[VDEV_RAIDZ_MAXPARITY]; int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *tgts = &tstore[1]; int current, next, i, c, n; int code, ret = 0; ASSERT(total_errors < rm->rm_firstdatacol); /* * This simplifies one edge condition. */ tgts[-1] = -1; for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { /* * Initialize the targets array by finding the first n columns * that contain no error. * * If there were no data errors, we need to ensure that we're * always explicitly attempting to reconstruct at least one * data column. To do this, we simply push the highest target * up into the data columns. */ for (c = 0, i = 0; i < n; i++) { if (i == n - 1 && data_errors == 0 && c < rm->rm_firstdatacol) { c = rm->rm_firstdatacol; } while (rm->rm_col[c].rc_error != 0) { c++; ASSERT3S(c, <, rm->rm_cols); } tgts[i] = c++; } /* * Setting tgts[n] simplifies the other edge condition. */ tgts[n] = rm->rm_cols; /* * These buffers were allocated in previous iterations. */ for (i = 0; i < n - 1; i++) { ASSERT(orig[i] != NULL); } orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); current = 0; next = tgts[current]; while (current != n) { tgts[current] = next; current = 0; /* * Save off the original data that we're going to * attempt to reconstruct. */ for (i = 0; i < n; i++) { ASSERT(orig[i] != NULL); c = tgts[i]; ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; bcopy(rc->rc_data, orig[i], rc->rc_size); } /* * Attempt a reconstruction and exit the outer loop on * success. */ code = vdev_raidz_reconstruct(rm, tgts, n); if (raidz_checksum_verify(zio) == 0) { atomic_inc_64(&raidz_corrected[code]); for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; ASSERT(rc->rc_error == 0); if (rc->rc_tried) raidz_checksum_error(zio, rc, orig[i]); rc->rc_error = ECKSUM; } ret = code; goto done; } /* * Restore the original data. */ for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; bcopy(orig[i], rc->rc_data, rc->rc_size); } do { /* * Find the next valid column after the current * position.. */ for (next = tgts[current] + 1; next < rm->rm_cols && rm->rm_col[next].rc_error != 0; next++) continue; ASSERT(next <= tgts[current + 1]); /* * If that spot is available, we're done here. */ if (next != tgts[current + 1]) break; /* * Otherwise, find the next valid column after * the previous position. */ for (c = tgts[current - 1] + 1; rm->rm_col[c].rc_error != 0; c++) continue; tgts[current] = c; current++; } while (current != n); } } n--; done: for (i = 0; i < n; i++) { zio_buf_free(orig[i], rm->rm_col[0].rc_size); } return (ret); } static void vdev_raidz_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *cvd; raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; int n, c; int tgts[VDEV_RAIDZ_MAXPARITY]; int code; ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; if (rc->rc_error) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ if (c < rm->rm_firstdatacol) parity_errors++; else data_errors++; if (!rc->rc_skipped) unexpected_errors++; total_errors++; } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { parity_untried++; } } if (zio->io_type == ZIO_TYPE_WRITE) { /* * XXX -- for now, treat partial writes as a success. * (If we couldn't write enough columns to reconstruct * the data, the I/O failed. Otherwise, good enough.) * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ /* XXPOLICY */ if (total_errors > rm->rm_firstdatacol) zio->io_error = vdev_raidz_worst_error(rm); return; } else if (zio->io_type == ZIO_TYPE_FREE) { return; } ASSERT(zio->io_type == ZIO_TYPE_READ); /* * There are three potential phases for a read: * 1. produce valid data from the columns read * 2. read all disks and try again * 3. perform combinatorial reconstruction * * Each phase is progressively both more expensive and less likely to * occur. If we encounter more errors than we can repair or all phases * fail, we have no choice but to return an error. */ /* * If the number of errors we saw was correctable -- less than or equal * to the number of parity disks read -- attempt to produce data that * has a valid checksum. Naturally, this case applies in the absence of * any errors. */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { if (data_errors == 0) { if (raidz_checksum_verify(zio) == 0) { /* * If we read parity information (unnecessarily * as it happens since no reconstruction was * needed) regenerate and verify the parity. * We also regenerate parity when resilvering * so we can write it out to the failed device * later. */ if (parity_errors + parity_untried < rm->rm_firstdatacol || (zio->io_flags & ZIO_FLAG_RESILVER)) { n = raidz_parity_verify(zio, rm); unexpected_errors += n; ASSERT(parity_errors + n <= rm->rm_firstdatacol); } goto done; } } else { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we * wouldn't be here in the correctable case. There must * also have been fewer parity errors than parity * columns or, again, we wouldn't be in this code path. */ ASSERT(parity_untried == 0); ASSERT(parity_errors < rm->rm_firstdatacol); /* * Identify the data columns that reported an error. */ n = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; if (rc->rc_error != 0) { ASSERT(n < VDEV_RAIDZ_MAXPARITY); tgts[n++] = c; } } ASSERT(rm->rm_firstdatacol >= n); code = vdev_raidz_reconstruct(rm, tgts, n); if (raidz_checksum_verify(zio) == 0) { atomic_inc_64(&raidz_corrected[code]); /* * If we read more parity disks than were used * for reconstruction, confirm that the other * parity disks produced correct data. This * routine is suboptimal in that it regenerates * the parity that we already used in addition * to the parity that we're attempting to * verify, but this should be a relatively * uncommon case, and can be optimized if it * becomes a problem. Note that we regenerate * parity when resilvering so we can write it * out to failed devices later. */ if (parity_errors < rm->rm_firstdatacol - n || (zio->io_flags & ZIO_FLAG_RESILVER)) { n = raidz_parity_verify(zio, rm); unexpected_errors += n; ASSERT(parity_errors + n <= rm->rm_firstdatacol); } goto done; } } } /* * This isn't a typical situation -- either we got a read error or * a child silently returned bad data. Read every block so we can * try again with as much data and parity as we can track down. If * we've already been through once before, all children will be marked * as tried so we'll proceed to combinatorial reconstruction. */ unexpected_errors = 1; rm->rm_missingdata = 0; rm->rm_missingparity = 0; for (c = 0; c < rm->rm_cols; c++) { if (rm->rm_col[c].rc_tried) continue; zio_vdev_io_redone(zio); do { rc = &rm->rm_col[c]; if (rc->rc_tried) continue; zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], rc->rc_offset, rc->rc_data, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } while (++c < rm->rm_cols); return; } /* * At this point we've attempted to reconstruct the data given the * errors we detected, and we've attempted to read all columns. There * must, therefore, be one or more additional problems -- silent errors * resulting in invalid data rather than explicit I/O errors resulting * in absent data. We check if there is enough additional data to * possibly reconstruct the data and then perform combinatorial * reconstruction over all possible combinations. If that fails, * we're cooked. */ if (total_errors > rm->rm_firstdatacol) { zio->io_error = vdev_raidz_worst_error(rm); } else if (total_errors < rm->rm_firstdatacol && (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { /* * If we didn't use all the available parity for the * combinatorial reconstruction, verify that the remaining * parity is correct. */ if (code != (1 << rm->rm_firstdatacol) - 1) (void) raidz_parity_verify(zio, rm); } else { /* * We're here because either: * * total_errors == rm_first_datacol, or * vdev_raidz_combrec() failed * * In either case, there is enough bad data to prevent * reconstruction. * * Start checksum ereports for all children which haven't * failed, and the IO wasn't speculative. */ zio->io_error = ECKSUM; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; if (rc->rc_error == 0) { zio_bad_cksum_t zbc; zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; zfs_ereport_start_checksum( zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, rc->rc_offset, rc->rc_size, (void *)(uintptr_t)c, &zbc); } } } } done: zio_checksum_verified(zio); if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. */ for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; if (rc->rc_error == 0) continue; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted > vd->vdev_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } vdev_ops_t vdev_raidz_ops = { vdev_raidz_open, vdev_raidz_close, vdev_raidz_asize, vdev_raidz_io_start, vdev_raidz_io_done, vdev_raidz_state_change, NULL, NULL, VDEV_TYPE_RAIDZ, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c (revision 247192) @@ -1,872 +1,872 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * The 512-byte leaf is broken into 32 16-byte chunks. * chunk number n means l_chunk[n], even though the header precedes it. * the names are stored null-terminated. */ #include #include #include #include #include #include #include #include #include static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ /* half the (current) minimum block size */ #define MAX_ARRAY_BYTES (8<<10) #define LEAF_HASH(l, h) \ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len))) #define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)]) static void zap_memset(void *a, int c, size_t n) { char *cp = a; char *cpend = cp + n; while (cp < cpend) *cp++ = c; } static void stv(int len, void *addr, uint64_t value) { switch (len) { case 1: *(uint8_t *)addr = value; return; case 2: *(uint16_t *)addr = value; return; case 4: *(uint32_t *)addr = value; return; case 8: *(uint64_t *)addr = value; return; } ASSERT(!"bad int len"); } static uint64_t ldv(int len, const void *addr) { switch (len) { case 1: return (*(uint8_t *)addr); case 2: return (*(uint16_t *)addr); case 4: return (*(uint32_t *)addr); case 8: return (*(uint64_t *)addr); } ASSERT(!"bad int len"); return (0xFEEDFACEDEADBEEFULL); } void zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) { int i; zap_leaf_t l; l.l_bs = highbit(size)-1; l.l_phys = buf; buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; switch (lc->l_free.lf_type) { case ZAP_CHUNK_ENTRY: le = &lc->l_entry; le->le_type = BSWAP_8(le->le_type); le->le_value_intlen = BSWAP_8(le->le_value_intlen); le->le_next = BSWAP_16(le->le_next); le->le_name_chunk = BSWAP_16(le->le_name_chunk); le->le_name_numints = BSWAP_16(le->le_name_numints); le->le_value_chunk = BSWAP_16(le->le_value_chunk); le->le_value_numints = BSWAP_16(le->le_value_numints); le->le_cd = BSWAP_32(le->le_cd); le->le_hash = BSWAP_64(le->le_hash); break; case ZAP_CHUNK_FREE: lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next); break; case ZAP_CHUNK_ARRAY: lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); /* la_array doesn't need swapping */ break; default: ASSERT(!"bad leaf type"); } } } void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { int i; l->l_bs = highbit(l->l_dbuf->db_size)-1; zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header)); zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; l->l_phys->l_hdr.lh_block_type = ZBT_LEAF; l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC; l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); if (sort) l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; } /* * Routines which manipulate leaf chunks (l_chunk[]). */ static uint16_t zap_leaf_chunk_alloc(zap_leaf_t *l) { int chunk; ASSERT(l->l_phys->l_hdr.lh_nfree > 0); chunk = l->l_phys->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; l->l_phys->l_hdr.lh_nfree--; return (chunk); } static void zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) { struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); zlf->lf_type = ZAP_CHUNK_FREE; zlf->lf_next = l->l_phys->l_hdr.lh_freelist; bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ l->l_phys->l_hdr.lh_freelist = chunk; l->l_phys->l_hdr.lh_nfree++; } /* * Routines which manipulate leaf arrays (zap_leaf_array type chunks). */ static uint16_t zap_leaf_array_create(zap_leaf_t *l, const char *buf, int integer_size, int num_integers) { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; int byten = 0; - uint64_t value; + uint64_t value = 0; int shift = (integer_size-1)*8; int len = num_integers; ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int i; la->la_type = ZAP_CHUNK_ARRAY; for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { if (byten == 0) value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; if (++byten == integer_size) { byten = 0; buf += integer_size; if (--len == 0) break; } } *chunkp = chunk; chunkp = &la->la_next; } *chunkp = CHAIN_END; return (chunk_head); } static void zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) { uint16_t chunk = *chunkp; *chunkp = CHAIN_END; while (chunk != CHAIN_END) { int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); chunk = nextchunk; } } /* array_len and buf_len are in integers, not bytes */ static void zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, void *buf) { int len = MIN(array_len, buf_len); int byten = 0; uint64_t value = 0; char *p = buf; ASSERT3U(array_int_len, <=, buf_int_len); /* Fast path for one 8-byte integer */ if (array_int_len == 8 && buf_int_len == 8 && len == 1) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; uint8_t *ip = la->la_array; uint64_t *buf64 = buf; *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; return; } /* Fast path for an array of 1-byte integers (eg. the entry name) */ if (array_int_len == 1 && buf_int_len == 1 && buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { while (chunk != CHAIN_END) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); p += ZAP_LEAF_ARRAY_BYTES; chunk = la->la_next; } return; } while (len > 0) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int i; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { stv(buf_int_len, p, value); byten = 0; len--; if (len == 0) return; p += buf_int_len; } } chunk = la->la_next; } } static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_numints) { int bseen = 0; if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { uint64_t *thiskey; boolean_t match; ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); thiskey = kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP); zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, sizeof (*thiskey), array_numints, thiskey); match = bcmp(thiskey, zn->zn_key_orig, array_numints * sizeof (*thiskey)) == 0; kmem_free(thiskey, array_numints * sizeof (*thiskey)); return (match); } ASSERT(zn->zn_key_intlen == 1); if (zn->zn_matchtype == MT_FIRST) { char *thisname = kmem_alloc(array_numints, KM_SLEEP); boolean_t match; zap_leaf_array_read(l, chunk, sizeof (char), array_numints, sizeof (char), array_numints, thisname); match = zap_match(zn, thisname); kmem_free(thisname, array_numints); return (match); } /* * Fast path for exact matching. * First check that the lengths match, so that we don't read * past the end of the zn_key_orig array. */ if (array_numints != zn->zn_key_orig_numints) return (B_FALSE); while (bseen < array_numints) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) break; chunk = la->la_next; bseen += toread; } return (bseen == array_numints); } /* * Routines which manipulate leaf entries. */ int zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) { uint16_t *chunkp; struct zap_leaf_entry *le; ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); again: for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); *chunkp != CHAIN_END; chunkp = &le->le_next) { uint16_t chunk = *chunkp; le = ZAP_LEAF_ENTRY(l, chunk); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (le->le_hash != zn->zn_hash) continue; /* * NB: the entry chain is always sorted by cd on * normalized zap objects, so this will find the * lowest-cd match for MT_FIRST. */ ASSERT(zn->zn_matchtype == MT_EXACT || (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); if (zap_leaf_array_match(l, zn, le->le_name_chunk, le->le_name_numints)) { zeh->zeh_num_integers = le->le_value_numints; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; zeh->zeh_leaf = l; return (0); } } /* * NB: we could of course do this in one pass, but that would be * a pain. We'll see if MT_BEST is even used much. */ if (zn->zn_matchtype == MT_BEST) { zn->zn_matchtype = MT_FIRST; goto again; } return (ENOENT); } /* Return (h1,cd1 >= h2,cd2) */ #define HCD_GTEQ(h1, cd1, h2, cd2) \ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE)) int zap_leaf_lookup_closest(zap_leaf_t *l, uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) { uint16_t chunk; uint64_t besth = -1ULL; uint32_t bestcd = -1U; uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; uint16_t lh; struct zap_leaf_entry *le; ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { for (chunk = l->l_phys->l_hash[lh]; chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { ASSERT3U(bestlh, >=, lh); bestlh = lh; besth = le->le_hash; bestcd = le->le_cd; zeh->zeh_num_integers = le->le_value_numints; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_fakechunk = chunk; zeh->zeh_chunkp = &zeh->zeh_fakechunk; zeh->zeh_leaf = l; } } } return (bestcd == -1U ? ENOENT : 0); } int zap_entry_read(const zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, void *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (le->le_value_intlen > integer_size) return (EINVAL); zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_value_intlen, le->le_value_numints, integer_size, num_integers, buf); if (zeh->zeh_num_integers > num_integers) return (EOVERFLOW); return (0); } int zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, le->le_name_numints, 8, buflen / 8, buf); } else { zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, le->le_name_numints, 1, buflen, buf); } if (le->le_name_numints > buflen) return (EOVERFLOW); return (0); } int zap_entry_update(zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, const void *buf) { int delta_chunks; zap_leaf_t *l = zeh->zeh_leaf; struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) return (EAGAIN); zap_leaf_array_free(l, &le->le_value_chunk); le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); le->le_value_numints = num_integers; le->le_value_intlen = integer_size; return (0); } void zap_entry_remove(zap_entry_handle_t *zeh) { uint16_t entry_chunk; struct zap_leaf_entry *le; zap_leaf_t *l = zeh->zeh_leaf; ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); entry_chunk = *zeh->zeh_chunkp; le = ZAP_LEAF_ENTRY(l, entry_chunk); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); zap_leaf_array_free(l, &le->le_name_chunk); zap_leaf_array_free(l, &le->le_value_chunk); *zeh->zeh_chunkp = le->le_next; zap_leaf_chunk_free(l, entry_chunk); l->l_phys->l_hdr.lh_nentries--; } int zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint8_t integer_size, uint64_t num_integers, const void *buf, zap_entry_handle_t *zeh) { uint16_t chunk; uint16_t *chunkp; struct zap_leaf_entry *le; uint64_t valuelen; int numchunks; uint64_t h = zn->zn_hash; valuelen = integer_size * num_integers; numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (E2BIG); if (cd == ZAP_NEED_CD) { /* find the lowest unused cd */ if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { cd = 0; for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); if (le->le_cd > cd) break; if (le->le_hash == h) { ASSERT3U(cd, ==, le->le_cd); cd++; } } } else { /* old unsorted format; do it the O(n^2) way */ for (cd = 0; ; cd++) { for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); if (le->le_hash == h && le->le_cd == cd) { break; } } /* If this cd is not in use, we are good. */ if (chunk == CHAIN_END) break; } } /* * We would run out of space in a block before we could * store enough entries to run out of CD values. */ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); } if (l->l_phys->l_hdr.lh_nfree < numchunks) return (EAGAIN); /* make the entry */ chunk = zap_leaf_chunk_alloc(l); le = ZAP_LEAF_ENTRY(l, chunk); le->le_type = ZAP_CHUNK_ENTRY; le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, zn->zn_key_intlen, zn->zn_key_orig_numints); le->le_name_numints = zn->zn_key_orig_numints; le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); le->le_value_numints = num_integers; le->le_value_intlen = integer_size; le->le_hash = h; le->le_cd = cd; /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ chunkp = zap_leaf_rehash_entry(l, chunk); l->l_phys->l_hdr.lh_nentries++; zeh->zeh_leaf = l; zeh->zeh_num_integers = num_integers; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; return (0); } /* * Determine if there is another entry with the same normalized form. * For performance purposes, either zn or name must be provided (the * other can be NULL). Note, there usually won't be any hash * conflicts, in which case we don't need the concatenated/normalized * form of the name. But all callers have one of these on hand anyway, * so might as well take advantage. A cleaner but slower interface * would accept neither argument, and compute the normalized name as * needed (using zap_name_alloc(zap_entry_read_name(zeh))). */ boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, const char *name, zap_t *zap) { uint64_t chunk; struct zap_leaf_entry *le; boolean_t allocdzn = B_FALSE; if (zap->zap_normflags == 0) return (B_FALSE); for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); if (le->le_hash != zeh->zeh_hash) continue; if (le->le_cd == zeh->zeh_cd) continue; if (zn == NULL) { zn = zap_name_alloc(zap, name, MT_FIRST); allocdzn = B_TRUE; } if (zap_leaf_array_match(zeh->zeh_leaf, zn, le->le_name_chunk, le->le_name_numints)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for transferring entries between leafs. */ static uint16_t * zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; /* * keep the entry chain sorted by cd * NB: this will not cause problems for unsorted leafs, though * it is unnecessary there. */ for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); *chunkp != CHAIN_END; chunkp = &le2->le_next) { le2 = ZAP_LEAF_ENTRY(l, *chunkp); if (le2->le_cd > le->le_cd) break; } le->le_next = *chunkp; *chunkp = entry; return (chunkp); } static uint16_t zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) { uint16_t new_chunk; uint16_t *nchunkp = &new_chunk; while (chunk != CHAIN_END) { uint16_t nchunk = zap_leaf_chunk_alloc(nl); struct zap_leaf_array *nla = &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); *nla = *la; /* structure assignment */ zap_leaf_chunk_free(l, chunk); chunk = nextchunk; *nchunkp = nchunk; nchunkp = &nla->la_next; } *nchunkp = CHAIN_END; return (new_chunk); } static void zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) { struct zap_leaf_entry *le, *nle; uint16_t chunk; le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); chunk = zap_leaf_chunk_alloc(nl); nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ (void) zap_leaf_rehash_entry(nl, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = zap_leaf_transfer_array(l, le->le_value_chunk, nl); zap_leaf_chunk_free(l, entry); l->l_phys->l_hdr.lh_nentries--; nl->l_phys->l_hdr.lh_nentries++; } /* * Transfer the entries whose hash prefix ends in 1 to the new leaf. */ void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { int i; int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ l->l_phys->l_hdr.lh_prefix <<= 1; l->l_phys->l_hdr.lh_prefix_len++; nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1; nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len; /* break existing hash chains */ zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; /* * Transfer entries whose hash bit 'bit' is set to nl; rehash * the remaining entries * * NB: We could find entries via the hashtable instead. That * would be O(hashents+numents) rather than O(numblks+numents), * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. */ for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else (void) zap_leaf_rehash_entry(l, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { int i, n; n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->l_phys->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; n = l->l_phys->l_hdr.lh_nentries/5; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_with_n5_entries[n]++; n = ((1<l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / (1<zs_blocks_n_tenths_full[n]++; for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { int nentries = 0; int chunk = l->l_phys->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, chunk); n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_entries_using_n_chunks[n]++; chunk = le->le_next; nentries++; } n = nentries; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_buckets_with_n_entries[n]++; } } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c (revision 247192) @@ -1,199 +1,199 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include void zfs_oldace_byteswap(ace_t *ace, int ace_cnt) { int i; for (i = 0; i != ace_cnt; i++, ace++) { ace->a_who = BSWAP_32(ace->a_who); ace->a_access_mask = BSWAP_32(ace->a_access_mask); ace->a_flags = BSWAP_16(ace->a_flags); ace->a_type = BSWAP_16(ace->a_type); } } /* * swap ace_t and ace_oject_t */ void zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) { caddr_t end; caddr_t ptr; - zfs_ace_t *zacep; + zfs_ace_t *zacep = NULL; ace_t *acep; uint16_t entry_type; size_t entry_size; int ace_type; end = (caddr_t)buf + size; ptr = buf; while (ptr < end) { if (zfs_layout) { /* * Avoid overrun. Embedded aces can have one * of several sizes. We don't know exactly * how many our present, only the size of the * buffer containing them. That size may be * larger than needed to hold the aces * present. As long as we do not do any * swapping beyond the end of our block we are * okay. It it safe to swap any non-ace data * within the block since it is just zeros. */ if (ptr + sizeof (zfs_ace_hdr_t) > end) { break; } zacep = (zfs_ace_t *)ptr; zacep->z_hdr.z_access_mask = BSWAP_32(zacep->z_hdr.z_access_mask); zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags); ace_type = zacep->z_hdr.z_type = BSWAP_16(zacep->z_hdr.z_type); entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; } else { /* Overrun avoidance */ if (ptr + sizeof (ace_t) > end) { break; } acep = (ace_t *)ptr; acep->a_access_mask = BSWAP_32(acep->a_access_mask); acep->a_flags = BSWAP_16(acep->a_flags); ace_type = acep->a_type = BSWAP_16(acep->a_type); acep->a_who = BSWAP_32(acep->a_who); entry_type = acep->a_flags & ACE_TYPE_FLAGS; } switch (entry_type) { case ACE_OWNER: case ACE_EVERYONE: case (ACE_IDENTIFIER_GROUP | ACE_GROUP): entry_size = zfs_layout ? sizeof (zfs_ace_hdr_t) : sizeof (ace_t); break; case ACE_IDENTIFIER_GROUP: default: /* Overrun avoidance */ if (zfs_layout) { if (ptr + sizeof (zfs_ace_t) <= end) { zacep->z_fuid = BSWAP_64(zacep->z_fuid); } else { entry_size = sizeof (zfs_ace_t); break; } } switch (ace_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: entry_size = zfs_layout ? sizeof (zfs_object_ace_t) : sizeof (ace_object_t); break; default: entry_size = zfs_layout ? sizeof (zfs_ace_t) : sizeof (ace_t); break; } } ptr = ptr + entry_size; } } /* ARGSUSED */ void zfs_oldacl_byteswap(void *buf, size_t size) { int cnt; /* * Arggh, since we don't know how many ACEs are in * the array, we have to swap the entire block */ cnt = size / sizeof (ace_t); zfs_oldace_byteswap((ace_t *)buf, cnt); } /* ARGSUSED */ void zfs_acl_byteswap(void *buf, size_t size) { zfs_ace_byteswap(buf, size, B_TRUE); } void zfs_znode_byteswap(void *buf, size_t size) { znode_phys_t *zp = buf; ASSERT(size >= sizeof (znode_phys_t)); zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]); zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]); zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]); zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]); zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]); zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]); zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]); zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]); zp->zp_gen = BSWAP_64(zp->zp_gen); zp->zp_mode = BSWAP_64(zp->zp_mode); zp->zp_size = BSWAP_64(zp->zp_size); zp->zp_parent = BSWAP_64(zp->zp_parent); zp->zp_links = BSWAP_64(zp->zp_links); zp->zp_xattr = BSWAP_64(zp->zp_xattr); zp->zp_rdev = BSWAP_64(zp->zp_rdev); zp->zp_flags = BSWAP_64(zp->zp_flags); zp->zp_uid = BSWAP_64(zp->zp_uid); zp->zp_gid = BSWAP_64(zp->zp_gid); zp->zp_zap = BSWAP_64(zp->zp_zap); zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size); zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count); if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], ZFS_ACE_SPACE); } else { zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT); } } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c (revision 247192) @@ -1,764 +1,767 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #endif #include /* * FUID Domain table(s). * * The FUID table is stored as a packed nvlist of an array * of nvlists which contain an index, domain string and offset * * During file system initialization the nvlist(s) are read and * two AVL trees are created. One tree is keyed by the index number * and the other by the domain string. Nodes are never removed from * trees, but new entries may be added. If a new entry is added then * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then * be responsible for calling zfs_fuid_sync() to sync the changes to disk. * */ #define FUID_IDX "fuid_idx" #define FUID_DOMAIN "fuid_domain" #define FUID_OFFSET "fuid_offset" #define FUID_NVP_ARRAY "fuid_nvlist" typedef struct fuid_domain { avl_node_t f_domnode; avl_node_t f_idxnode; ksiddomain_t *f_ksid; uint64_t f_idx; } fuid_domain_t; static char *nulldomain = ""; /* * Compare two indexes. */ static int idx_compare(const void *arg1, const void *arg2) { const fuid_domain_t *node1 = arg1; const fuid_domain_t *node2 = arg2; if (node1->f_idx < node2->f_idx) return (-1); else if (node1->f_idx > node2->f_idx) return (1); return (0); } /* * Compare two domain strings. */ static int domain_compare(const void *arg1, const void *arg2) { const fuid_domain_t *node1 = arg1; const fuid_domain_t *node2 = arg2; int val; val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); if (val == 0) return (0); return (val > 0 ? 1 : -1); } void zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree) { avl_create(idx_tree, idx_compare, sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); avl_create(domain_tree, domain_compare, sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); } /* * load initial fuid domain and idx trees. This function is used by * both the kernel and zdb. */ uint64_t zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, avl_tree_t *domain_tree) { dmu_buf_t *db; uint64_t fuid_size; ASSERT(fuid_obj != 0); VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); fuid_size = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); if (fuid_size) { nvlist_t **fuidnvp; nvlist_t *nvp = NULL; uint_t count; char *packed; int i; packed = kmem_alloc(fuid_size, KM_SLEEP); VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed, DMU_READ_PREFETCH) == 0); VERIFY(nvlist_unpack(packed, fuid_size, &nvp, 0) == 0); VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, &fuidnvp, &count) == 0); for (i = 0; i != count; i++) { fuid_domain_t *domnode; char *domain; uint64_t idx; VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, &domain) == 0); VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, &idx) == 0); domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); domnode->f_idx = idx; domnode->f_ksid = ksid_lookupdomain(domain); avl_add(idx_tree, domnode); avl_add(domain_tree, domnode); } nvlist_free(nvp); kmem_free(packed, fuid_size); } return (fuid_size); } void zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) { fuid_domain_t *domnode; void *cookie; cookie = NULL; while (domnode = avl_destroy_nodes(domain_tree, &cookie)) ksiddomain_rele(domnode->f_ksid); avl_destroy(domain_tree); cookie = NULL; while (domnode = avl_destroy_nodes(idx_tree, &cookie)) kmem_free(domnode, sizeof (fuid_domain_t)); avl_destroy(idx_tree); } char * zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) { fuid_domain_t searchnode, *findnode; avl_index_t loc; searchnode.f_idx = idx; findnode = avl_find(idx_tree, &searchnode, &loc); return (findnode ? findnode->f_ksid->kd_name : nulldomain); } #ifdef _KERNEL /* * Load the fuid table(s) into memory. */ static void zfs_fuid_init(zfsvfs_t *zfsvfs) { rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); if (zfsvfs->z_fuid_loaded) { rw_exit(&zfsvfs->z_fuid_lock); return; } zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (zfsvfs->z_fuid_obj != 0) { zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); } zfsvfs->z_fuid_loaded = B_TRUE; rw_exit(&zfsvfs->z_fuid_lock); } /* * sync out AVL trees to persistent storage. */ void zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { nvlist_t *nvp; nvlist_t **fuids; size_t nvsize = 0; char *packed; dmu_buf_t *db; fuid_domain_t *domnode; int numnodes; int i; if (!zfsvfs->z_fuid_dirty) { return; } rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); /* * First see if table needs to be created? */ if (zfsvfs->z_fuid_obj == 0) { zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, sizeof (uint64_t), tx); VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, sizeof (uint64_t), 1, &zfsvfs->z_fuid_obj, tx) == 0); } VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, domnode->f_idx) == 0); VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, domnode->f_ksid->kd_name) == 0); } VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, fuids, numnodes) == 0); for (i = 0; i != numnodes; i++) nvlist_free(fuids[i]); kmem_free(fuids, numnodes * sizeof (void *)); VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); packed = kmem_alloc(nvsize, KM_SLEEP); VERIFY(nvlist_pack(nvp, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); nvlist_free(nvp); zfsvfs->z_fuid_size = nvsize; dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, zfsvfs->z_fuid_size, packed, tx); kmem_free(packed, zfsvfs->z_fuid_size); VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; dmu_buf_rele(db, FTAG); zfsvfs->z_fuid_dirty = B_FALSE; rw_exit(&zfsvfs->z_fuid_lock); } /* * Query domain table for a given domain. * * If domain isn't found and addok is set, it is added to AVL trees and * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be * necessary for the caller or another thread to detect the dirty table * and sync out the changes. */ int zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, boolean_t addok) { fuid_domain_t searchnode, *findnode; avl_index_t loc; krw_t rw = RW_READER; /* * If the dummy "nobody" domain then return an index of 0 * to cause the created FUID to be a standard POSIX id * for the user nobody. */ if (domain[0] == '\0') { if (retdomain) *retdomain = nulldomain; return (0); } searchnode.f_ksid = ksid_lookupdomain(domain); if (retdomain) *retdomain = searchnode.f_ksid->kd_name; if (!zfsvfs->z_fuid_loaded) zfs_fuid_init(zfsvfs); retry: rw_enter(&zfsvfs->z_fuid_lock, rw); findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc); if (findnode) { rw_exit(&zfsvfs->z_fuid_lock); ksiddomain_rele(searchnode.f_ksid); return (findnode->f_idx); } else if (addok) { fuid_domain_t *domnode; uint64_t retidx; if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { rw_exit(&zfsvfs->z_fuid_lock); rw = RW_WRITER; goto retry; } domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); domnode->f_ksid = searchnode.f_ksid; retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1; avl_add(&zfsvfs->z_fuid_domain, domnode); avl_add(&zfsvfs->z_fuid_idx, domnode); zfsvfs->z_fuid_dirty = B_TRUE; rw_exit(&zfsvfs->z_fuid_lock); return (retidx); } else { rw_exit(&zfsvfs->z_fuid_lock); return (-1); } } /* * Query domain table by index, returning domain string * * Returns a pointer from an avl node of the domain string. * */ const char * zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) { char *domain; if (idx == 0 || !zfsvfs->z_use_fuids) return (NULL); if (!zfsvfs->z_fuid_loaded) zfs_fuid_init(zfsvfs); rw_enter(&zfsvfs->z_fuid_lock, RW_READER); if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty) domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); else domain = nulldomain; rw_exit(&zfsvfs->z_fuid_lock); ASSERT(domain); return (domain); } void zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) { *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP); } uid_t zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, cred_t *cr, zfs_fuid_type_t type) { uint32_t index = FUID_INDEX(fuid); const char *domain; uid_t id; if (index == 0) return (fuid); domain = zfs_fuid_find_by_idx(zfsvfs, index); ASSERT(domain != NULL); #ifdef sun if (type == ZFS_OWNER || type == ZFS_ACE_USER) { (void) kidmap_getuidbysid(crgetzone(cr), domain, FUID_RID(fuid), &id); } else { (void) kidmap_getgidbysid(crgetzone(cr), domain, FUID_RID(fuid), &id); } #else /* !sun */ id = UID_NOBODY; #endif /* !sun */ return (id); } /* * Add a FUID node to the list of fuid's being created for this * ACL * * If ACL has multiple domains, then keep only one copy of each unique * domain. */ void zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, uint64_t idx, uint64_t id, zfs_fuid_type_t type) { zfs_fuid_t *fuid; zfs_fuid_domain_t *fuid_domain; zfs_fuid_info_t *fuidp; uint64_t fuididx; boolean_t found = B_FALSE; if (*fuidpp == NULL) *fuidpp = zfs_fuid_info_alloc(); fuidp = *fuidpp; /* * First find fuid domain index in linked list * * If one isn't found then create an entry. */ for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); fuid_domain; fuid_domain = list_next(&fuidp->z_domains, fuid_domain), fuididx++) { if (idx == fuid_domain->z_domidx) { found = B_TRUE; break; } } if (!found) { fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); fuid_domain->z_domain = domain; fuid_domain->z_domidx = idx; list_insert_tail(&fuidp->z_domains, fuid_domain); fuidp->z_domain_str_sz += strlen(domain) + 1; fuidp->z_domain_cnt++; } if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { /* * Now allocate fuid entry and add it on the end of the list */ fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); fuid->z_id = id; fuid->z_domidx = idx; fuid->z_logfuid = FUID_ENCODE(fuididx, rid); list_insert_tail(&fuidp->z_fuids, fuid); fuidp->z_fuid_cnt++; } else { if (type == ZFS_OWNER) fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); else fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); } } /* * Create a file system FUID, based on information in the users cred * * If cred contains KSID_OWNER then it should be used to determine * the uid otherwise cred's uid will be used. By default cred's gid * is used unless it's an ephemeral ID in which case KSID_GROUP will * be used if it exists. */ uint64_t zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, cred_t *cr, zfs_fuid_info_t **fuidp) { uint64_t idx; ksid_t *ksid; uint32_t rid; char *kdomain; const char *domain; uid_t id; VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP); if (!zfsvfs->z_use_fuids || (ksid == NULL)) { id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); if (IS_EPHEMERAL(id)) return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); return ((uint64_t)id); } /* * ksid is present and FUID is supported */ id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr); if (!IS_EPHEMERAL(id)) return ((uint64_t)id); if (type == ZFS_GROUP) id = ksid_getid(ksid); rid = ksid_getrid(ksid); domain = ksid_getdomain(ksid); idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); return (FUID_ENCODE(idx, rid)); } /* * Create a file system FUID for an ACL ace * or a chown/chgrp of the file. * This is similar to zfs_fuid_create_cred, except that * we can't find the domain + rid information in the * cred. Instead we have to query Winchester for the * domain and rid. * * During replay operations the domain+rid information is * found in the zfs_fuid_info_t that the replay code has * attached to the zfsvfs of the file system. */ uint64_t zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) { const char *domain; char *kdomain; uint32_t fuid_idx = FUID_INDEX(id); uint32_t rid; idmap_stat status; - uint64_t idx; + uint64_t idx = 0; zfs_fuid_t *zfuid = NULL; - zfs_fuid_info_t *fuidp; + zfs_fuid_info_t *fuidp = NULL; /* * If POSIX ID, or entry is already a FUID then * just return the id * * We may also be handed an already FUID'ized id via * chmod. */ if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) return (id); if (zfsvfs->z_replay) { fuidp = zfsvfs->z_fuid_replay; /* * If we are passed an ephemeral id, but no * fuid_info was logged then return NOBODY. * This is most likely a result of idmap service * not being available. */ if (fuidp == NULL) return (UID_NOBODY); + VERIFY3U(type, >=, ZFS_OWNER); + VERIFY3U(type, <=, ZFS_ACE_GROUP); + switch (type) { case ZFS_ACE_USER: case ZFS_ACE_GROUP: zfuid = list_head(&fuidp->z_fuids); rid = FUID_RID(zfuid->z_logfuid); idx = FUID_INDEX(zfuid->z_logfuid); break; case ZFS_OWNER: rid = FUID_RID(fuidp->z_fuid_owner); idx = FUID_INDEX(fuidp->z_fuid_owner); break; case ZFS_GROUP: rid = FUID_RID(fuidp->z_fuid_group); idx = FUID_INDEX(fuidp->z_fuid_group); break; }; - domain = fuidp->z_domain_table[idx -1]; + domain = fuidp->z_domain_table[idx - 1]; } else { if (type == ZFS_OWNER || type == ZFS_ACE_USER) status = kidmap_getsidbyuid(crgetzone(cr), id, &domain, &rid); else status = kidmap_getsidbygid(crgetzone(cr), id, &domain, &rid); if (status != 0) { /* * When returning nobody we will need to * make a dummy fuid table entry for logging * purposes. */ rid = UID_NOBODY; domain = nulldomain; } } idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); if (!zfsvfs->z_replay) zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); else if (zfuid != NULL) { list_remove(&fuidp->z_fuids, zfuid); kmem_free(zfuid, sizeof (zfs_fuid_t)); } return (FUID_ENCODE(idx, rid)); } void zfs_fuid_destroy(zfsvfs_t *zfsvfs) { rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); if (!zfsvfs->z_fuid_loaded) { rw_exit(&zfsvfs->z_fuid_lock); return; } zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); rw_exit(&zfsvfs->z_fuid_lock); } /* * Allocate zfs_fuid_info for tracking FUIDs created during * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() */ zfs_fuid_info_t * zfs_fuid_info_alloc(void) { zfs_fuid_info_t *fuidp; fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), offsetof(zfs_fuid_domain_t, z_next)); list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), offsetof(zfs_fuid_t, z_next)); return (fuidp); } /* * Release all memory associated with zfs_fuid_info_t */ void zfs_fuid_info_free(zfs_fuid_info_t *fuidp) { zfs_fuid_t *zfuid; zfs_fuid_domain_t *zdomain; while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { list_remove(&fuidp->z_fuids, zfuid); kmem_free(zfuid, sizeof (zfs_fuid_t)); } if (fuidp->z_domain_table != NULL) kmem_free(fuidp->z_domain_table, (sizeof (char **)) * fuidp->z_domain_cnt); while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { list_remove(&fuidp->z_domains, zdomain); kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); } kmem_free(fuidp, sizeof (zfs_fuid_info_t)); } /* * Check to see if id is a groupmember. If cred * has ksid info then sidlist is checked first * and if still not found then POSIX groups are checked * * Will use a straight FUID compare when possible. */ boolean_t zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) { #ifdef sun ksid_t *ksid = crgetsid(cr, KSID_GROUP); ksidlist_t *ksidlist = crgetsidlist(cr); #endif /* !sun */ uid_t gid; #ifdef sun if (ksid && ksidlist) { int i; ksid_t *ksid_groups; uint32_t idx = FUID_INDEX(id); uint32_t rid = FUID_RID(id); ksid_groups = ksidlist->ksl_sids; for (i = 0; i != ksidlist->ksl_nsid; i++) { if (idx == 0) { if (id != IDMAP_WK_CREATOR_GROUP_GID && id == ksid_groups[i].ks_id) { return (B_TRUE); } } else { const char *domain; domain = zfs_fuid_find_by_idx(zfsvfs, idx); ASSERT(domain != NULL); if (strcmp(domain, IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) return (B_FALSE); if ((strcmp(domain, ksid_groups[i].ks_domain->kd_name) == 0) && rid == ksid_groups[i].ks_rid) return (B_TRUE); } } } #endif /* !sun */ /* * Not found in ksidlist, check posix groups */ gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); return (groupmember(gid, cr)); } void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { if (zfsvfs->z_fuid_obj == 0) { dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, FUID_SIZE_ESTIMATE(zfsvfs)); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); } else { dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, FUID_SIZE_ESTIMATE(zfsvfs)); } } #endif Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c (revision 247192) @@ -1,681 +1,680 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * These zfs_log_* functions must be called within a dmu tx, in one * of 2 contexts depending on zilog->z_replay: * * Non replay mode * --------------- * We need to record the transaction so that if it is committed to * the Intent Log then it can be replayed. An intent log transaction * structure (itx_t) is allocated and all the information necessary to * possibly replay the transaction is saved in it. The itx is then assigned * a sequence number and inserted in the in-memory list anchored in the zilog. * * Replay mode * ----------- * We need to mark the intent log record as replayed in the log header. * This is done in the same transaction as the replay so that they * commit atomically. */ int zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) { int isxvattr = (vap->va_mask & AT_XVATTR); switch (type) { case Z_FILE: if (vsecp == NULL && !isxvattr) return (TX_CREATE); if (vsecp && isxvattr) #ifdef TODO return (TX_CREATE_ACL_ATTR); #else panic("%s:%u: unsupported condition", __func__, __LINE__); #endif if (vsecp) return (TX_CREATE_ACL); else return (TX_CREATE_ATTR); /*NOTREACHED*/ case Z_DIR: if (vsecp == NULL && !isxvattr) return (TX_MKDIR); if (vsecp && isxvattr) #ifdef TODO return (TX_MKDIR_ACL_ATTR); #else panic("%s:%u: unsupported condition", __func__, __LINE__); #endif if (vsecp) return (TX_MKDIR_ACL); else return (TX_MKDIR_ATTR); case Z_XATTRDIR: return (TX_MKXATTR); } ASSERT(0); return (TX_MAX_TYPE); } /* * build up the log data necessary for logging xvattr_t * First lr_attr_t is initialized. following the lr_attr_t * is the mapsize and attribute bitmap copied from the xvattr_t. * Following the bitmap and bitmapsize two 64 bit words are reserved * for the create time which may be set. Following the create time * records a single 64 bit integer which has the bits to set on * replay for the xvattr. */ static void zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) { uint32_t *bitmap; uint64_t *attrs; uint64_t *crtime; xoptattr_t *xoap; void *scanstamp; int i; xoap = xva_getxoptattr(xvap); ASSERT(xoap); lrattr->lr_attr_masksize = xvap->xva_mapsize; bitmap = &lrattr->lr_attr_bitmap; for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { *bitmap = xvap->xva_reqattrmap[i]; } /* Now pack the attributes up in a single uint64_t */ attrs = (uint64_t *)bitmap; crtime = attrs + 1; scanstamp = (caddr_t)(crtime + 2); *attrs = 0; if (XVA_ISSET_REQ(xvap, XAT_READONLY)) *attrs |= (xoap->xoa_readonly == 0) ? 0 : XAT0_READONLY; if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) *attrs |= (xoap->xoa_hidden == 0) ? 0 : XAT0_HIDDEN; if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) *attrs |= (xoap->xoa_system == 0) ? 0 : XAT0_SYSTEM; if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) *attrs |= (xoap->xoa_archive == 0) ? 0 : XAT0_ARCHIVE; if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) *attrs |= (xoap->xoa_immutable == 0) ? 0 : XAT0_IMMUTABLE; if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) *attrs |= (xoap->xoa_nounlink == 0) ? 0 : XAT0_NOUNLINK; if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) *attrs |= (xoap->xoa_appendonly == 0) ? 0 : XAT0_APPENDONLY; if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) *attrs |= (xoap->xoa_opaque == 0) ? 0 : XAT0_APPENDONLY; if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) *attrs |= (xoap->xoa_nodump == 0) ? 0 : XAT0_NODUMP; if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : XAT0_AV_QUARANTINED; if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) *attrs |= (xoap->xoa_av_modified == 0) ? 0 : XAT0_AV_MODIFIED; if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) *attrs |= (xoap->xoa_reparse == 0) ? 0 : XAT0_REPARSE; if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) *attrs |= (xoap->xoa_offline == 0) ? 0 : XAT0_OFFLINE; if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) *attrs |= (xoap->xoa_sparse == 0) ? 0 : XAT0_SPARSE; } static void * zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) { zfs_fuid_t *zfuid; uint64_t *fuidloc = start; /* First copy in the ACE FUIDs */ for (zfuid = list_head(&fuidp->z_fuids); zfuid; zfuid = list_next(&fuidp->z_fuids, zfuid)) { *fuidloc++ = zfuid->z_logfuid; } return (fuidloc); } static void * zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) { zfs_fuid_domain_t *zdomain; /* now copy in the domain info, if any */ if (fuidp->z_domain_str_sz != 0) { for (zdomain = list_head(&fuidp->z_domains); zdomain; zdomain = list_next(&fuidp->z_domains, zdomain)) { bcopy((void *)zdomain->z_domain, start, strlen(zdomain->z_domain) + 1); start = (caddr_t)start + strlen(zdomain->z_domain) + 1; } } return (start); } /* * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, * TX_MKDIR_ATTR and TX_MKXATTR * transactions. * * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID * domain information appended prior to the name. In this case the * uid/gid in the log record will be a log centric FUID. * * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that * may contain attributes, ACL and optional fuid information. * * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify * and ACL and normal users/groups in the ACEs. * * There may be an optional xvattr attribute information similar * to zfs_log_setattr. * * Also, after the file name "domain" strings may be appended. */ void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp, vattr_t *vap) { itx_t *itx; lr_create_t *lr; lr_acl_create_t *lracl; - size_t aclsize; + size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0; size_t xvatsize = 0; size_t txsize; xvattr_t *xvap = (xvattr_t *)vap; void *end; size_t lrsize; size_t namesize = strlen(name) + 1; size_t fuidsz = 0; if (zil_replaying(zilog, tx)) return; /* * If we have FUIDs present then add in space for * domains and ACE fuid's if any. */ if (fuidp) { fuidsz += fuidp->z_domain_str_sz; fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); } if (vap->va_mask & AT_XVATTR) xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || (int)txtype == TX_MKXATTR) { txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; lrsize = sizeof (*lr); } else { - aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0; txsize = sizeof (lr_acl_create_t) + namesize + fuidsz + ZIL_ACE_LENGTH(aclsize) + xvatsize; lrsize = sizeof (lr_acl_create_t); } itx = zil_itx_create(txtype, txsize); lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; lr->lr_mode = zp->z_mode; if (!IS_EPHEMERAL(zp->z_uid)) { lr->lr_uid = (uint64_t)zp->z_uid; } else { lr->lr_uid = fuidp->z_fuid_owner; } if (!IS_EPHEMERAL(zp->z_gid)) { lr->lr_gid = (uint64_t)zp->z_gid; } else { lr->lr_gid = fuidp->z_fuid_group; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, sizeof (uint64_t)); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), lr->lr_crtime, sizeof (uint64_t) * 2); if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev, sizeof (lr->lr_rdev)) != 0) lr->lr_rdev = 0; /* * Fill in xvattr info if any */ if (vap->va_mask & AT_XVATTR) { zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); end = (caddr_t)lr + lrsize + xvatsize; } else { end = (caddr_t)lr + lrsize; } /* Now fill in any ACL info */ if (vsecp) { lracl = (lr_acl_create_t *)&itx->itx_lr; lracl->lr_aclcnt = vsecp->vsa_aclcnt; lracl->lr_acl_bytes = aclsize; lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; else lracl->lr_acl_flags = 0; bcopy(vsecp->vsa_aclentp, end, aclsize); end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); } /* drop in FUID info */ if (fuidp) { end = zfs_log_fuid_ids(fuidp, end); end = zfs_log_fuid_domains(fuidp, end); } /* * Now place file name in log record */ bcopy(name, end, namesize); zil_itx_assign(zilog, itx, tx); } /* * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. */ void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, char *name, uint64_t foid) { itx_t *itx; lr_remove_t *lr; size_t namesize = strlen(name) + 1; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + namesize); lr = (lr_remove_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; bcopy(name, (char *)(lr + 1), namesize); itx->itx_oid = foid; zil_itx_assign(zilog, itx, tx); } /* * zfs_log_link() handles TX_LINK transactions. */ void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name) { itx_t *itx; lr_link_t *lr; size_t namesize = strlen(name) + 1; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + namesize); lr = (lr_link_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_link_obj = zp->z_id; bcopy(name, (char *)(lr + 1), namesize); zil_itx_assign(zilog, itx, tx); } /* * zfs_log_symlink() handles TX_SYMLINK transactions. */ void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, char *link) { itx_t *itx; lr_create_t *lr; size_t namesize = strlen(name) + 1; size_t linksize = strlen(link) + 1; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; lr->lr_uid = zp->z_uid; lr->lr_gid = zp->z_gid; lr->lr_mode = zp->z_mode; (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, sizeof (uint64_t)); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), lr->lr_crtime, sizeof (uint64_t) * 2); bcopy(name, (char *)(lr + 1), namesize); bcopy(link, (char *)(lr + 1) + namesize, linksize); zil_itx_assign(zilog, itx, tx); } /* * zfs_log_rename() handles TX_RENAME transactions. */ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) { itx_t *itx; lr_rename_t *lr; size_t snamesize = strlen(sname) + 1; size_t dnamesize = strlen(dname) + 1; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); lr = (lr_rename_t *)&itx->itx_lr; lr->lr_sdoid = sdzp->z_id; lr->lr_tdoid = tdzp->z_id; bcopy(sname, (char *)(lr + 1), snamesize); bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); itx->itx_oid = szp->z_id; zil_itx_assign(zilog, itx, tx); } /* * zfs_log_write() handles TX_WRITE transactions. */ ssize_t zfs_immediate_write_sz = 32768; void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, int ioflag) { itx_wr_state_t write_state; boolean_t slogging; uintptr_t fsync_cnt; ssize_t immediate_write_sz; if (zil_replaying(zilog, tx) || zp->z_unlinked) return; immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) ? 0 : zfs_immediate_write_sz; slogging = spa_has_slogs(zilog->zl_spa) && (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz) write_state = WR_INDIRECT; else if (ioflag & (FSYNC | FDSYNC)) write_state = WR_COPIED; else write_state = WR_NEED_COPY; if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); } while (resid) { itx_t *itx; lr_write_t *lr; ssize_t len; /* * If the write would overflow the largest block then split it. */ if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA) len = SPA_MAXBLOCKSIZE >> 1; else len = resid; itx = zil_itx_create(txtype, sizeof (*lr) + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; } itx->itx_wr_state = write_state; if (write_state == WR_NEED_COPY) itx->itx_sod += len; lr->lr_foid = zp->z_id; lr->lr_offset = off; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zp->z_zfsvfs; if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && (fsync_cnt == 0)) itx->itx_sync = B_FALSE; zil_itx_assign(zilog, itx, tx); off += len; resid -= len; } } /* * zfs_log_truncate() handles TX_TRUNCATE transactions. */ void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len) { itx_t *itx; lr_truncate_t *lr; if (zil_replaying(zilog, tx) || zp->z_unlinked) return; itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; lr->lr_offset = off; lr->lr_length = len; itx->itx_sync = (zp->z_sync_cnt != 0); zil_itx_assign(zilog, itx, tx); } /* * zfs_log_setattr() handles TX_SETATTR transactions. */ void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) { itx_t *itx; lr_setattr_t *lr; xvattr_t *xvap = (xvattr_t *)vap; size_t recsize = sizeof (lr_setattr_t); void *start; if (zil_replaying(zilog, tx) || zp->z_unlinked) return; /* * If XVATTR set, then log record size needs to allow * for lr_attr_t + xvattr mask, mapsize and create time * plus actual attribute values */ if (vap->va_mask & AT_XVATTR) recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); if (fuidp) recsize += fuidp->z_domain_str_sz; itx = zil_itx_create(txtype, recsize); lr = (lr_setattr_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; lr->lr_mask = (uint64_t)mask_applied; lr->lr_mode = (uint64_t)vap->va_mode; if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid)) lr->lr_uid = fuidp->z_fuid_owner; else lr->lr_uid = (uint64_t)vap->va_uid; if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid)) lr->lr_gid = fuidp->z_fuid_group; else lr->lr_gid = (uint64_t)vap->va_gid; lr->lr_size = (uint64_t)vap->va_size; ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); start = (lr_setattr_t *)(lr + 1); if (vap->va_mask & AT_XVATTR) { zfs_log_xvattr((lr_attr_t *)start, xvap); start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); } /* * Now stick on domain information if any on end */ if (fuidp) (void) zfs_log_fuid_domains(fuidp, start); itx->itx_sync = (zp->z_sync_cnt != 0); zil_itx_assign(zilog, itx, tx); } /* * zfs_log_acl() handles TX_ACL transactions. */ void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) { itx_t *itx; lr_acl_v0_t *lrv0; lr_acl_t *lr; int txtype; int lrsize; size_t txsize; size_t aclbytes = vsecp->vsa_aclentsz; if (zil_replaying(zilog, tx) || zp->z_unlinked) return; txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? TX_ACL_V0 : TX_ACL; if (txtype == TX_ACL) lrsize = sizeof (*lr); else lrsize = sizeof (*lrv0); txsize = lrsize + ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + (fuidp ? fuidp->z_domain_str_sz : 0) + sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0); itx = zil_itx_create(txtype, txsize); lr = (lr_acl_t *)&itx->itx_lr; lr->lr_foid = zp->z_id; if (txtype == TX_ACL) { lr->lr_acl_bytes = aclbytes; lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; else lr->lr_acl_flags = 0; } lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; if (txtype == TX_ACL_V0) { lrv0 = (lr_acl_v0_t *)lr; bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); } else { void *start = (ace_t *)(lr + 1); bcopy(vsecp->vsa_aclentp, start, aclbytes); start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); if (fuidp) { start = zfs_log_fuid_ids(fuidp, start); (void) zfs_log_fuid_domains(fuidp, start); } } itx->itx_sync = (zp->z_sync_cnt != 0); zil_itx_assign(zilog, itx, tx); } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c (revision 247192) @@ -1,605 +1,605 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012 by Delphix. All rights reserved. */ /* * This file contains the code to implement file range locking in * ZFS, although there isn't much specific to ZFS (all that comes to mind * support for growing the blocksize). * * Interface * --------- * Defined in zfs_rlock.h but essentially: * rl = zfs_range_lock(zp, off, len, lock_type); * zfs_range_unlock(rl); * zfs_range_reduce(rl, off, len); * * AVL tree * -------- * An AVL tree is used to maintain the state of the existing ranges * that are locked for exclusive (writer) or shared (reader) use. * The starting range offset is used for searching and sorting the tree. * * Common case * ----------- * The (hopefully) usual case is of no overlaps or contention for * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree * searched that finds no overlap, and *this* rl_t is placed in the tree. * * Overlaps/Reference counting/Proxy locks * --------------------------------------- * The avl code only allows one node at a particular offset. Also it's very * inefficient to search through all previous entries looking for overlaps * (because the very 1st in the ordered list might be at offset 0 but * cover the whole file). * So this implementation uses reference counts and proxy range locks. * Firstly, only reader locks use reference counts and proxy locks, * because writer locks are exclusive. * When a reader lock overlaps with another then a proxy lock is created * for that range and replaces the original lock. If the overlap * is exact then the reference count of the proxy is simply incremented. * Otherwise, the proxy lock is split into smaller lock ranges and * new proxy locks created for non overlapping ranges. * The reference counts are adjusted accordingly. * Meanwhile, the orginal lock is kept around (this is the callers handle) * and its offset and length are used when releasing the lock. * * Thread coordination * ------------------- * In order to make wakeups efficient and to ensure multiple continuous * readers on a range don't starve a writer for the same range lock, * two condition variables are allocated in each rl_t. * If a writer (or reader) can't get a range it initialises the writer * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; * and waits on that cv. When a thread unlocks that range it wakes up all * writers then all readers before destroying the lock. * * Append mode writes * ------------------ * Append mode writes need to lock a range at the end of a file. * The offset of the end of the file is determined under the * range locking mutex, and the lock type converted from RL_APPEND to * RL_WRITER and the range locked. * * Grow block handling * ------------------- * ZFS supports multiple block sizes currently upto 128K. The smallest * block size is used for the file which is grown as needed. During this * growth all other writers and readers must be excluded. * So if the block size needs to be grown then the whole file is * exclusively locked, then later the caller will reduce the lock * range to just the range to be written using zfs_reduce_range. */ #include /* * Check if a write lock can be grabbed, or wait and recheck until available. */ static void zfs_range_lock_writer(znode_t *zp, rl_t *new) { avl_tree_t *tree = &zp->z_range_avl; rl_t *rl; avl_index_t where; uint64_t end_size; uint64_t off = new->r_off; uint64_t len = new->r_len; for (;;) { /* * Range locking is also used by zvol and uses a * dummied up znode. However, for zvol, we don't need to * append or grow blocksize, and besides we don't have * a "sa" data or z_zfsvfs - so skip that processing. * * Yes, this is ugly, and would be solved by not handling * grow or append in range lock code. If that was done then * we could make the range locking code generically available * to other non-zfs consumers. */ if (zp->z_vnode) { /* caller is ZPL */ /* * If in append mode pick up the current end of file. * This is done under z_range_lock to avoid races. */ if (new->r_type == RL_APPEND) new->r_off = zp->z_size; /* * If we need to grow the block size then grab the whole * file range. This is also done under z_range_lock to * avoid races. */ end_size = MAX(zp->z_size, new->r_off + len); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { new->r_off = 0; new->r_len = UINT64_MAX; } } /* * First check for the usual case of no locks */ if (avl_numnodes(tree) == 0) { new->r_type = RL_WRITER; /* convert to writer */ avl_add(tree, new); return; } /* * Look for any locks in the range. */ rl = avl_find(tree, new, &where); if (rl) goto wait; /* already locked at same offset */ rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); if (rl && (rl->r_off < new->r_off + new->r_len)) goto wait; rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); if (rl && rl->r_off + rl->r_len > new->r_off) goto wait; new->r_type = RL_WRITER; /* convert possible RL_APPEND */ avl_insert(tree, new, where); return; wait: if (!rl->r_write_wanted) { cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); rl->r_write_wanted = B_TRUE; } cv_wait(&rl->r_wr_cv, &zp->z_range_lock); /* reset to original */ new->r_off = off; new->r_len = len; } } /* * If this is an original (non-proxy) lock then replace it by * a proxy and return the proxy. */ static rl_t * zfs_range_proxify(avl_tree_t *tree, rl_t *rl) { rl_t *proxy; if (rl->r_proxy) return (rl); /* already a proxy */ ASSERT3U(rl->r_cnt, ==, 1); ASSERT(rl->r_write_wanted == B_FALSE); ASSERT(rl->r_read_wanted == B_FALSE); avl_remove(tree, rl); rl->r_cnt = 0; /* create a proxy range lock */ proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); proxy->r_off = rl->r_off; proxy->r_len = rl->r_len; proxy->r_cnt = 1; proxy->r_type = RL_READER; proxy->r_proxy = B_TRUE; proxy->r_write_wanted = B_FALSE; proxy->r_read_wanted = B_FALSE; avl_add(tree, proxy); return (proxy); } /* * Split the range lock at the supplied offset * returning the *front* proxy. */ static rl_t * zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) { rl_t *front, *rear; ASSERT3U(rl->r_len, >, 1); ASSERT3U(off, >, rl->r_off); ASSERT3U(off, <, rl->r_off + rl->r_len); ASSERT(rl->r_write_wanted == B_FALSE); ASSERT(rl->r_read_wanted == B_FALSE); /* create the rear proxy range lock */ rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); rear->r_off = off; rear->r_len = rl->r_off + rl->r_len - off; rear->r_cnt = rl->r_cnt; rear->r_type = RL_READER; rear->r_proxy = B_TRUE; rear->r_write_wanted = B_FALSE; rear->r_read_wanted = B_FALSE; front = zfs_range_proxify(tree, rl); front->r_len = off - rl->r_off; avl_insert_here(tree, rear, front, AVL_AFTER); return (front); } /* * Create and add a new proxy range lock for the supplied range. */ static void zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) { rl_t *rl; ASSERT(len); rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); rl->r_off = off; rl->r_len = len; rl->r_cnt = 1; rl->r_type = RL_READER; rl->r_proxy = B_TRUE; rl->r_write_wanted = B_FALSE; rl->r_read_wanted = B_FALSE; avl_add(tree, rl); } static void zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) { rl_t *next; uint64_t off = new->r_off; uint64_t len = new->r_len; /* * prev arrives either: * - pointing to an entry at the same offset * - pointing to the entry with the closest previous offset whose * range may overlap with the new range * - null, if there were no ranges starting before the new one */ if (prev) { if (prev->r_off + prev->r_len <= off) { prev = NULL; } else if (prev->r_off != off) { /* * convert to proxy if needed then * split this entry and bump ref count */ prev = zfs_range_split(tree, prev, off); prev = AVL_NEXT(tree, prev); /* move to rear range */ } } ASSERT((prev == NULL) || (prev->r_off == off)); if (prev) next = prev; else next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); if (next == NULL || off + len <= next->r_off) { /* no overlaps, use the original new rl_t in the tree */ avl_insert(tree, new, where); return; } if (off < next->r_off) { /* Add a proxy for initial range before the overlap */ zfs_range_new_proxy(tree, off, next->r_off - off); } new->r_cnt = 0; /* will use proxies in tree */ /* * We now search forward through the ranges, until we go past the end * of the new range. For each entry we make it a proxy if it * isn't already, then bump its reference count. If there's any * gaps between the ranges then we create a new proxy range. */ for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { if (off + len <= next->r_off) break; if (prev && prev->r_off + prev->r_len < next->r_off) { /* there's a gap */ ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); zfs_range_new_proxy(tree, prev->r_off + prev->r_len, next->r_off - (prev->r_off + prev->r_len)); } if (off + len == next->r_off + next->r_len) { /* exact overlap with end */ next = zfs_range_proxify(tree, next); next->r_cnt++; return; } if (off + len < next->r_off + next->r_len) { /* new range ends in the middle of this block */ next = zfs_range_split(tree, next, off + len); next->r_cnt++; return; } ASSERT3U(off + len, >, next->r_off + next->r_len); next = zfs_range_proxify(tree, next); next->r_cnt++; } /* Add the remaining end range. */ zfs_range_new_proxy(tree, prev->r_off + prev->r_len, (off + len) - (prev->r_off + prev->r_len)); } /* * Check if a reader lock can be grabbed, or wait and recheck until available. */ static void zfs_range_lock_reader(znode_t *zp, rl_t *new) { avl_tree_t *tree = &zp->z_range_avl; rl_t *prev, *next; avl_index_t where; uint64_t off = new->r_off; uint64_t len = new->r_len; /* * Look for any writer locks in the range. */ retry: prev = avl_find(tree, new, &where); if (prev == NULL) prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); /* * Check the previous range for a writer lock overlap. */ if (prev && (off < prev->r_off + prev->r_len)) { if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { if (!prev->r_read_wanted) { cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); prev->r_read_wanted = B_TRUE; } cv_wait(&prev->r_rd_cv, &zp->z_range_lock); goto retry; } if (off + len < prev->r_off + prev->r_len) goto got_lock; } /* * Search through the following ranges to see if there's * write lock any overlap. */ if (prev) next = AVL_NEXT(tree, prev); else next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); for (; next; next = AVL_NEXT(tree, next)) { if (off + len <= next->r_off) goto got_lock; if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { if (!next->r_read_wanted) { cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); next->r_read_wanted = B_TRUE; } cv_wait(&next->r_rd_cv, &zp->z_range_lock); goto retry; } if (off + len <= next->r_off + next->r_len) goto got_lock; } got_lock: /* * Add the read lock, which may involve splitting existing * locks and bumping ref counts (r_cnt). */ zfs_range_add_reader(tree, new, prev, where); } /* * Lock a range (offset, length) as either shared (RL_READER) * or exclusive (RL_WRITER). Returns the range lock structure * for later unlocking or reduce range (if entire file * previously locked as RL_WRITER). */ rl_t * zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) { rl_t *new; ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); new = kmem_alloc(sizeof (rl_t), KM_SLEEP); new->r_zp = zp; new->r_off = off; if (len + off < off) /* overflow */ len = UINT64_MAX - off; new->r_len = len; new->r_cnt = 1; /* assume it's going to be in the tree */ new->r_type = type; new->r_proxy = B_FALSE; new->r_write_wanted = B_FALSE; new->r_read_wanted = B_FALSE; mutex_enter(&zp->z_range_lock); if (type == RL_READER) { /* * First check for the usual case of no locks */ if (avl_numnodes(&zp->z_range_avl) == 0) avl_add(&zp->z_range_avl, new); else zfs_range_lock_reader(zp, new); } else zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ mutex_exit(&zp->z_range_lock); return (new); } /* * Unlock a reader lock */ static void zfs_range_unlock_reader(znode_t *zp, rl_t *remove) { avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl, *next; + rl_t *rl, *next = NULL; uint64_t len; /* * The common case is when the remove entry is in the tree * (cnt == 1) meaning there's been no other reader locks overlapping * with this one. Otherwise the remove entry will have been * removed from the tree and replaced by proxies (one or * more ranges mapping to the entire range). */ if (remove->r_cnt == 1) { avl_remove(tree, remove); if (remove->r_write_wanted) { cv_broadcast(&remove->r_wr_cv); cv_destroy(&remove->r_wr_cv); } if (remove->r_read_wanted) { cv_broadcast(&remove->r_rd_cv); cv_destroy(&remove->r_rd_cv); } } else { ASSERT0(remove->r_cnt); ASSERT0(remove->r_write_wanted); ASSERT0(remove->r_read_wanted); /* * Find start proxy representing this reader lock, * then decrement ref count on all proxies * that make up this range, freeing them as needed. */ rl = avl_find(tree, remove, NULL); ASSERT(rl); ASSERT(rl->r_cnt); ASSERT(rl->r_type == RL_READER); for (len = remove->r_len; len != 0; rl = next) { len -= rl->r_len; if (len) { next = AVL_NEXT(tree, rl); ASSERT(next); ASSERT(rl->r_off + rl->r_len == next->r_off); ASSERT(next->r_cnt); ASSERT(next->r_type == RL_READER); } rl->r_cnt--; if (rl->r_cnt == 0) { avl_remove(tree, rl); if (rl->r_write_wanted) { cv_broadcast(&rl->r_wr_cv); cv_destroy(&rl->r_wr_cv); } if (rl->r_read_wanted) { cv_broadcast(&rl->r_rd_cv); cv_destroy(&rl->r_rd_cv); } kmem_free(rl, sizeof (rl_t)); } } } kmem_free(remove, sizeof (rl_t)); } /* * Unlock range and destroy range lock structure. */ void zfs_range_unlock(rl_t *rl) { znode_t *zp = rl->r_zp; ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); ASSERT(!rl->r_proxy); mutex_enter(&zp->z_range_lock); if (rl->r_type == RL_WRITER) { /* writer locks can't be shared or split */ avl_remove(&zp->z_range_avl, rl); mutex_exit(&zp->z_range_lock); if (rl->r_write_wanted) { cv_broadcast(&rl->r_wr_cv); cv_destroy(&rl->r_wr_cv); } if (rl->r_read_wanted) { cv_broadcast(&rl->r_rd_cv); cv_destroy(&rl->r_rd_cv); } kmem_free(rl, sizeof (rl_t)); } else { /* * lock may be shared, let zfs_range_unlock_reader() * release the lock and free the rl_t */ zfs_range_unlock_reader(zp, rl); mutex_exit(&zp->z_range_lock); } } /* * Reduce range locked as RL_WRITER from whole file to specified range. * Asserts the whole file is exclusivly locked and so there's only one * entry in the tree. */ void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) { znode_t *zp = rl->r_zp; /* Ensure there are no other locks */ ASSERT(avl_numnodes(&zp->z_range_avl) == 1); ASSERT(rl->r_off == 0); ASSERT(rl->r_type == RL_WRITER); ASSERT(!rl->r_proxy); ASSERT3U(rl->r_len, ==, UINT64_MAX); ASSERT3U(rl->r_cnt, ==, 1); mutex_enter(&zp->z_range_lock); rl->r_off = off; rl->r_len = len; mutex_exit(&zp->z_range_lock); if (rl->r_write_wanted) cv_broadcast(&rl->r_wr_cv); if (rl->r_read_wanted) cv_broadcast(&rl->r_rd_cv); } /* * AVL comparison function used to order range locks * Locks are ordered on the start offset of the range. */ int zfs_range_compare(const void *arg1, const void *arg2) { const rl_t *rl1 = arg1; const rl_t *rl2 = arg2; if (rl1->r_off > rl2->r_off) return (1); if (rl1->r_off < rl2->r_off) return (-1); return (0); } Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c (revision 247192) @@ -1,2489 +1,2496 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owner can perform privileged operation on his file systems"); int zfs_debug_level; TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level); SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0, "Debug level"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor); static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); static void zfs_objset_close(zfsvfs_t *zfsvfs); static void zfs_freevfs(vfs_t *vfsp); static struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, .vfs_root = zfs_root, .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_checkexp = zfs_checkexp, .vfs_fhtovp = zfs_fhtovp, }; VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; /*ARGSUSED*/ static int zfs_sync(vfs_t *vfsp, int waitfor) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; int error; error = vfs_stdsync(vfsp, waitfor); if (error != 0) return (error); ZFS_ENTER(zfsvfs); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (sys_shutdown && spa_suspended(dp->dp_spa)) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } #ifndef __FreeBSD__ static int zfs_create_unique_device(dev_t *dev) { major_t new_major; do { ASSERT3U(zfs_minor, <=, MAXMIN32); minor_t start = zfs_minor; do { mutex_enter(&zfs_dev_mtx); if (zfs_minor >= MAXMIN32) { /* * If we're still using the real major * keep out of /dev/zfs and /dev/zvol minor * number space. If we're using a getudev()'ed * major number, we can use all of its minors. */ if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) zfs_minor = ZFS_MIN_MINOR; else zfs_minor = 0; } else { zfs_minor++; } *dev = makedevice(zfs_major, zfs_minor); mutex_exit(&zfs_dev_mtx); } while (vfs_devismounted(*dev) && zfs_minor != start); if (zfs_minor == start) { /* * We are using all ~262,000 minor numbers for the * current major number. Create a new major number. */ if ((new_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "zfs_mount: Can't get unique major " "device number."); return (-1); } mutex_enter(&zfs_dev_mtx); zfs_major = new_major; zfs_minor = 0; mutex_exit(&zfs_dev_mtx); } else { break; } /* CONSTANTCONDITION */ } while (1); return (0); } #endif /* !__FreeBSD__ */ static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); } else { /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; #endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval < SPA_MINBLOCKSIZE || newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) newval = SPA_MAXBLOCKSIZE; zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } /* * The nbmand mount option can be changed at mount time. * We can't allow it to be toggled on live file systems or incorrect * behavior may be seen from cifs clients * * This property isn't registered via dsl_prop_register(), but this callback * will be called when a file system is first mounted */ static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); } else { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void vscan_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_vscan = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; - int readonly, do_readonly = B_FALSE; - int setuid, do_setuid = B_FALSE; - int exec, do_exec = B_FALSE; - int xattr, do_xattr = B_FALSE; - int atime, do_atime = B_FALSE; + boolean_t readonly = B_FALSE; + boolean_t do_readonly = B_FALSE; + boolean_t setuid = B_FALSE; + boolean_t do_setuid = B_FALSE; + boolean_t exec = B_FALSE; + boolean_t do_exec = B_FALSE; + boolean_t devices = B_FALSE; + boolean_t do_devices = B_FALSE; + boolean_t xattr = B_FALSE; + boolean_t do_xattr = B_FALSE; + boolean_t atime = B_FALSE; + boolean_t do_atime = B_FALSE; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * This function can be called for a snapshot when we update snapshot's * mount point, which isn't really supported. */ if (dmu_objset_is_snapshot(os)) return (EOPNOTSUPP); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else { if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { xattr = B_FALSE; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { xattr = B_TRUE; do_xattr = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { atime = B_FALSE; do_atime = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { atime = B_TRUE; do_atime = B_TRUE; } /* * nbmand is a special property. It can only be changed at * mount time. * * This is weird, but it is documented to only be changeable * at mount time. */ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; } else { char osname[MAXNAMELEN]; dmu_objset_name(os, osname); if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, NULL)) { return (error); } } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ ds = dmu_objset_ds(os); error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "xattr", xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "recordsize", blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "readonly", readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "setuid", setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "exec", exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "snapdir", snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "aclmode", acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, "vscan", vscan_changed_cb, zfsvfs); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) atime_changed_cb(zfsvfs, atime); nbmand_changed_cb(zfsvfs, nbmand); return (0); unregister: /* * We may attempt to unregister some callbacks that are not * registered, but this is OK; it will simply return ENOMSG, * which we will ignore. */ (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs); (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); return (error); } static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { int error = 0; /* * Is it a valid type of object to track? */ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (ENOENT); /* * If we have a NULL data pointer * then assume the id's aren't changing and * return EEXIST to the dmu to let it know to * use the same ids */ if (data == NULL) return (EEXIST); if (bonustype == DMU_OT_ZNODE) { znode_phys_t *znp = data; *userp = znp->zp_uid; *groupp = znp->zp_gid; } else { int hdrsize; sa_hdr_phys_t *sap = data; sa_hdr_phys_t sa = *sap; boolean_t swap = B_FALSE; ASSERT(bonustype == DMU_OT_SA); if (sa.sa_magic == 0) { /* * This should only happen for newly created * files that haven't had the znode data filled * in yet. */ *userp = 0; *groupp = 0; return (0); } if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { sa.sa_magic = SA_MAGIC; sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); swap = B_TRUE; } else { VERIFY3U(sa.sa_magic, ==, SA_MAGIC); } hdrsize = sa_hdrsize(&sa); VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); if (swap) { *userp = BSWAP_64(*userp); *groupp = BSWAP_64(*groupp); } } return (error); } static void fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, char *domainbuf, int buflen, uid_t *ridp) { uint64_t fuid; const char *domain; fuid = strtonum(fuidstr, NULL); domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); if (domain) (void) strlcpy(domainbuf, domain, buflen); else domainbuf[0] = '\0'; *ridp = FUID_RID(fuid); } static uint64_t zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) { switch (type) { case ZFS_PROP_USERUSED: return (DMU_USERUSED_OBJECT); case ZFS_PROP_GROUPUSED: return (DMU_GROUPUSED_OBJECT); case ZFS_PROP_USERQUOTA: return (zfsvfs->z_userquota_obj); case ZFS_PROP_GROUPQUOTA: return (zfsvfs->z_groupquota_obj); } return (0); } int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) { int error; zap_cursor_t zc; zap_attribute_t za; zfs_useracct_t *buf = vbuf; uint64_t obj; if (!dmu_objset_userspace_present(zfsvfs->z_os)) return (ENOTSUP); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) { *bufsizep = 0; return (0); } for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > *bufsizep) break; fuidstr_to_sid(zfsvfs, za.za_name, buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); buf->zu_space = za.za_first_integer; buf++; } if (error == ENOENT) error = 0; ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; *cookiep = zap_cursor_serialize(&zc); zap_cursor_fini(&zc); return (error); } /* * buf must be big enough (eg, 32 bytes) */ static int id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, char *buf, boolean_t addok) { uint64_t fuid; int domainid = 0; if (domain && domain[0]) { domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); if (domainid == -1) return (ENOENT); } fuid = FUID_ENCODE(domainid, rid); (void) sprintf(buf, "%llx", (longlong_t)fuid); return (0); } int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t *valp) { char buf[32]; int err; uint64_t obj; *valp = 0; if (!dmu_objset_userspace_present(zfsvfs->z_os)) return (ENOTSUP); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) return (0); err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); if (err) return (err); err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); if (err == ENOENT) err = 0; return (err); } int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, const char *domain, uint64_t rid, uint64_t quota) { char buf[32]; int err; dmu_tx_t *tx; uint64_t *objp; boolean_t fuid_dirtied; if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) return (EINVAL); if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) return (ENOTSUP); objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : &zfsvfs->z_groupquota_obj; err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); if (err) return (err); fuid_dirtied = zfsvfs->z_fuid_dirty; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); if (*objp == 0) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, zfs_userquota_prop_prefixes[type]); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } mutex_enter(&zfsvfs->z_lock); if (*objp == 0) { *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, DMU_OT_NONE, 0, tx); VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); } mutex_exit(&zfsvfs->z_lock); if (quota == 0) { err = zap_remove(zfsvfs->z_os, *objp, buf, tx); if (err == ENOENT) err = 0; } else { err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); } ASSERT(err == 0); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); dmu_tx_commit(tx); return (err); } boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) { char buf[32]; uint64_t used, quota, usedobj, quotaobj; int err; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); (void) sprintf(buf, "%llx", (longlong_t)fuid); err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); if (err != 0) return (B_FALSE); err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); if (err != 0) return (B_FALSE); return (used >= quota); } boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) { uint64_t fuid; uint64_t quotaobj; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; fuid = isgroup ? zp->z_gid : zp->z_uid; if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); } int zfsvfs_create(const char *osname, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; uint64_t zval; int i, error; uint64_t sa_obj; zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); /* * We claim to always be readonly so we can open snapshots; * other ZPL code will prevent us from writing to snapshots. */ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); if (error) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } /* * Initialize the zfs-specific filesystem structure. * Should probably make this a kmem cache, shuffle fields, * and just bzero up to z_hold_mtx[]. */ zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error) { goto out; } else if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printf("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); error = ENOTSUP; goto out; } if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) goto out; zfsvfs->z_norm = (int)zval; if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) goto out; zfsvfs->z_utf8 = (zval != 0); if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) goto out; zfsvfs->z_case = (uint_t)zval; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error) return (error); } else { /* * Pre SA versions file systems should never touch * either the attribute registration or layout objects. */ sa_obj = 0; } error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error) goto out; if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error) goto out; ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error) goto out; error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error && error != ENOENT) goto out; error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error && error != ENOENT) goto out; error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error && error != ENOENT) goto out; error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error && error != ENOENT) goto out; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); rrw_init(&zfsvfs->z_teardown_lock); rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); *zfvp = zfsvfs; return (0); out: dmu_objset_disown(os, zfsvfs); *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; /* * During replay we remove the read only flag to * allow replays to succeed. */ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; if (readonly != 0) zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; else zfs_unlinked_drain(zfsvfs); /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; } } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } return (0); } extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; /* * This is a barrier to prevent the filesystem from going away in * zfs_znode_move() until we can safely ensure that the filesystem is * not unmounted. We consider the filesystem valid before the barrier * and invalid after the barrier. */ rw_enter(&zfsvfs_lock, RW_READER); rw_exit(&zfsvfs_lock); zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrw_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); if (zfsvfs->z_vfs) { if (zfsvfs->z_use_fuids) { vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } else { vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } } zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; vnode_t *vp; ASSERT(vfsp); ASSERT(osname); error = zfsvfs_create(osname, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) goto out; zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); vfsp->vfs_fsid.val[0] = fsid_guid; vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | vfsp->mnt_vfc->vfc_typenum & 0xFF; /* * Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } vfs_mountedfrom(vfsp, osname); /* Grab extra reference. */ VERIFY(VFS_ROOT(vfsp, LK_EXCLUSIVE, &vp) == 0); VOP_UNLOCK(vp, 0); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_add_32(&zfs_active_fs_count, 1); } return (error); } void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; struct dsl_dataset *ds; /* * Unregister properties. */ if (!dmu_objset_is_snapshot(os)) { ds = dmu_objset_ds(os); VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); VERIFY(dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs) == 0); } } #ifdef SECLABEL /* * Convert a decimal digit string to a uint64_t integer. */ static int str_to_uint64(char *str, uint64_t *objnum) { uint64_t num = 0; while (*str) { if (*str < '0' || *str > '9') return (EINVAL); num = num*10 + *str++ - '0'; } *objnum = num; return (0); } /* * The boot path passed from the boot loader is in the form of * "rootpool-name/root-filesystem-object-number'. Convert this * string to a dataset name: "rootpool-name/root-filesystem-name". */ static int zfs_parse_bootfs(char *bpath, char *outpath) { char *slashp; uint64_t objnum; int error; if (*bpath == 0 || *bpath == '/') return (EINVAL); (void) strcpy(outpath, bpath); slashp = strchr(bpath, '/'); /* if no '/', just return the pool name */ if (slashp == NULL) { return (0); } /* if not a number, just return the root dataset name */ if (str_to_uint64(slashp+1, &objnum)) { return (0); } *slashp = '\0'; error = dsl_dsobj_to_dsname(bpath, objnum, outpath); *slashp = '/'; return (error); } /* * zfs_check_global_label: * Check that the hex label string is appropriate for the dataset * being mounted into the global_zone proper. * * Return an error if the hex label string is not default or * admin_low/admin_high. For admin_low labels, the corresponding * dataset must be readonly. */ int zfs_check_global_label(const char *dsname, const char *hexsl) { if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); if (strcasecmp(hexsl, ADMIN_HIGH) == 0) return (0); if (strcasecmp(hexsl, ADMIN_LOW) == 0) { /* must be readonly */ uint64_t rdonly; if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) return (EACCES); return (rdonly ? 0 : EACCES); } return (EACCES); } /* * zfs_mount_label_policy: * Determine whether the mount is allowed according to MAC check. * by comparing (where appropriate) label of the dataset against * the label of the zone being mounted into. If the dataset has * no label, create one. * * Returns: * 0 : access allowed * >0 : error code, such as EACCES */ static int zfs_mount_label_policy(vfs_t *vfsp, char *osname) { int error, retv; zone_t *mntzone = NULL; ts_label_t *mnt_tsl; bslabel_t *mnt_sl; bslabel_t ds_sl; char ds_hexsl[MAXNAMELEN]; retv = EACCES; /* assume the worst */ /* * Start by getting the dataset label if it exists. */ error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error) return (EACCES); /* * If labeling is NOT enabled, then disallow the mount of datasets * which have a non-default label already. No other label checks * are needed. */ if (!is_system_labeled()) { if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); return (EACCES); } /* * Get the label of the mountpoint. If mounting into the global * zone (i.e. mountpoint is not within an active zone and the * zoned property is off), the label must be default or * admin_low/admin_high only; no other checks are needed. */ mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); if (mntzone->zone_id == GLOBAL_ZONEID) { uint64_t zoned; zone_rele(mntzone); if (dsl_prop_get_integer(osname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (EACCES); if (!zoned) return (zfs_check_global_label(osname, ds_hexsl)); else /* * This is the case of a zone dataset being mounted * initially, before the zone has been fully created; * allow this mount into global zone. */ return (0); } mnt_tsl = mntzone->zone_slabel; ASSERT(mnt_tsl != NULL); label_hold(mnt_tsl); mnt_sl = label2bslabel(mnt_tsl); if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { /* * The dataset doesn't have a real label, so fabricate one. */ char *str = NULL; if (l_to_str_internal(mnt_sl, &str) == 0 && dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) retv = 0; if (str != NULL) kmem_free(str, strlen(str) + 1); } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { /* * Now compare labels to complete the MAC check. If the * labels are equal then allow access. If the mountpoint * label dominates the dataset label, allow readonly access. * Otherwise, access is denied. */ if (blequal(mnt_sl, &ds_sl)) retv = 0; else if (bldominates(mnt_sl, &ds_sl)) { vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); retv = 0; } } label_rele(mnt_tsl); zone_rele(mntzone); return (retv); } #endif /* SECLABEL */ #ifdef OPENSOLARIS_MOUNTROOT static int zfs_mountroot(vfs_t *vfsp, enum whymountroot why) { int error = 0; static int zfsrootdone = 0; zfsvfs_t *zfsvfs = NULL; znode_t *zp = NULL; vnode_t *vp = NULL; char *zfs_bootfs; char *zfs_devid; ASSERT(vfsp); /* * The filesystem that we mount as root is defined in the * boot property "zfs-bootfs" with a format of * "poolname/root-dataset-objnum". */ if (why == ROOT_INIT) { if (zfsrootdone++) return (EBUSY); /* * the process of doing a spa_load will require the * clock to be set before we could (for example) do * something better by looking at the timestamp on * an uberblock, so just set it to -1. */ clkset(-1); if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { cmn_err(CE_NOTE, "spa_get_bootfs: can not get " "bootfs name"); return (EINVAL); } zfs_devid = spa_get_bootprop("diskdevid"); error = spa_import_rootpool(rootfs.bo_name, zfs_devid); if (zfs_devid) spa_free_bootprop(zfs_devid); if (error) { spa_free_bootprop(zfs_bootfs); cmn_err(CE_NOTE, "spa_import_rootpool: error %d", error); return (error); } if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { spa_free_bootprop(zfs_bootfs); cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", error); return (error); } spa_free_bootprop(zfs_bootfs); if (error = vfs_lock(vfsp)) return (error); if (error = zfs_domount(vfsp, rootfs.bo_name)) { cmn_err(CE_NOTE, "zfs_domount: error %d", error); goto out; } zfsvfs = (zfsvfs_t *)vfsp->vfs_data; ASSERT(zfsvfs); if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) { cmn_err(CE_NOTE, "zfs_zget: error %d", error); goto out; } vp = ZTOV(zp); mutex_enter(&vp->v_lock); vp->v_flag |= VROOT; mutex_exit(&vp->v_lock); rootvp = vp; /* * Leave rootvp held. The root file system is never unmounted. */ vfs_add((struct vnode *)0, vfsp, (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); out: vfs_unlock(vfsp); return (error); } else if (why == ROOT_REMOUNT) { readonly_changed_cb(vfsp->vfs_data, B_FALSE); vfsp->vfs_flag |= VFS_REMOUNT; /* refresh mount options */ zfs_unregister_callbacks(vfsp->vfs_data); return (zfs_register_callbacks(vfsp)); } else if (why == ROOT_UNMOUNT) { zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); (void) zfs_sync(vfsp, 0, 0); return (0); } /* * if "why" is equal to anything else other than ROOT_INIT, * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. */ return (ENOTSUP); } #endif /* OPENSOLARIS_MOUNTROOT */ static int getpoolname(const char *osname, char *poolname) { char *p; p = strchr(osname, '/'); if (p == NULL) { if (strlen(osname) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(poolname, osname); } else { if (p - osname >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(poolname, osname, p - osname); poolname[p - osname] = '\0'; } return (0); } /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp) { kthread_t *td = curthread; vnode_t *mvp = vfsp->mnt_vnodecovered; cred_t *cr = td->td_ucred; char *osname; int error = 0; int canwrite; if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS)) return (EPERM); if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (EINVAL); /* * If full-owner-access is enabled and delegated administration is * turned on, we must set nosuid. */ if (zfs_super_owner && dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Check for mount privilege? * * If we don't have privilege then see if * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* * Make sure user is the owner of the mount point * or has sufficient privileges. */ vattr.va_mask = AT_UID; vn_lock(mvp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(mvp, &vattr, cr)) { VOP_UNLOCK(mvp, 0); goto out; } if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { VOP_UNLOCK(mvp, 0); goto out; } VOP_UNLOCK(mvp, 0); } secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ if (!INGLOBALZONE(curthread) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { error = EPERM; goto out; } #ifdef SECLABEL error = zfs_mount_label_policy(vfsp, osname); if (error) goto out; #endif vfsp->vfs_flag |= MNT_NFS4ACLS; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) { /* refresh mount options */ zfs_unregister_callbacks(vfsp->vfs_data); error = zfs_register_callbacks(vfsp); goto out; } /* Initial root mount: try hard to import the requested root pool. */ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && (vfsp->vfs_flag & MNT_UPDATE) == 0) { char pname[MAXNAMELEN]; error = getpoolname(osname, pname); if (error == 0) error = spa_import_rootpool(pname); if (error) goto out; } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); #ifdef sun /* * Add an extra VFS_HOLD on our parent vfs so that it can't * disappear due to a forced unmount. */ if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) VFS_HOLD(mvp->v_vfsp); #endif /* sun */ out: return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; statp->f_version = STATFS_VERSION; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = SPA_MINBLOCKSIZE; statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof(statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof(statp->f_mntonname)); statp->f_namemax = ZFS_MAXNAMELEN; ZFS_EXIT(zfsvfs); return (0); } int zfs_vnode_lock(vnode_t *vp, int flags) { int error; ASSERT(vp != NULL); error = vn_lock(vp, flags); return (error); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; ZFS_ENTER_NOERROR(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); ZFS_EXIT(zfsvfs); if (error == 0) { error = zfs_vnode_lock(*vpp, flags); if (error == 0) (*vpp)->v_vflag |= VV_ROOT; } if (error != 0) *vpp = NULL; return (error); } /* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); #ifdef FREEBSD_NAMECACHE cache_purgevfs(zfsvfs->z_parent->z_vfs); #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); rrw_exit(&zfsvfs->z_teardown_lock, FTAG); return (EIO); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relavent for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rrw_exit(&zfsvfs->z_teardown_lock, FTAG); rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); (void) dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; cred_t *cr = td->td_ucred; int ret; ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } /* * We purge the parent filesystem's vfsp as the parent filesystem * and all of its snapshots have their vnode's v_vfsp set to the * parent's filesystem's vfsp. Note, 'z_parent' is self * referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); ret = vflush(vfsp, 0, 0, td); ASSERT(ret == EBUSY); if (!(fflag & MS_FORCE)) { if (zfsvfs->z_ctldir->v_count > 1) return (EBUSY); ASSERT(zfsvfs->z_ctldir->v_count == 1); } zfsctl_destroy(zfsvfs); ASSERT(zfsvfs->z_ctldir == NULL); } if (fflag & MS_FORCE) { /* * Mark file system as unmounted before calling * vflush(FORCECLOSE). This way we ensure no future vnops * will be called and risk operating on DOOMED vnodes. */ rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); zfsvfs->z_unmounted = B_TRUE; rrw_exit(&zfsvfs->z_teardown_lock, FTAG); } /* * Flush all the files. */ ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) { if (!zfsvfs->z_issnap) { zfsctl_create(zfsvfs); ASSERT(zfsvfs->z_ctldir != NULL); } return (ret); } if (!(fflag & MS_FORCE)) { /* * Check the number of active vnodes in the file system. * Our count is maintained in the vfs structure, but the * number is off by 1 to indicate a hold on the vfs * structure itself. * * The '.zfs' directory maintains a reference of its * own, and any active references underneath are * reflected in the vnode count. */ if (zfsvfs->z_ctldir == NULL) { if (vfsp->vfs_count > 1) return (EBUSY); } else { if (vfsp->vfs_count > 2 || zfsvfs->z_ctldir->v_count > 1) return (EBUSY); } } VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, zfsvfs); } /* * We can now safely destroy the '.zfs' directory node. */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); if (zfsvfs->z_issnap) { vnode_t *svp = vfsp->mnt_vnodecovered; if (svp->v_count >= 2) VN_RELE(svp); } zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; /* * zfs_zget() can't operate on virtual entries like .zfs/ or * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. * This will make NFS to switch to LOOKUP instead of using VGET. */ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { VN_RELE(ZTOV(zp)); err = EINVAL; } if (err == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (err == 0) err = zfs_vnode_lock(*vpp, flags); if (err != 0) *vpp = NULL; else (*vpp)->v_hash = ino; return (err); } static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * If this is regular file system vfsp is the same as * zfsvfs->z_parent->z_vfs, but if it is snapshot, * zfsvfs->z_parent->z_vfs represents parent file system * which we have to use here, because only this file system * has mnt_export configured. */ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, credanonp, numsecflavors, secflavors)); } CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; ZFS_ENTER(zfsvfs); /* * On FreeBSD we can get snapshot's mount point or its parent file * system mount point depending if snapshot is already mounted or not. */ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ZFS_EXIT(zfsvfs); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (EINVAL); ZFS_ENTER(zfsvfs); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then * we are in the .zfs/shares directory tree. */ if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { *vpp = zfsvfs->z_ctldir; ASSERT(*vpp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 0, NULL, NULL, NULL, NULL, NULL) == 0); } else if (object == zfsvfs->z_shares_dir) { VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL, 0, NULL, NULL, NULL, NULL, NULL) == 0); } else { VN_HOLD(*vpp); } ZFS_EXIT(zfsvfs); err = zfs_vnode_lock(*vpp, flags); if (err != 0) *vpp = NULL; return (err); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); if (err = zfs_zget(zfsvfs, object, &zp)) { ZFS_EXIT(zfsvfs); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); VN_RELE(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (EINVAL); } *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); err = zfs_vnode_lock(*vpp, flags | LK_RETRY); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); else *vpp = NULL; return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); dmu_objset_disown(zfsvfs->z_os, zfsvfs); return (0); } /* * Reopen zfsvfs_t::z_os and release VOPs. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) { int err; ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, &zfsvfs->z_os); if (err) { zfsvfs->z_os = NULL; } else { znode_t *zp; uint64_t sa_obj = 0; /* * Make sure version hasn't changed */ err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (err) goto bail; err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (err && zfsvfs->z_version >= ZPL_VERSION_SA) goto bail; if ((err = sa_setup(zfsvfs->z_os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0) goto bail; if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(zfsvfs->z_os, zfs_sa_upgrade); VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let * any potential callers discover that via ZFS_ENTER_VERIFY_VP * when they try to use their znode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); } bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); rrw_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { /* * Since we couldn't reopen zfsvfs::z_os, or * setup the sa framework force unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); } return (err); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; #ifdef sun /* * If this is a snapshot, we have an extra VFS_HOLD on our parent * from zfs_mount(). Release it here. If we came through * zfs_mountroot() instead, we didn't grab an extra hold, so * skip the VFS_RELE for rootvfs. */ if (zfsvfs->z_issnap && (vfsp != rootvfs)) VFS_RELE(zfsvfs->z_parent->z_vfs); #endif /* sun */ zfsvfs_free(zfsvfs); atomic_add_32(&zfs_active_fs_count, -1); } #ifdef __i386__ static int desiredvnodes_backup; #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ newdesiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); if (newdesiredvnodes == desiredvnodes) desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); } void zfs_fini(void) { zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (EINVAL); if (newvers < zfsvfs->z_version) return (EINVAL); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (ENOTSUP); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal(LOG_DS_UPGRADE, dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu", zfsvfs->z_version, newvers, dmu_objset_id(os)); dmu_tx_commit(tx); zfsvfs->z_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { const char *pname; int error = ENOENT; /* * Look up the file system's value for the property. For the * version property, we look up a slightly different string. */ if (prop == ZFS_PROP_VERSION) pname = ZPL_VERSION_STR; else pname = zfs_prop_to_name(prop); if (os != NULL) error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; default: return (error); } error = 0; } return (error); } #ifdef _KERNEL void zfsvfs_update_fromname(const char *oldname, const char *newname) { char tmpbuf[MAXPATHLEN]; struct mount *mp; char *fromname; size_t oldlen; oldlen = strlen(oldname); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { fromname = mp->mnt_stat.f_mntfromname; if (strcmp(fromname, oldname) == 0) { (void)strlcpy(fromname, newname, sizeof(mp->mnt_stat.f_mntfromname)); continue; } if (strncmp(fromname, oldname, oldlen) == 0 && (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", newname, fromname + oldlen); (void)strlcpy(fromname, tmpbuf, sizeof(mp->mnt_stat.f_mntfromname)); continue; } } mtx_unlock(&mountlist_mtx); } #endif Index: user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 247191) +++ user/attilio/vmobj-rwlock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c (revision 247192) @@ -1,6893 +1,6894 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign(). * This is critical because we don't want to block while holding locks. * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing to * use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (EPERM); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (EACCES); } } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; /* * Clean up any locks held by this process on the vp. */ cleanlocks(vp, ddi_get_pid(), 0); cleanshares(vp, ddi_get_pid()); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (ENXIO); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); /* end of file? */ if ((error == ESRCH) || (noff > file_sz)) { /* * Handle the virtual hole at the end of file. */ if (hole) { *off = file_sz; return (0); } return (ENXIO); } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp, caller_context_t *ct) { offset_t off; int error; zfsvfs_t *zfsvfs; znode_t *zp; switch (com) { case _FIOFFS: return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ case _FIOGDIO: case _FIOSDIO: return (0); case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: #ifdef sun if (ddi_copyin((void *)data, &off, sizeof (off), flag)) return (EFAULT); #else off = *(offset_t *)data; #endif zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); #ifdef sun if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) return (EFAULT); #else *(offset_t *)data = off; #endif return (0); } return (ENOTTY); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if ((pp->oflags & VPO_BUSY) != 0) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_sleep(pp, "zfsmwb"); continue; } } else { pp = vm_page_alloc(obj, OFF_TO_IDX(start), VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED | VM_ALLOC_NOBUSY); } if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); vm_page_io_start(pp); pmap_remove_write(pp); vm_page_clear_dirty(pp, off, nbytes); } break; } return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_io_finish(pp); vm_object_pip_subtract(pp->object, 1); } static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if ((pp->oflags & VPO_BUSY) != 0) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_sleep(pp, "zfsmwb"); continue; } ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_lock(pp); vm_page_hold(pp); vm_page_unlock(pp); } else pp = NULL; break; } return (pp); } static void page_unhold(vm_page_t pp) { vm_page_lock(pp); vm_page_unhold(pp); vm_page_unlock(pp); } static caddr_t zfs_map_page(vm_page_t pp, struct sf_buf **sfp) { *sfp = sf_buf_alloc(pp, 0); return ((caddr_t)sf_buf_kva(*sfp)); } static void zfs_unmap_page(struct sf_buf *sf) { sf_buf_free(sf); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, int segflg, dmu_tx_t *tx) { vm_object_t obj; struct sf_buf *sf; caddr_t va; int off; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if (segflg == UIO_NOCOPY) { pp = vm_page_lookup(obj, OFF_TO_IDX(start)); KASSERT(pp != NULL, ("zfs update_pages: NULL page in putpages case")); KASSERT(off == 0, ("zfs update_pages: unaligned data in putpages case")); KASSERT(pp->valid == VM_PAGE_BITS_ALL, ("zfs update_pages: invalid page in putpages case")); KASSERT(pp->busy > 0, ("zfs update_pages: unbusy page in putpages case")); KASSERT(!pmap_page_is_write_mapped(pp), ("zfs update_pages: writable page in putpages case")); zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); (void) dmu_write(os, oid, start, nbytes, va, tx); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); vm_page_undirty(pp); } else if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, oid, start+off, nbytes, va+off, DMU_READ_PREFETCH);; zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unbusy(pp); } len -= nbytes; off = 0; } if (segflg != UIO_NOCOPY) vm_object_pip_wakeupn(obj, 0); zfs_vmobject_wunlock(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain VPO_BUSY on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. */ static int mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(uio->uio_segflg == UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); zfs_vmobject_wlock(obj); for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY); if (pp->valid == 0) { vm_page_io_start(pp); zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); vm_page_io_finish(pp); vm_page_lock(pp); if (error) { vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_activate(pp); } vm_page_unlock(pp); } if (error) break; uio->uio_resid -= bytes; uio->uio_offset += bytes; len -= bytes; } zfs_vmobject_wunlock(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; vm_object_t obj; int64_t start; caddr_t va; int len = nbytes; int off; int error = 0; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if (pp = page_hold(vp, start)) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock(obj); va = zfs_map_page(pp, &sf); error = uiomove(va + off, bytes, UIO_READ, uio); zfs_unmap_page(sf); zfs_vmobject_wlock(obj); page_unhold(pp); } else { zfs_vmobject_wunlock(obj); error = dmu_read_uio(os, zp->z_id, uio, bytes); zfs_vmobject_wlock(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock(obj); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 if success * error code if failure * * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; ssize_t n, nbytes; - int error; + int error = 0; rl_t *rl; xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); os = zfsvfs->z_os; if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (EACCES); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * Check for mandatory locks */ if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); return (error); } } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (zfsvfs->z_log && (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); #ifdef sun if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; int blksz = zp->z_blksz; uint64_t offset = uio->uio_loffset; xuio = (xuio_t *)uio; if ((ISP2(blksz))) { nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, blksz)) / blksz; } else { ASSERT(offset + n <= blksz); nblk = 1; } (void) dmu_xuio_init(xuio, nblk); if (vn_has_cached_data(vp)) { /* * For simplicity, we always allocate a full buffer * even if we only expect to read a portion of a block. */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz), 0, blksz); } } } #endif /* sun */ while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); #ifdef __FreeBSD__ if (uio->uio_segflg == UIO_NOCOPY) error = mappedread_sf(vp, nbytes, uio); else #endif /* __FreeBSD__ */ if (vn_has_cached_data(vp)) error = mappedread(vp, nbytes, uio); else error = dmu_read_uio(os, zp->z_id, uio, nbytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = EIO; break; } n -= nbytes; } out: zfs_range_unlock(rl); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND flag set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; offset_t woff; ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; - int error; + int error = 0; arc_buf_t *abuf; - iovec_t *aiov; + iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * If immutable or not appending then return EPERM */ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (EPERM); } zilog = zfsvfs->z_log; /* * Validate file offset */ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * Check for mandatory locks before calling zfs_range_lock() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); } #ifdef sun /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) xuio = (xuio_t *)uio; else uio_prefaultpages(MIN(n, max_blksz), uio); #endif /* sun */ /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); woff = rl->r_off; if (rl->r_len == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (EFBIG); } if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (EFBIG); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? */ write_eof = (woff + n > zp->z_size); end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { abuf = NULL; woff = uio->uio_loffset; again: if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); error = EDQUOT; break; } if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); DTRACE_PROBE3(zfs_cp_write, int, i_iov, iovec_t *, aiov, arc_buf_t *, abuf); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto again; } dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (rl->r_len == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_range_reduce(rl, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_size) vnode_pager_setsize(vp, woff + nbytes); if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id, uio->uio_segflg, tx); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the excute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(vp, cr, (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); ASSERT(error == 0); } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; #ifdef sun if (!xuio && n > 0) uio_prefaultpages(MIN(n, max_blksz), uio); #endif /* sun */ } zfs_range_unlock(rl); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); return (0); } void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; objset_t *os = zp->z_zfsvfs->z_os; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_range_unlock(zgd->zgd_rl); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); if (error == 0 && zgd->zgd_bp) zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } #ifdef DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT(zio != NULL); ASSERT(size != 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (ENOENT); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (ENOENT); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zfsvfs->z_log; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = ENOENT; } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and it's checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = ENOENT; #ifdef DEBUG if (zil_fault_io) { error = EIO; zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *obp = dmu_buf_get_blkptr(db); if (obp) { ASSERT(BP_IS_HOLE(bp)); *bp = *obp; } zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= zp->z_blksz); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } /* * If vnode is for a device return a specfs vnode instead. */ static int specvp_check(vnode_t **vpp, cred_t *cr) { int error = 0; if (IS_DEVVP(*vpp)) { struct vnode *svp; svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); VN_RELE(*vpp); if (svp == NULL) error = ENOSYS; *vpp = svp; } return (error); } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 if success * error code if failure * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, int flags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; int *direntflags = NULL; void *realpnp = NULL; /* fast path */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (dvp->v_type != VDIR) { return (ENOTDIR); } else if (zdp->z_sa_hdl == NULL) { return (EIO); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *vpp = dvp; VN_HOLD(*vpp); return (0); } return (error); } else { vnode_t *tvp = dnlc_lookup(dvp, nm); if (tvp) { error = zfs_fastaccesschk_execute(zdp, cr); if (error) { VN_RELE(tvp); return (error); } if (tvp == DNLC_NO_VNODE) { VN_RELE(tvp); return (ENOENT); } else { *vpp = tvp; return (specvp_check(vpp, cr)); } } } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *vpp = NULL; if (flags & LOOKUP_XATTR) { #ifdef TODO /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } #endif /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, B_FALSE, cr)) { VN_RELE(*vpp); *vpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } if (dvp->v_type != VDIR) { ZFS_EXIT(zfsvfs); return (ENOTDIR); } /* * Check accessibility of directory. */ if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (EILSEQ); } error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); if (error == 0) error = specvp_check(vpp, cr); /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { int ltype = 0; if (cnp->cn_flags & ISDOTDOT) { ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); } ZFS_EXIT(zfsvfs); error = zfs_vnode_lock(*vpp, cnp->cn_lkflags); if (cnp->cn_flags & ISDOTDOT) vn_lock(dvp, ltype | LK_RETRY); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; return (error); } } else { ZFS_EXIT(zfsvfs); } #ifdef FREEBSD_NAMECACHE /* * Insert name into cache (as non-existent) if appropriate. */ if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) cache_enter(dvp, *vpp, cnp); /* * Insert name into cache if appropriate. */ if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } #endif return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; void *vsecp = NULL; int flag = 0; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (EINVAL); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (EILSEQ); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } top: *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; if (*name == '\0') { /* * Null component name refers to the directory itself. */ VN_HOLD(dvp); zp = dzp; dl = NULL; error = 0; } else { /* possible VN_HOLD(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); return (error); } } if (zp == NULL) { uint64_t txtype; /* * Create a new file object and update the directory * to reference it. */ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = EINVAL; goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; have_acl = B_TRUE; if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); error = EDQUOT; goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dl, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & FAPPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); have_acl = B_FALSE; /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl == EXCL) { error = EEXIST; goto out; } /* * Can't open a directory for writing. */ if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { error = EISDIR; goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if ((ZTOV(zp)->v_type == VREG) && (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ zfs_dirent_unlock(dl); dl = NULL; error = zfs_freesp(zp, 0, 0, mode, TRUE); if (error == 0) { vnevent_create(ZTOV(zp), ct); } } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) VN_RELE(ZTOV(zp)); } else { *vpp = ZTOV(zp); error = specvp_check(vpp, cr); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ uint64_t null_xattr = 0; /*ARGSUSED*/ static int zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *zp, *dzp = VTOZ(dvp); znode_t *xzp; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp)) { if (realnmp) pn_free(realnmp); ZFS_EXIT(zfsvfs); return (error); } vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = EPERM; goto out; } vnevent_remove(vp, dvp, name, ct); if (realnmp) dnlc_remove(dvp, realnmp->pn_buf); else dnlc_remove(dvp, name); VI_LOCK(vp); may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); VI_UNLOCK(vp); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); VN_RELE(vp); if (xzp) VN_RELE(ZTOV(xzp)); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); VI_LOCK(vp); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && vp->v_count == 1 && !vn_has_cached_data(vp) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; VI_UNLOCK(vp); } if (delete_now) { #ifdef __FreeBSD__ panic("zfs_remove: delete_now branch taken"); #endif if (xattr_obj_unlinked) { ASSERT3U(xzp->z_links, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = 1; xzp->z_links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &xzp->z_links, sizeof (xzp->z_links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } VI_LOCK(vp); vp->v_count--; ASSERT0(vp->v_count); VI_UNLOCK(vp); mutex_exit(&zp->z_lock); zfs_znode_delete(zp, tx); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); #ifdef __FreeBSD__ vp->v_vflag |= VV_NOSYNC; #endif } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); if (!delete_now) VN_RELE(vp); if (xzp) VN_RELE(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ static int zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (EINVAL); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (EILSEQ); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *vpp = NULL; if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (EDQUOT); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dl, zp, tx, ZNEW); *vpp = ZTOV(zp); txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(dvp); znode_t *zp; vnode_t *vp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL)) { ZFS_EXIT(zfsvfs); return (error); } vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } if (vp == cwd) { error = EINVAL; goto out; } vnevent_rmdir(vp, dvp, name, ct); /* * Grab a lock on the directory to make sure that noone is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); VN_RELE(vp); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } #ifdef FREEBSD_NAMECACHE cache_purge(dvp); #endif error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); #ifdef FREEBSD_NAMECACHE cache_purge(vp); #endif out: zfs_dirent_unlock(dl); VN_RELE(vp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure. * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; u_long *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; + outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if (error = zap_cursor_retrieve(&zc, &zap)) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = ENXIO; goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { VN_RELE(ZTOV(ezp)); goto skip_entry; } VN_RELE(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = EINVAL; goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry */ next = &(eodp->ed_off); eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { /* * Reset the pointer. */ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } ulong_t zfs_fsync_sync_cnt = 4; static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * ct - caller context * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds) */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t links; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ mutex_enter(&zp->z_lock); vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; #ifdef sun vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; #else vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; #endif vap->va_nodeid = zp->z_id; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) links = zp->z_links + 1; else links = zp->z_links; vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ vap->va_size = zp->z_size; #ifdef sun vap->va_rdev = vp->v_rdev; #else if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); #endif vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), times, sizeof (times)); ZFS_TIME_DECODE(&xoap->xoa_createtime, times); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); mutex_exit(&zp->z_lock); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: vp - vnode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; - uint_t saved_mask; + uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (EINVAL); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (EINVAL); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (EISDIR); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (EPERM); } if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (EPERM); } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (EOVERFLOW); } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (EROFS); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (EPERM); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). */ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) VN_RELE(ZTOV(attrzp)); err = EDQUOT; goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) VN_RELE(ZTOV(attrzp)); err = EDQUOT; goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = EPERM; goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err) { if (err == ERESTART) dmu_tx_wait(tx); goto out; } count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) VN_RELE(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (err == ERESTART) goto top; } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) VN_RELE(ZTOV(zl->zl_znode)); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = zp->z_zfsvfs->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (EINVAL); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 if success * error code if failure * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, caller_context_t *ct, int flags) { znode_t *tdzp, *szp, *tzp; znode_t *sdzp = VTOZ(sdvp); zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; zilog_t *zilog; vnode_t *realvp; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(sdzp); zilog = zfsvfs->z_log; /* * Make sure we have the real vp for the target directory. */ if (VOP_REALVP(tdvp, &realvp, ct) == 0) tdvp = realvp; if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) { ZFS_EXIT(zfsvfs); return (EXDEV); } tdzp = VTOZ(tdvp); ZFS_VERIFY_ZP(tdzp); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (EILSEQ); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ ZFS_EXIT(zfsvfs); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) VN_RELE(ZTOV(tzp)); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); /* * FreeBSD: In OpenSolaris they only check if rename source is * ".." here, because "." is handled in their lookup. This is * not the case for FreeBSD, so we check for "." explicitly. */ if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) serr = EINVAL; ZFS_EXIT(zfsvfs); return (serr); } if (terr) { zfs_dirent_unlock(sdl); VN_RELE(ZTOV(szp)); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = EINVAL; ZFS_EXIT(zfsvfs); return (terr); } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) goto out; if (ZTOV(szp)->v_type == VDIR) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) goto out; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if (ZTOV(szp)->v_type == VDIR) { if (ZTOV(tzp)->v_type != VDIR) { error = ENOTDIR; goto out; } } else { if (ZTOV(tzp)->v_type == VDIR) { error = EISDIR; goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); if (tzp) vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. */ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL), ==, 0); } } #ifdef FREEBSD_NAMECACHE if (error == 0) { cache_purge(sdvp); cache_purge(tdvp); cache_purge(ZTOV(szp)); if (tzp) cache_purge(ZTOV(tzp)); } #endif } dmu_tx_commit(tx); out: if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * target - Target path of new symlink. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 if success * error code if failure * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; int flags = 0; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (EILSEQ); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (ENAMETOOLONG); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (EDQUOT); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *vpp = ZTOV(zp); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uoip - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure to contain the link path. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * ct - caller context * * RETURN: 0 if success * error code if failure * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ static int zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, caller_context_t *ct, int flags) { znode_t *dzp = VTOZ(tdvp); znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; vnode_t *realvp; int error; int zf = ZNEW; uint64_t parent; uid_t owner; ASSERT(tdvp->v_type == VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (VOP_REALVP(svp, &realvp, ct) == 0) svp = realvp; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (EPERM); } if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) { ZFS_EXIT(zfsvfs); return (EXDEV); } szp = VTOZ(svp); ZFS_VERIFY_ZP(szp); /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (EPERM); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (EILSEQ); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (EINVAL); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); return (EPERM); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { ZFS_EXIT(zfsvfs); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, dzp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { vnevent_link(svp, ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } #ifdef sun /* * zfs_null_putapage() is used when the file system has been force * unmounted. It just drops the pages. */ /* ARGSUSED */ static int zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, int flags, cred_t *cr) { pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR); return (0); } /* * Push a page out to disk, klustering if possible. * * IN: vp - file to push page to. * pp - page to push. * flags - additional flags. * cr - credentials of caller. * * OUT: offp - start of range pushed. * lenp - len of range pushed. * * RETURN: 0 if success * error code if failure * * NOTE: callers must have locked the page to be pushed. On * exit, the page (and all other pages in the kluster) must be * unlocked. */ /* ARGSUSED */ static int zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_tx_t *tx; u_offset_t off, koff; size_t len, klen; int err; off = pp->p_offset; len = PAGESIZE; /* * If our blocksize is bigger than the page size, try to kluster * multiple pages so that we write a full block (thus avoiding * a read-modify-write). */ if (off < zp->z_size && zp->z_blksz > PAGESIZE) { klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; ASSERT(koff <= zp->z_size); if (koff + klen > zp->z_size) klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); } ASSERT3U(btop(len), ==, btopr(len)); /* * Can't push pages past end-of-file. */ if (off >= zp->z_size) { /* ignore all pages */ err = 0; goto out; } else if (off + len > zp->z_size) { int npages = btopr(zp->z_size - off); page_t *trunc; page_list_break(&pp, &trunc, npages); /* ignore pages past end of file */ if (trunc) pvn_write_done(trunc, flags); len = zp->z_size - off; } if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { err = EDQUOT; goto out; } top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); goto out; } if (zp->z_blksz <= PAGESIZE) { caddr_t va = zfs_map_page(pp, S_READ); ASSERT3U(len, <=, PAGESIZE); dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); zfs_unmap_page(pp, va); } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); out: pvn_write_done(pp, (err ? B_ERROR : 0) | flags); if (offp) *offp = off; if (lenp) *lenp = len; return (err); } /* * Copy the portion of the file indicated from pages into the file. * The pages are stored in a page list attached to the files vnode. * * IN: vp - vnode of file to push page data to. * off - position in file to put data. * len - amount of data to write. * flags - flags to control the operation. * cr - credentials of caller. * ct - caller context. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; page_t *pp; size_t io_len; u_offset_t io_off; uint_t blksz; rl_t *rl; int error = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * Align this request to the file block size in case we kluster. * XXX - this can result in pretty aggresive locking, which can * impact simultanious read/write access. One option might be * to break up long requests (len == 0) into block-by-block * operations to get narrower locking. */ blksz = zp->z_blksz; if (ISP2(blksz)) io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); else io_off = 0; if (len > 0 && ISP2(blksz)) io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); else io_len = 0; if (io_len == 0) { /* * Search the entire vp list for pages >= io_off. */ rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); goto out; } rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); if (off > zp->z_size) { /* past end of file */ zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); return (0); } len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); for (off = io_off; io_off < off + len; io_off += io_len) { if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { pp = page_lookup(vp, io_off, (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); } else { pp = page_lookup_nowait(vp, io_off, (flags & B_FREE) ? SE_EXCL : SE_SHARED); } if (pp != NULL && pvn_getdirty(pp, flags)) { int err; /* * Found a dirty page to push */ err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); if (err) error = err; } else { io_len = PAGESIZE; } } out: zfs_range_unlock(rl); if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (error); } #endif /* sun */ /*ARGSUSED*/ void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } mutex_enter(&zp->z_lock); if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. */ mutex_exit(&zp->z_lock); rw_exit(&zfsvfs->z_teardown_inactive_lock); vrecycle(vp); return; } mutex_exit(&zp->z_lock); if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } rw_exit(&zfsvfs->z_teardown_inactive_lock); } #ifdef sun /* * Bounds-check the seek operation. * * IN: vp - vnode seeking within * ooff - old file offset * noffp - pointer to new file offset * ct - caller context * * RETURN: 0 if success * EINVAL if new offset invalid */ /* ARGSUSED */ static int zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) { if (vp->v_type == VDIR) return (0); return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); } /* * Pre-filter the generic locking function to trap attempts to place * a mandatory lock on a memory mapped file. */ static int zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * We are following the UFS semantics with respect to mapcnt * here: If we see that the file is mapped already, then we will * return an error, but we don't worry about races between this * function and zfs_map(). */ if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) { ZFS_EXIT(zfsvfs); return (EAGAIN); } ZFS_EXIT(zfsvfs); return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); } /* * If we can't find a page in the cache, we will create a new page * and fill it with file data. For efficiency, we may try to fill * multiple pages at once (klustering) to fill up the supplied page * list. Note that the pages to be filled are held with an exclusive * lock to prevent access by other threads while they are being filled. */ static int zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) { znode_t *zp = VTOZ(vp); page_t *pp, *cur_pp; objset_t *os = zp->z_zfsvfs->z_os; u_offset_t io_off, total; size_t io_len; int err; if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { /* * We only have a single page, don't bother klustering */ io_off = off; io_len = PAGESIZE; pp = page_create_va(vp, io_off, io_len, PG_EXCL | PG_WAIT, seg, addr); } else { /* * Try to find enough pages to fill the page list */ pp = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len, off, plsz, 0); } if (pp == NULL) { /* * The page already exists, nothing to do here. */ *pl = NULL; return (0); } /* * Fill the pages in the kluster. */ cur_pp = pp; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { caddr_t va; ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, DMU_READ_PREFETCH); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ pvn_read_done(pp, B_ERROR); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = EIO; return (err); } cur_pp = cur_pp->p_next; } /* * Fill in the page list array from the kluster starting * from the desired offset `off'. * NOTE: the page list will always be null terminated. */ pvn_plist_init(pp, pl, plsz, off, io_len, rw); ASSERT(pl == NULL || (*pl)->p_offset == off); return (0); } /* * Return pointers to the pages for the file region [off, off + len] * in the pl array. If plsz is greater than len, this function may * also return page pointers from after the specified region * (i.e. the region [off, off + plsz]). These additional pages are * only returned if they are already in the cache, or were created as * part of a klustered read. * * IN: vp - vnode of file to get data from. * off - position in file to get data from. * len - amount of data to retrieve. * plsz - length of provided page list. * seg - segment to obtain pages for. * addr - virtual address of fault. * rw - mode of created pages. * cr - credentials of caller. * ct - caller context. * * OUT: protp - protection mode of created pages. * pl - list of pages created. * * RETURN: 0 if success * error code if failure * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; page_t **pl0 = pl; int err = 0; /* we do our own caching, faultahead is unnecessary */ if (pl == NULL) return (0); else if (len > plsz) len = plsz; else len = P2ROUNDUP(len, PAGESIZE); ASSERT(plsz >= len); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (protp) *protp = PROT_ALL; /* * Loop through the requested range [off, off + len) looking * for pages. If we don't find a page, we will need to create * a new page and fill it with data from the file. */ while (len > 0) { if (*pl = page_lookup(vp, off, SE_SHARED)) *(pl+1) = NULL; else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) goto out; while (*pl) { ASSERT3U((*pl)->p_offset, ==, off); off += PAGESIZE; addr += PAGESIZE; if (len > 0) { ASSERT3U(len, >=, PAGESIZE); len -= PAGESIZE; } ASSERT3U(plsz, >=, PAGESIZE); plsz -= PAGESIZE; pl++; } } /* * Fill out the page array with any pages already in the cache. */ while (plsz > 0 && (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { off += PAGESIZE; plsz -= PAGESIZE; } out: if (err) { /* * Release any pages we have previously locked. */ while (pl > pl0) page_unlock(*--pl); } else { ZFS_ACCESSTIME_STAMP(zfsvfs, zp); } *pl = NULL; ZFS_EXIT(zfsvfs); return (err); } /* * Request a memory map for a section of a file. This code interacts * with common code and the VM system as follows: * * common code calls mmap(), which ends up in smmap_common() * * this calls VOP_MAP(), which takes you into (say) zfs * * zfs_map() calls as_map(), passing segvn_create() as the callback * * segvn_create() creates the new segment and calls VOP_ADDMAP() * * zfs_addmap() updates z_mapcnt */ /*ARGSUSED*/ static int zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; segvn_crargs_t vn_a; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((prot & PROT_WRITE) && (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { ZFS_EXIT(zfsvfs); return (EPERM); } if ((prot & (PROT_READ | PROT_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { ZFS_EXIT(zfsvfs); return (EACCES); } if (vp->v_flag & VNOMAP) { ZFS_EXIT(zfsvfs); return (ENOSYS); } if (off < 0 || len > MAXOFFSET_T - off) { ZFS_EXIT(zfsvfs); return (ENXIO); } if (vp->v_type != VREG) { ZFS_EXIT(zfsvfs); return (ENODEV); } /* * If file is locked, disallow mapping. */ if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) { ZFS_EXIT(zfsvfs); return (EAGAIN); } as_rangelock(as); error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); if (error != 0) { as_rangeunlock(as); ZFS_EXIT(zfsvfs); return (error); } vn_a.vp = vp; vn_a.offset = (u_offset_t)off; vn_a.type = flags & MAP_TYPE; vn_a.prot = prot; vn_a.maxprot = maxprot; vn_a.cred = cr; vn_a.amp = NULL; vn_a.flags = flags & ~MAP_TYPE; vn_a.szc = 0; vn_a.lgrp_mem_policy_flags = 0; error = as_map(as, *addrp, len, segvn_create, &vn_a); as_rangeunlock(as); ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ static int zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, caller_context_t *ct) { uint64_t pages = btopr(len); atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); return (0); } /* * The reason we push dirty pages as part of zfs_delmap() is so that we get a * more accurate mtime for the associated file. Since we don't have a way of * detecting when the data was actually modified, we have to resort to * heuristics. If an explicit msync() is done, then we mark the mtime when the * last page is pushed. The problem occurs when the msync() call is omitted, * which by far the most common case: * * open() * mmap() * * munmap() * close() *